예제 #1
0
    def create_symbol_forecast_model(self):
        """Fit a QDA direction-forecasting model on lagged S&P500 returns.

        Builds a 5-lag return series for the first configured symbol,
        trains on everything before ``model_start_test_date``, prints the
        out-of-sample error rate, and returns the fitted model.
        """
        # Lagged return series over the configured modelling window.
        lagged = create_lagged_series(
            self.symbol_list[0], self.model_start_date,
            self.model_end_date, lags=5
        )

        # Predictors: the previous two days' returns; response: direction.
        features = lagged[["Lag1", "Lag2"]]
        target = lagged["Direction"]

        # Chronological train/test split at the configured cutoff date.
        split_date = self.model_start_test_date
        in_train = features.index < split_date
        x_train = features[in_train]
        x_test = features[~in_train]
        y_train = target[target.index < split_date]
        y_test = target[target.index >= split_date]

        model = QuadraticDiscriminantAnalysis()
        model.fit(x_train, y_train)

        # predict() yields an ndarray of class labels.
        pred_test = model.predict(x_test)

        # Fraction of test-set days whose direction was mispredicted.
        error_rate = (y_test != pred_test).sum() * 1. / len(y_test)
        print("Error Rate is {0}".format(error_rate))

        return model
예제 #2
0
    def train(self):
        """Instantiate the classifier named by ``self._model_selection``,
        fit it on the training data, and print a classification report
        for the validation data.

        Recognised selections: "svm", "nb", "knn", "ada", "rf", "qda".
        NOTE(review): an unknown selection only prints a warning and then
        fails at ``fit`` because ``self._classifier`` was never assigned;
        consider raising ValueError instead.
        """
        if self._model_selection == "svm":
            # SVC with its default (RBF) kernel.
            self._classifier = svm.SVC()
        elif self._model_selection == "nb":
            self._classifier = GaussianNB()
        elif self._model_selection == "knn":
            # parameter n_jobs can be set to -1 to enable parallel calculating
            self._classifier = KNeighborsClassifier(n_neighbors=7)
        elif self._model_selection == "ada":
            # Bunch of parameters, n_estimators, learning_rate
            self._classifier = AdaBoostClassifier()
        elif self._model_selection == "rf":
            # many parameters including n_jobs
            self._classifier = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
        elif self._model_selection == "qda":
            # complicated array like parameters, perhaps leave it default
            self._classifier = QuadraticDiscriminantAnalysis()
        else:
            # BUG FIX: this was a Python 2 print statement (`print "..."`),
            # a SyntaxError under Python 3; the rest of this method already
            # uses the print() function.
            print("Please refer to one classifier")

        self._classifier.fit(self._train_data, self._train_targets)
        # predict on valid data
        prediction_valid = self._classifier.predict(self._valid_data)
        # print validation result for selected model.
        print(
            "Classification report for classifier %s on valid_data:\n%s\n"
            % (self._model_selection, metrics.classification_report(self._valid_targets, prediction_valid))
        )
예제 #3
0
class QuadraticDiscriminantAnalysiscls(object):
    """Thin wrapper around sklearn's QuadraticDiscriminantAnalysis that
    caches the most recent training/test data and predictions.

    All methods print a traceback instead of raising on failure.
    """

    def __init__(self):
        self.qda_cls = QuadraticDiscriminantAnalysis()
        self.prediction = None  # labels from the last predict() call
        self.train_x = None
        self.train_y = None
        # BUG FIX: test_x was previously created only inside predict(),
        # so calling accuracy_score() first raised AttributeError.
        self.test_x = None

    def train_model(self, train_x, train_y):
        """Fit the underlying QDA model on (train_x, train_y)."""
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.qda_cls.fit(train_x, train_y)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print(traceback.format_exc())

    def predict(self, test_x):
        """Predict labels for test_x, caching both input and output."""
        try:
            self.test_x = test_x
            self.prediction = self.qda_cls.predict(test_x)
            return self.prediction
        except Exception:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        """Mean accuracy of the model on the cached test_x against test_y."""
        try:
            # return r2_score(test_y, self.prediction)
            return self.qda_cls.score(self.test_x, test_y)
        except Exception:
            print(traceback.format_exc())
예제 #4
0
파일: forecaster.py 프로젝트: Vegeb/strats
class SNPForecastingStrategy(Strategy):
    """QDA-based direction-forecasting strategy for the S&P500.

    Fits a Quadratic Discriminant Analysis model on lagged daily returns
    at construction time, then emits long/short/hold signals.

    Requires:
    symbol - A stock symbol on which to form a strategy on.
    bars - A DataFrame of bars for the above symbol."""

    def __init__(self, symbol, bars):
        self.symbol = symbol
        self.bars = bars
        # Fix the date windows first, then train the model on them.
        self.create_periods()
        self.fit_model()

    def create_periods(self):
        """Create training/test periods (train from 2001-01-10; the 2005
        calendar year is the forecast window)."""
        self.start_train = datetime.datetime(2001,1,10)
        self.start_test = datetime.datetime(2005,1,1)
        self.end_period = datetime.datetime(2005,12,31)

    def fit_model(self):
        """Fits a Quadratic Discriminant Analyser to the
        US stock market index (^GPSC in Yahoo)."""
        # Create a lagged series of the S&P500 US stock market index
        snpret = create_lagged_series(self.symbol, self.start_train, 
                                      self.end_period, lags=5) 

        # Use the prior two days of returns as 
        # predictor values, with direction as the response
        X = snpret[["Lag1","Lag2"]]
        y = snpret["Direction"]

        # Create training and test sets (chronological split at start_test)
        X_train = X[X.index < self.start_test]
        y_train = y[y.index < self.start_test]

        # Create the predicting factors for use 
        # in direction forecasting
        self.predictors = X[X.index >= self.start_test]

        # Create the Quadratic Discriminant Analysis model
        # and the forecasting strategy
        self.model = QuadraticDiscriminantAnalysis()
        self.model.fit(X_train, y_train)

    def generate_signals(self):
        
        """Returns the DataFrame of symbols containing the signals
        to go long, short or hold (1, -1 or 0)."""
        # NOTE(review): assumes bars.index aligns with self.predictors'
        # index (the out-of-sample window) — confirm upstream.
        signals = pd.DataFrame(index=self.bars.index)
        signals['signal'] = 0.0       

        # Predict the subsequent period with the QDA model
        signals['signal'] = self.model.predict(self.predictors)

        # Remove the first five signal entries to eliminate
        # NaN issues with the signals DataFrame
        # (chained indexing — pandas may emit SettingWithCopyWarning)
        signals['signal'][0:5] = 0.0
        # Trade on changes in the signal, not its level.
        signals['positions'] = signals['signal'].diff() 

        return signals
예제 #5
0
def doQDA(x,digits,s):
    """Classify the digit test set with a discriminant model fitted on the
    first *s* principal components of the training data.

    NOTE(review): despite the name, this fits LDA, not QDA.

    x      - PCA/SVD result exposing .PCA (scores), .centers and .V
             (principal directions) — presumably from center_matrix_SVD;
             confirm against its definition
    digits - dataset with train_Labels, test_Images, test_Labels
    s      - number of principal components to keep

    Returns the classification error rate(s) from class_error_rate.
    """
    myLDA = LDA()
    myLDA.fit(x.PCA[:,:s],digits.train_Labels)
    # Centre the test images, then project them onto the first s
    # principal directions so they live in the model's feature space.
    newtest = digits.test_Images - x.centers
    # FIX: this line was corrupted by an email-obfuscation scraper
    # ("[email protected](...)"); restored to the matrix product.
    newtest = newtest @ np.transpose(x.V[:s,:])
    labels = myLDA.predict(newtest)
    errors = class_error_rate(labels.reshape(1,labels.shape[0]),digits.test_Labels)
    return errors
예제 #6
0
def confusion(digits):
    """Print the confusion matrix of an LDA model trained on the first
    50 principal components of the digit training images.

    digits - dataset with train_Images, train_Labels, test_Images,
             test_Labels.
    """
    myLDA = LDA()
    x = center_matrix_SVD(digits.train_Images)
    myLDA.fit(x.PCA[:,:50],digits.train_Labels)
    # Centre the test images and project them into the same 50-dim
    # PCA space the model was trained in.
    newtest = digits.test_Images - x.centers
    # FIX: this line was corrupted by an email-obfuscation scraper
    # ("[email protected](...)"); restored to the matrix product.
    newtest = newtest @ np.transpose(x.V[:50,:])
    labels = myLDA.predict(newtest)
    import sklearn.metrics as f
    print(f.confusion_matrix(digits.test_Labels,labels))
def test_qda_priors():
    """Shrinking class 1's prior to ~0 must push more predictions into
    class 2 than the default (empirical) priors do.

    X6/y6 are module-level toy fixtures defined elsewhere in the file.
    """
    clf = QuadraticDiscriminantAnalysis()
    y_pred = clf.fit(X6, y6).predict(X6)
    n_pos = np.sum(y_pred == 2)

    neg = 1e-10
    clf = QuadraticDiscriminantAnalysis(priors=np.array([neg, 1 - neg]))
    y_pred = clf.fit(X6, y6).predict(X6)
    n_pos2 = np.sum(y_pred == 2)

    # FIX: assert_greater was removed from sklearn.utils.testing; a bare
    # assert matches the other tests in this file (see test_qda_regularization).
    assert n_pos2 > n_pos
예제 #8
0
def get_QDA(Xtrain, Ytrain, Xtest=None, Ytest=None, verbose=0):
    """Fit a QDA classifier on the training data and return it.

    When verbose == 1, prints training accuracy, and — if test data is
    supplied — test accuracy as well.
    """
    qda = QDA()
    qda.fit(Xtrain, Ytrain)

    # Holds [train_acc, test_acc]; entries stay uninitialized unless
    # verbose reporting fills them in.
    scores = np.empty((2))
    if (verbose == 1):
        scores[0] = qda.score(Xtrain, Ytrain)
        print('QDA, train: {0:.02f}% '.format(scores[0]*100))
        # Idiom fix: `is not None` replaces the roundabout
        # `type(Xtest) != type(None)` comparison (same behavior).
        if Xtest is not None:
            scores[1] = qda.score(Xtest, Ytest)
            print('QDA, test: {0:.02f}% '.format(scores[1]*100))
    return qda
예제 #9
0
def QD(pth):
     """Train a QDA model on saved bag-of-features descriptors and dump
     the (model, classes, scaler) bundle to <pth>/qd-bof.pkl, then run
     the test() helper.

     Relies on module-level globals: image_paths, train_labels,
     img_classes and test() — assumed defined elsewhere in this script;
     confirm before reuse.
     """
     train_desc=np.load(pth+'/training_features.npy')
     # Document frequency of each visual word across the training set.
     nbr_occurences = np.sum( (train_desc > 0) * 1, axis = 0)
     # NOTE(review): idf is computed but never used below.
     idf = np.array(np.log((1.0*len(image_paths)+1) / (1.0*nbr_occurences + 1)), 'float32')

# Scaling the words
     stdSlr = StandardScaler().fit(train_desc)
     train_desc = stdSlr.transform(train_desc)
     modelQD=QuadraticDiscriminantAnalysis()
     modelQD.fit(train_desc,np.array(train_labels))
     joblib.dump((modelQD, img_classes, stdSlr), pth+"/qd-bof.pkl", compress=3) 
     test(pth, "qd-")
예제 #10
0
def crossValidate(attributes, outcomes, foldCount, ownFunction=True):
    """k-fold cross-validation of a Gaussian discriminant classifier.

    attributes  - feature rows (featLen=4 features per row assumed)
    outcomes    - class labels, parallel to attributes
    foldCount   - number of folds
    ownFunction - True: hand-rolled GDA (getParams/gdaNDEstimate);
                  False: sklearn's QuadraticDiscriminantAnalysis

    Returns five per-fold metric lists: accuracy, precision, recall,
    F-measure and AUC.

    FIX: the original body mixed tabs and spaces on its first statement
    (a TabError under Python 3); indentation is now uniform spaces with
    the logic unchanged.
    """
    presList = []; recallList = []
    accrList = []; fMeasList = []
    aucList = []
    testingEstimate = []

    otcmVal = list(set(outcomes))
    params = {}; featLen = 4

    attrFolds = getFolds(attributes, foldCount)
    otcmFolds = getFolds(outcomes, foldCount)

    testDataList = copy.copy(attrFolds)
    testOtcmList = copy.copy(otcmFolds)

    for itr in range(foldCount):
        # All folds except itr form the training set.
        trainDataList = []
        trainOtcmList = []
        for intitr in range(foldCount):
            if intitr != itr:
                trainDataList.append(attrFolds[intitr])
                trainOtcmList.append(otcmFolds[intitr])

        trainDataArr = np.array(trainDataList).reshape(-1, featLen)
        trainOtcmArr = np.array(trainOtcmList).reshape(-1)
        testDataArr = np.array(testDataList[itr]).reshape(-1, featLen)
        testOtcmArr = np.array(testOtcmList[itr]).reshape(-1)

        if ownFunction:
            params = getParams(trainDataArr, trainOtcmArr, otcmVal, featLen)
            testingEstimate = gdaNDEstimate(testDataArr, params, otcmVal)
        else:
            #clf = LinearDiscriminantAnalysis()
            clf = QuadraticDiscriminantAnalysis()
            clf.fit(trainDataArr, trainOtcmArr)
            trainingEstimate = clf.predict(trainDataArr)
            testingEstimate = clf.predict(testDataArr)

        # Plot only on the first fold, and only for binary problems.
        if itr == 0 and len(otcmVal) == 2:
            addTitle = "Own" if ownFunction else "Inbuilt"
            metric = getMetrics(testOtcmArr, testingEstimate, otcmVal, showPlot=True, title="GDA2D Versicolor,Virginica - %s" % addTitle)
        else:
            metric = getMetrics(testOtcmArr, testingEstimate, otcmVal)
        accrList.append(metric[0])
        presList.append(metric[1])
        recallList.append(metric[2])
        fMeasList.append(metric[3])
        aucList.append(metric[4])

    return accrList, presList, recallList, fMeasList, aucList
예제 #11
0
def test():
    """Plot LDA vs QDA decision data/covariances for the two toy
    datasets (fixed covariance and per-class covariance)."""
    for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
        # Linear Discriminant Analysis
        lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
        y_pred = lda.fit(X, y).predict(X)
        splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
        plot_lda_cov(lda, splot)
        plt.axis('tight')

        # Quadratic Discriminant Analysis
        # FIX: the keyword was `store_covariances`, renamed to
        # `store_covariance` in scikit-learn 0.20 and later removed;
        # example #24 in this file already uses the new spelling.
        qda = QuadraticDiscriminantAnalysis(store_covariance=True)
        y_pred = qda.fit(X, y).predict(X)
        splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
        plot_qda_cov(qda, splot)
        plt.axis('tight')
    plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis')
    plt.show()
class QuadraticDiscriminantAnalysisPredictor(PredictorBase):
    """Predictor backed by sklearn's Quadratic Discriminant Analysis."""

    def __init__(self):
        self.clf = QuadraticDiscriminantAnalysis()

    def fit(self, X_train, y_train):
        """Fit the underlying QDA classifier on the training data."""
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        """Return class-probability predictions bundled into a DataFrame
        via the base class's bundle_predictions helper."""
        raw_probabilities = self.clf.predict_proba(X_test)
        return self.bundle_predictions(raw_probabilities)

    def get_k_best_k(self):
        """Feature-selection k used by the surrounding pipeline —
        presumably for SelectKBest; confirm in PredictorBase."""
        return 4
def test_qda_regularization():
    # the default is reg_param=0. and will cause issues
    # when there is a constant variable
    # (X2/X5/y5/y6 are module-level toy fixtures defined elsewhere)
    clf = QuadraticDiscriminantAnalysis()
    with ignore_warnings():
        y_pred = clf.fit(X2, y6).predict(X2)
    # With no regularization the fit degenerates and misclassifies.
    assert np.any(y_pred != y6)

    # adding a little regularization fixes the problem
    clf = QuadraticDiscriminantAnalysis(reg_param=0.01)
    with ignore_warnings():
        clf.fit(X2, y6)
    y_pred = clf.predict(X2)
    assert_array_equal(y_pred, y6)

    # Case n_samples_in_a_class < n_features
    clf = QuadraticDiscriminantAnalysis(reg_param=0.1)
    with ignore_warnings():
        clf.fit(X5, y5)
    y_pred5 = clf.predict(X5)
    assert_array_equal(y_pred5, y5)
예제 #14
0
    def create_symbol_forecast_model(self):
        """Fit and return a QDA model predicting market direction from the
        prior two days' lagged returns of the first configured symbol.

        Unlike the variant elsewhere in this file, there is no train/test
        split: the model is fit on the whole (NaN-trimmed) window.
        """
        # Create a lagged series of the S&P500 US stock market index
        snpret = create_lagged_series(
            self.symbol_list[0], self.model_start_date,
            self.model_end_date, lags=5
        )

        # Use the prior two days of returns as predictor
        # values, with direction as the response
        X = snpret[["Lag1", "Lag2"]]
        y = snpret["Direction"]
        
        # Skip days with NaN
        # (the lag columns are empty for the first few sessions)
        skip_till_date = self.model_start_date + relativedelta(days=3)
        X = X[X.index > skip_till_date]
        y = y[y.index > skip_till_date]
        logging.debug(snpret[snpret.index > skip_till_date])

        model = QDA()
        model.fit(X, y)
        return model
예제 #15
0
파일: mustang.py 프로젝트: rioubenson/eagle
    def set_up_classifier(self):
        """Build and return a QDA model that predicts a bucketed 5-bar
        forward return class from each bar's open-to-close move.

        Uses self.get_data() for historic bars; assumes columns 'Close',
        'Open', 'Volume' — confirm against the data feed.
        """
        historic_data = self.get_data()
        # Key is to identify a trend (use close for now):
        # 5-bar log return, scaled to percent.
        historic_data['return_5_timeframe'] = np.log(historic_data['Close'] / historic_data['Close'].shift(5)) * 100
        # Fill NaNs (e.g. the first 5 shifted rows) with a tiny value.
        historic_data.fillna(0.0001, inplace=True)
        historic_data['vol_normalised'] = normalise_data(historic_data['Volume'])

        # Bucket Return into 7 classes (-3..3) by magnitude.
        # NOTE(review): exact boundary values (0, ±0.02, ±0.1) fail every
        # range test and fall through to the final `return 0`.
        def bucket_return(x, col):
            if 0 < x[col] < 0.02:
                return 1
            if 0.02 < x[col] < 0.1:
                return 2
            if x[col] > 0.1:
                return 3

            if 0 > x[col] > -0.02:
                return -1
            if -0.02 > x[col] > -0.1:
                return -2
            if x[col] < -0.1:
                return -3
            else:
                return 0

        historic_data['Return'] = historic_data.apply(bucket_return, axis=1, args=['return_5_timeframe'])

        # Intraday move: the single predictor feature.
        historic_data['Move'] = historic_data['Close'] - historic_data['Open']

        # X as predictor values, with Y as the response
        x = historic_data[["Move"]]
        y = historic_data["Return"]

        model = QuadraticDiscriminantAnalysis()
        model.fit(x, y)
        return model
예제 #16
0
    def train_DA(self, X, y, lda_comp, qda_reg):
        '''
        Input: 
            qda_reg - reg_param for QDA
            lda_comp - n_components for LDA
            X - data matrix (train_num, feat_num)
            y - target labels matrix (train_num, label_num)

        Output: 
            best_clf - best classifier trained (QDA/LDA)
            best_score - CV score of best classifier

        Cross-validates an LDA and a QDA classifier over 10 folds and
        returns whichever scored higher, refit on the full data.

        BUG FIX: the QDA fold score was previously computed from
        cv_pred_lda, so both models were scored on LDA's predictions;
        it now uses cv_pred_qda.
        '''
        n_samples, n_feat = X.shape
        cv_folds = 10
        # Old-style sklearn KFold API (pre-0.18): constructed from the
        # sample count and iterated directly.
        kf = KFold(n_samples, cv_folds, shuffle=False)

        lda = LinearDiscriminantAnalysis(n_components = lda_comp)
        qda = QuadraticDiscriminantAnalysis(reg_param = qda_reg)
        score_total_lda = 0 #running total of metric score over all cv runs
        score_total_qda = 0 #running total of metric score over all cv runs
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # NOTE: eval() dispatches on self.metric/self.task strings;
            # acceptable for trusted config only — never untrusted input.
            lda.fit(X_train, y_train)
            cv_pred_lda = lda.predict(X_test)
            score_lda = eval(self.metric + '(y_test[:,None], cv_pred_lda[:,None], "' + self.task + '")')
            score_total_lda += score_lda

            qda.fit(X_train,y_train)
            cv_pred_qda = qda.predict(X_test)
            score_qda = eval(self.metric + '(y_test[:,None], cv_pred_qda[:,None], "' + self.task + '")')
            score_total_qda += score_qda

        score_lda = score_total_lda/cv_folds
        score_qda = score_total_qda/cv_folds

        # We keep the best one, refit on all the data
        if(score_qda > score_lda):
            qda.fit(X,y)
            return qda, score_qda
        else:
            lda.fit(X,y)
            return lda, score_lda
예제 #17
0
파일: forecaster.py 프로젝트: Vegeb/strats
    def fit_model(self):
        """Fit a Quadratic Discriminant Analyser to the US stock market
        index (^GPSC in Yahoo) and stash the out-of-sample predictor rows
        for later direction forecasting."""
        # Lagged return series of the S&P500 over the full period.
        lagged = create_lagged_series(self.symbol, self.start_train,
                                      self.end_period, lags=5)

        # The prior two days of returns predict the direction response.
        all_predictors = lagged[["Lag1", "Lag2"]]
        response = lagged["Direction"]

        # Chronological split: rows before start_test train the model;
        # the remainder become the forecasting inputs.
        train_mask = all_predictors.index < self.start_test
        X_train = all_predictors[train_mask]
        y_train = response[response.index < self.start_test]
        self.predictors = all_predictors[~train_mask]

        # Fit the QDA model used by the forecasting strategy.
        self.model = QuadraticDiscriminantAnalysis()
        self.model.fit(X_train, y_train)
def test_qda():
    # QDA classification.
    # This checks that QDA implements fit and predict and returns
    # correct values for a simple toy dataset.
    # (X6/X7/y4/y6/y7 are module-level fixtures defined elsewhere.)
    clf = QuadraticDiscriminantAnalysis()
    y_pred = clf.fit(X6, y6).predict(X6)
    assert_array_equal(y_pred, y6)

    # Assure that it works with 1D data
    y_pred1 = clf.fit(X7, y6).predict(X7)
    assert_array_equal(y_pred1, y6)

    # Test probas estimates
    y_proba_pred1 = clf.predict_proba(X7)
    # p(class 2) > 0.5 should reproduce the {1, 2} labels exactly.
    assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y6)
    y_log_proba_pred1 = clf.predict_log_proba(X7)
    # log-probabilities must exponentiate back to the probabilities.
    assert_array_almost_equal(np.exp(y_log_proba_pred1), y_proba_pred1, 8)

    y_pred3 = clf.fit(X6, y7).predict(X6)
    # QDA shouldn't be able to separate those
    assert np.any(y_pred3 != y7)

    # Classes should have at least 2 elements
    assert_raises(ValueError, clf.fit, X6, y4)
예제 #19
0
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

# [estimator, display-name] pairs to compare.
# NOTE(review): KNeighborsClassifier is not among the imports shown just
# above — presumably imported earlier in the full script; confirm.
classifiers = [[KNeighborsClassifier(3), 'KNN'],
               [SVC(probability=True), 'SVC'],
               [DecisionTreeClassifier(), 'Decision Tree'],
               [RandomForestClassifier(), 'Random Forest'],
               [AdaBoostClassifier(), 'ADA booster'],
               [GradientBoostingClassifier(), 'Gradient Booster'],
               [GaussianNB(), 'Gaussian Nb'],
               [LinearDiscriminantAnalysis(), 'Linear Discriminant Analysis'],
               [QuadraticDiscriminantAnalysis(), 'Quadratic Discrimination'],
               [LogisticRegression(), 'Logistic Regression']]

# train/test are assumed to be pandas DataFrames defined earlier
# (Titanic-style data with a "Survived" label) — confirm upstream.
X = train.drop("Survived", axis=1)
y = train["Survived"]
X_test = test

# NOTE(review): scores is never appended to, and y_pred is overwritten
# each iteration — only the last classifier's predictions survive.
scores = []

for clf in classifiers:

    # Each entry is [estimator, name]; keep only the estimator.
    clf = clf[0]

    clf.fit(X, y)
    y_pred = clf.predict(X_test)
# training/testing are assumed to be pandas DataFrames loaded earlier
# (Airbnb "country_destination" style data) — confirm upstream.
X = training.iloc[:,1:-1].values
y = training['country_destination'].values
"""
# Use Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
trans = LinearDiscriminantAnalysis(n_components=3)
trans.fit(X,y)
X = trans.transform(X)
"""
# Split Up Data
x_train,x_valid,y_train,y_valid = train_test_split(X,y,test_size=0.3,random_state=None)

# Train classifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf = QuadraticDiscriminantAnalysis(reg_param=0.00001)
clf.fit(x_train,y_train)

# Run Predictions
from sklearn.metrics import confusion_matrix, accuracy_score
y_preds = clf.predict(x_valid)
print( confusion_matrix(y_valid,y_preds) );
print( "Accuracy: %f" % (accuracy_score(y_valid,y_preds)) );
# NOTE(review): file handle is never closed — prefer a `with` block.
f = open('qda_take1.txt', 'w')
f.write( str(confusion_matrix(y_valid,y_preds)) );
f.write( "\nAccuracy: %f" % (accuracy_score(y_valid,y_preds)) );
f.write( "\nclf = QuadraticDiscriminantAnalysis(0.00001)" );

# Now on to final submission
# NOTE(review): hard-coded row count 62096 breaks on any other test set.
x_final = testing.iloc[:,1:].values
y_final = clf.predict(x_final).reshape([62096,]);
예제 #21
0
import window_s_p_ft as win
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.cross_validation import train_test_split


# Average QDA accuracy over `stop` random 80/20 splits of the full
# multi-class problem (predicting a student's spec from grades).
total_score = 0
stop = 1000
for x in range(stop):
    clf = QuadraticDiscriminantAnalysis()
    data = win.getStudents()
    data_train, data_test = train_test_split(data, test_size=0.2)
    data_train_labels = [s.spec for s in data_train]
    data_test_labels = [s.spec for s in data_test]
    data_train = [s.grades for s in data_train]
    data_test = [s.grades for s in data_test]
    clf.fit(data_train, data_train_labels)
    total_score += clf.score(data_test, data_test_labels)
total_score = total_score / stop
print("all")
print(total_score)

# One-vs-rest variant: for each spec, relabel all others "NOT <sp>".
# NOTE(review): this loop body appears truncated by the snippet source —
# the binary variant's fit/score never happen as shown.
specs = ["FK", "FM", "MN", "OE"]
for sp in specs:
    total_score = 0
    for x in range(stop):
        clf = QuadraticDiscriminantAnalysis()
        data = win.getStudents()
        data_train, data_test = train_test_split(data, test_size=0.2)
        data_train_labels = [s.spec if s.spec == sp else "NOT " + sp for s in data_train]
        data_test_labels = [s.spec if s.spec == sp else "NOT " + sp for s in data_test]
        data_train = [s.grades for s in data_train]
예제 #22
0
    # (This chunk starts mid-iteration of an enclosing loop: `ds`, `i`
    #  and `accuracies` are defined earlier in the full script.)
    ###########################################################################

    # get training, validation and test datasets for specified roi
    training_data, validation_data, test_data = ds.split_data()

    ###########################################################################
    #
    #        CREATE MODEL
    #
    ###########################################################################

    # Define the estimator: quadratic discriminant analysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

    qda = QuadraticDiscriminantAnalysis()

    # Each dataset is (features, labels); fit on the training pair.
    qda.fit(training_data[0], training_data[1])

    from sklearn.metrics import accuracy_score

    # record the best result
    # NOTE(review): validation_data is obtained but never used here.
    accuracies[i] = accuracy_score(test_data[1], qda.predict(test_data[0]))


mean_accuracy = accuracies.mean()
print("\n\nmean accuracy: %f" % mean_accuracy)

###############################################################################
#
#   VISUALIZE
예제 #23
0
    pyplot.show()

    # Question 2B
    # Take the measurement pairs again, but this time train the various
    # requested models on each pair.
    # (Fragment: `pairs` and `data` come from earlier in the script; the
    #  loop body continues beyond this excerpt.)
    for (f1, f2) in pairs:
        # TODO Q2B
        # Build a sub-dataset containing only the measurements
        # designated by f1 and f2.
        subData = data.data[:, (f1, f2)]

        # TODO Q2B
        # Initialize the various classifiers in a list named
        # "classifieurs".
        classifieurs = [
            QuadraticDiscriminantAnalysis(),
            LinearDiscriminantAnalysis(),
            GaussianNB(),
            NearestCentroid()
        ]

        # TODO Q2B
        # Build a grid for displaying each classifier's decision
        # regions.
        # Hint: numpy.meshgrid could be useful here.
        # Do not use too small a step!

        x = numpy.arange(min(subData[:, 0]), max(subData[:, 0]), 0.05)
        y = numpy.arange(min(subData[:, 1]), max(subData[:, 1]), 0.05)
        xx, yy = numpy.meshgrid(x, y)
예제 #24
0
# train_x/cv_x are used unscaled; the "_std" names presumably mirror a
# scaled variant elsewhere in the full script — confirm upstream.
train_x_std = train_x
cv_x_std = cv_x

lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
lda.fit(train_x_std, train_y)

train_preds_lda = lda.predict(train_x_std)
cv_preds_lda = lda.predict(cv_x_std)

# NOTE(review): accuracy_score's documented order is (y_true, y_pred);
# the swapped order here still yields the same accuracy value.
train_acc_lda = accuracy_score(train_preds_lda, train_y)
cv_acc_lda = accuracy_score(cv_preds_lda, cv_y)

print("Training accuracy for linear discriminant analysis is ", train_acc_lda)
print("CV accuracy for linear discriminant analysis is ", cv_acc_lda)

qda = QuadraticDiscriminantAnalysis(store_covariance=True)
qda.fit(train_x_std, train_y)

train_preds_qda = qda.predict(train_x_std)
cv_preds_qda = qda.predict(cv_x_std)

train_acc_qda = accuracy_score(train_preds_qda, train_y)
cv_acc_qda = accuracy_score(cv_preds_qda, cv_y)

print("Training accuracy for Quadratic discriminant analysis is ",
      train_acc_qda)
print("CV accuracy for Quadratic discriminant analysis is ", cv_acc_qda)

logReg = linear_model.LogisticRegression(C=1)
logReg.fit(train_x_std, train_y)
#Value of C=1 was obtained after some tries
def main():

    # Checks for correct number of arguments
    if len(sys.argv) != 3:
        print(
            'usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit()

    # set up dataset
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])

    print('train: {}'.format(sys.argv[1]))
    print('test: {}'.format(sys.argv[2]))

    x_train = data_train.drop(
        [data_train.columns[0], data_train.columns[1], data_train.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_train = pd.Series(data_train.iloc[:, -1])
    x_test = data_test.drop(
        [data_test.columns[0], data_test.columns[1], data_test.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_test = pd.Series(data_test.iloc[:, -1])

    type = input('type: [1: supervised, 2: semi-supervised, 3: unsupervised] ')
    if type == 1:
        method = input('method: [1: classification, 2: regression] ')
        if method == 1:
            classifier = input(
                'classifier: [1: decision tree, 2: extra tree, 3: extra trees, 4: k nearest neighbor, 5: naive bayes, 6: radius neighbors, 7: random forest, 8: support vector machine, 9: gradient boosting, 10: gaussian process, 11: stochastic gradient descent, 12: passive aggressive, 13: nearest centroid, 14: perceptron, 15: multi-layer perceptron, 16: ada boost] '
            )
            if classifier == 1:
                criterion = input('criterion: [1: gini, 2: entropy] ')
                if criterion == 1:
                    print(type, method, classifier, criterion)
                    model = DecisionTreeClassifier(criterion='gini')
                elif criterion == 2:
                    print(type, method, classifier, criterion)
                    model = DecisionTreeClassifier(criterion='entropy')
                else:
                    print('no criterion chosen')
                    exit()
            elif classifier == 2:
                print(type, method, classifier)
                model = ExtraTreeClassifier()
            elif classifier == 3:
                print(type, method, classifier)
                model = ExtraTreesClassifier()
            elif classifier == 4:
                n = input('n: [1: 1, 2: 3: 3: 5] ')
                if n == 1:
                    print(type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=1)
                elif n == 2:
                    print(type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=3)
                elif n == 3:
                    print(type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=5)
                else:
                    print('no n chosen')
                    exit()
            elif classifier == 5:
                version = input(
                    'version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '
                )
                if version == 1:
                    print(type, method, classifier, version)
                    model = GaussianNB()
                elif version == 2:
                    print(type, method, classifier, version)
                    model = BernoulliNB()
                elif version == 3:
                    print(type, method, classifier, version)
                    model = MultinomialNB()
                elif version == 4:
                    print(type, method, classifier, version)
                    model = ComplementNB()
                else:
                    print('no version chosen')
                    exit()
            elif classifier == 6:
                print(type, method, classifier)
                model = RadiusNeighborsClassifier(radius=1.0)
            elif classifier == 7:
                print(type, method, classifier)
                model = RandomForestClassifier(n_estimators=50, random_state=1)
            elif classifier == 8:
                print(type, method, classifier)
                model = LinearSVC(
                    multi_class='crammer_singer')  #multi_class='ovr'
            elif classifier == 9:
                print(type, method, classifier)
                model = GradientBoostingClassifier()
            elif classifier == 10:
                print(type, method, classifier)
                model = GaussianProcessClassifier(multi_class='one_vs_one')
                # model = GaussianProcessClassifier(multi_class='one_vs_rest')
            elif classifier == 11:
                print(type, method, classifier)
                model = SGDClassifier()
            elif classifier == 12:
                print(type, method, classifier)
                model = PassiveAggressiveClassifier()
            elif classifier == 13:
                print(type, method, classifier)
                model = NearestCentroid()
            elif classifier == 14:
                print(type, method, classifier)
                model = Perceptron(tol=1e-3, random_state=0)
            elif classifier == 15:
                print(type, method, classifier)
                model = MLPClassifier()
            elif classifier == 16:
                print(type, method, classifier)
                model = AdaBoostClassifier(n_estimators=100)
            else:
                print('no classifier chosen')
                exit()
            # train the model using the training sets and check score
            model.fit(x_train, y_train)
            model.score(x_train, y_train)

            # predict output
            predictions = pd.Series(model.predict(x_test))

            filename = '{},{},{}.txt'.format(type, method, classifier)
            with open(filename, 'w') as output:
                output.write('{:10}\t{:10}\t{:10}\t{:10}'.format(
                    'actual', 'predict', 'approximate', 'match?'))
                for i in range(len(predictions)):
                    match = True if (y_test[i] == predictions[i]) else False
                    output.write('{:10}\t{:10}\t{:10}'.format(
                        y_train[i], predictions[i], match))
                output.write('accuracy: {:7.2f}%'.format(
                    100 * accuracy_score(y_test, predictions)))

            print('accuracy: {:7.2f}%'.format(
                100 * accuracy_score(y_test, predictions)))
            print(
                classification_report(
                    y_test,
                    predictions,
                    target_names=['RightTroll', 'LeftTroll', 'Other']))
            print(
                confusion_matrix(y_test,
                                 predictions,
                                 labels=["RightTroll", "LeftTroll", "Other"]))
        elif method == 2:
            # transform into binary classification problem
            # y_train = y_train.apply(lambda x: 0 if x == 'Other' else 1)
            # y_test = y_test.apply(lambda x: 0 if x == 'Other' else 1)

            # transform string labels into integers
            # le = LabelEncoder()
            # le.fit(y_train) # print(le.transform(['LeftTroll', 'Other', 'Other', 'RightTroll'])), print(le.inverse_transform([0, 1, 2, 1]))
            # print(le.classes_)
            #
            # y_train = le.transform(y_train)
            # y_test = le.transform(y_test)

            regressor = input(
                'regressor: [1: linear discriminant analysis, 2: logistic regression, 3: ridge regression, 4: quadratic discriminant analysis, 5: linear regression, 6: decision tree regression, 7: pls regression, 8: pls canonical, 9: canonical correlation analysis, 10: lasso, 11: multi-task lasso, 12: elastic net, 13: multi-task elastic net, 14: least angle regression, 15: least angle regression lasso, 16: orthogonal matching pursuit, 17: bayesian ridge, 18: automatic relevence determination, 19: theil sen regression, 20: huber regressor, 21: random sample consensus] '
            )
            if regressor == 1:
                print(type, method, regressor)
                model = LinearDiscriminantAnalysis()
            elif regressor == 2:
                print(type, method, regressor)
                model = LogisticRegression(
                    solver='lbfgs', multi_class='multinomial')  #'newton-cg'
            elif regressor == 3:
                print(type, method, regressor)
                model = RidgeClassifier()
            elif regressor == 4:
                print(type, method, regressor)
                model = QuadraticDiscriminantAnalysis()
            elif regressor == 5:
                strategy = input('strategy: [1: one vs rest, 2: one vs one] ')
                if strategy == 1:
                    print(type, method, strategy, regressor)
                    model = OneVsRestClassifier(LinearRegression())
                elif strategy == 2:
                    print(type, method, strategy, regressor)
                    model = OneVsOneClassifier(LinearRegression())
                else:
                    print('no strategy selected')
                    exit()
            elif regressor == 6:
                strategy = input('strategy: [1: one vs rest, 2: one vs one] ')
                if strategy == 1:
                    print(type, method, strategy, regressor)
                    model = OneVsRestClassifier(DecisionTreeRegressor())
                elif strategy == 2:
                    print(type, method, strategy, regressor)
                    model = OneVsOneClassifier(DecisionTreeRegressor())
                else:
                    print('no strategy selected')
                    exit()
            elif regressor == 7:
                print(type, method, regressor)
                model = PLSRegression(n_components=2)
            elif regressor == 8:
                print(type, method, regressor)
                model = PLSCanonical(n_components=2)
            elif regressor == 9:
                print(type, method, regressor)
                model = CCA(n_components=1)
            elif regressor == 10:
                print(type, method, regressor)
                model = Lasso(alpha=0.1)
            elif regressor == 11:
                print(type, method, regressor)
                model = MultiTaskLasso(alpha=0.1)
            elif regressor == 12:
                print(type, method, regressor)
                model = ElasticNet(random_state=0)
            elif regressor == 13:
                print(type, method, regressor)
                model = MultiTaskElasticNet(random_state=0)
            elif regressor == 14:
                print(type, method, regressor)
                model = Lars(n_nonzero_coefs=1)
            elif regressor == 15:
                print(type, method, regressor)
                model = LassoLars(alpha=.1)
            elif regressor == 16:
                print(type, method, regressor)
                model = OrthogonalMatchingPursuit()
            elif regressor == 17:
                print(type, method, regressor)
                model = BayesianRidge()
            elif regressor == 18:
                print(type, method, regressor)
                model = ARDRegression()
            elif regressor == 19:
                print(type, method, regressor)
                model = TheilSenRegressor(random_state=0)
            elif regressor == 20:
                print(type, method, regressor)
                model = HuberRegressor()
            elif regressor == 21:
                print(type, method, regressor)
                model = RANSACRegressor(random_state=0)
            else:
                print('no regressor chosen')
                exit()

            # train the model using the training sets and check score
            model.fit(x_train, y_train)
            model.score(x_train, y_train)

            # print('coefficient:', model.coef_)
            # print('intercept:', model.intercept_)

            # predict output
            predictions = pd.Series(model.predict(x_test))
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))

            # calculate accuracy
            numerator = 0.0
            denominator = float(len(predictions))
            for i in range(len(predictions)):
                match = True if (y_test[i] == predictions[i]) else False
                numerator += 1 if match else 0
                print('{:10}\t{:10}\t{:10}'.format(y_train[i], predictions[i],
                                                   match))
            print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))

        else:
            print('no method chosen')
            exit()
    elif type == 2:
        classifier = input(
            'classifier: [1: label propagation, 2: label spreading] ')
        if classifier == 1:
            print(type, classifier)
            model = LabelPropagation()
        elif classifier == 2:
            print(type, classifier)
            model = LabelSpreading()
        else:
            print('no classifier chosen')
            exit()
        # train the model using the training sets and check score
        model.fit(x_train, y_train)
        model.score(x_train, y_train)

        # predict output
        predictions = pd.Series(model.predict(x_test))
        print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))

        # calculate accuracy
        numerator = 0.0
        denominator = float(len(predictions))
        for i in range(len(predictions)):
            match = True if (y_test[i] == predictions[i]) else False
            numerator += 1 if match else 0
            print('{:10}\t{:10}\t{:10}'.format(y_train[i], predictions[i],
                                               match))
        print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
    elif type == 3:
        method = input(
            'method: [1: clustering, 2: random trees embedding, 3: nearest neighbors] '
        )
        if method == 1:
            clusterer = input('clustere: [1: k means]')
            if clusterer == 1:
                clusters = input('clusters: [1: 1, 2: 2, 3: 3] ')
                if clusters == 1:
                    print(type, method, clusters)
                    model = KMeans(n_clusters=1, random_state=0)
                elif clusters == 2:
                    print(type, method, clusters)
                    model = KMeans(n_clusters=2, random_state=0)
                elif clusters == 3:
                    print(type, method, clusters)
                    model = KMeans(n_clusters=3, random_state=0)
                else:
                    print('no clusters chosen')
                    exit()
            else:
                print('no clusterer chosen')
                exit()
            # train the model using the training sets and check score
            model.fit(x_train)

            # predict output
            predictions = model.predict(x_test)
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))

            # check details
            print('centroids: ' + model.cluster_centers_)
            # print('labels: ' + model.labels_)
        elif method == 2:
            model = RandomTreesEmbedding()
            # train the model using the training sets and check score
            model.fit(x_train)

            # predict output
            predictions = model.apply(x_test)
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
        elif method == 3:
            model = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
            # train the model using the training sets and check score
            model.fit(x_train)
            distances, indices = nbrs.kneighbors(X)

        else:
            print('no method chosen')
            exit()

        # calculate accuracy
        numerator = 0.0
        denominator = float(len(predictions))
        for i in range(len(predictions)):
            match = True if (y_test[i] == predictions[i]) else False
            numerator += 1 if match else 0
            print('{:10}\t{:10}\t{:10}'.format(y_train[i], predictions[i],
                                               match))
        print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
    else:
        print('no type chosen')
        exit()
예제 #26
0
# Collect best-params / accuracy rows across 20 differently-seeded splits.
# NOTE(review): `public_data`, `public_labels`, `encoder` and `scalers_to_test`
# are defined elsewhere — confirm they are in scope before this loop runs.
df = pd.DataFrame()


for i in range(1, 21):

       # Stratified train/test split; a different seed per iteration
       X_train, X_test, y_train, y_test = train_test_split(public_data, public_labels, test_size=0.3, 
       stratify=public_labels, random_state=i*500)

       # Encode the string labels as integers (fit on train, reuse on test)
       train_labels_encoded = encoder.fit_transform(y_train)
       test_labels_encoded = encoder.transform(y_test)

       # Pipeline: feature scaling followed by a QDA classifier
       steps = [('scaler', MinMaxScaler()), ('clf', QuadraticDiscriminantAnalysis())]

       pipeline = Pipeline(steps)

       # Grid-search only over the choice of scaler
       parameteres = [{'scaler':scalers_to_test}]

       grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5, n_jobs=-1, verbose=1)

       grid.fit(X_train, y_train)

       score_train = grid.score(X_train, y_train)
       score_test = grid.score(X_test, y_test)
       best_p = grid.best_params_

       # One result row per seed; presumably appended to `df` further down —
       # the appending code is not visible in this fragment.
       bp = pd.DataFrame(best_p, index=[i])
       bp['accuracy_train'] = score_train
예제 #27
0
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

# Iris dataset; the last column holds the class label:
#   1 - Iris Setosa, 2 - Iris Versicolour, 3 - Iris Virginica
samples = np.loadtxt('../../Dataset/iris/iris.data', delimiter=',')
# Use only petal length (col 2) and petal width (col 3) as features.
X = samples[:, 2:4]
y = samples[:, 4]

# Fit a linear discriminant model on the two petal features.
lda = LDA()
lda.fit(X, y)

# Fit the quadratic counterpart on the same data.
qda = QDA()
qda.fit(X, y)

# Refit QDA while sweeping the covariance regularisation strength.
reg_param = [0.001, 0.01, 0.1, 1, 10]
for strength in reg_param:
    qda = QDA(reg_param=strength)
    qda.fit(X, y)
예제 #28
0
def quadratic_discriminant(x_train, y_train, x_test, y_test):
    """Fit a QDA classifier and evaluate it through the shared model harness."""
    return __fit_clf_model('quadratic_discriminant',
                           QuadraticDiscriminantAnalysis(),
                           x_train, y_train, x_test, y_test)
예제 #29
0
파일: train.py 프로젝트: Tempelx/ECG_gender
def train_classifiers(X, y, comment):
    """Train and score a battery of classifiers via grid search.

    BUGFIX: the original docstring was quoted as ``\"\"\"\"\" ... \"\"\" \"\"``,
    which only parsed by accident through implicit string concatenation;
    it is now a plain triple-quoted docstring.

    Parameters
    ----------
    X : np.array
        Feature matrix.
    y : np.array
        Class labels.
    comment : str
        Comment used to annotate the ROC plot.

    Returns
    -------
    pd.DataFrame
        Trained hyperparameter scoring (as produced by
        ``ParameterEstimator.score``).
    """
    # Imports are local so heavyweight optional dependencies (e.g. xgboost)
    # are only needed when this function is actually called.
    from sklearn.svm import SVC
    from xgboost import XGBClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import AdaBoostClassifier

    # Candidate estimators, keyed by name; the keys must match the keys of
    # `tuned_parameters` below.
    models = {
        'SVC': SVC(probability=True),
        'GaussianNB': GaussianNB(),
        'LogisticRegression': LogisticRegression(),
        'MLPClassifier': MLPClassifier(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'XGBClassifier': XGBClassifier(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'RandomForestClassifier': RandomForestClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier()
    }

    # Hyperparameter grids per model; an empty dict means "defaults only".
    tuned_parameters = {
        'SVC': {
            'kernel': ['rbf', 'linear'],
            'gamma': [1e-3, 1e-4],
            'C': [1, 10, 100, 1000, 10000]
        },
        'GaussianNB': {},
        'LogisticRegression': {
            'C': [0.1, 0.5, 1, 10, 100]
        },
        'MLPClassifier': [{
            'solver': ['lbfgs', 'sgd', 'adam']
        }, {
            'alpha': [0.0001, 0.00001, 0.0010]
        }],
        'KNeighborsClassifier': {
            'n_neighbors': [5, 10, 15, 20]
        },
        'XGBClassifier': {},
        'QuadraticDiscriminantAnalysis': {},
        'ExtraTreesClassifier': {
            'n_estimators': [16, 32]
        },
        'RandomForestClassifier': [{
            'n_estimators': [16, 32]
        }, {
            'criterion': ['gini', 'entropy'],
            'n_estimators': [8, 16]
        }],
        'AdaBoostClassifier': {
            'n_estimators': [16, 32]
        },
        'GradientBoostingClassifier': {
            'n_estimators': [16, 32],
            'learning_rate': [0.8, 1.0]
        }
    }

    # Hold out 30% of the data for evaluation; fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)

    trainer = ParameterEstimator(models, tuned_parameters)

    # Grid-search every model, optimising F1.
    trainer.fit(X_train, y_train, scoring='f1', n_jobs=2)

    # Evaluate on the held-out split and produce the annotated ROC plot.
    trainer.fit_test(X_test, y_test, X_train, y_train, trainer.score(),
                     comment)

    return trainer.score()
    classifier = AdaBoostClassifier()
elif model_name == model_names[3]:
    print('Using Random Forest Classifier.')
    classifier = RandomForestClassifier(class_weight='balanced')
elif model_name == model_names[4]:
    print('Using Naive Bayes Classifier.')
    classifier = GaussianNB()
elif model_name == model_names[5]:
    print('Using Support Vector Machines Classifier.')
    classifier = SVC(probability=True, class_weight='balanced')
elif model_name == model_names[6]:
    print('Using Linear Discriminant Analysis.')
    classifier = LinearDiscriminantAnalysis()
elif model_name == model_names[7]:
    print('Using Quadratic Discriminant Analysis.')
    classifier = QuadraticDiscriminantAnalysis()
else:
    print('Unknown option for classifier, using naive bayes as default one!')
    model_name = 'naive_bayes'
    classifier = GaussianNB()

# Feature scaling: the scaler is created here; fitting presumably happens
# inside the cross-validation loop further down — confirm downstream usage.
sc = StandardScaler()

print('Using Repeated Stratified K-Fold Cross Validation.\n')
# 4 splits x 10 repeats; the fixed seed keeps fold assignment reproducible.
rskf = RepeatedStratifiedKFold(n_splits=4, n_repeats=10, random_state=36851234)

# Per-fold metric accumulators (filled in by the evaluation loop below).
accuracy_list = []
pr_auc_list = []
roc_auc_list = []
예제 #31
0
    def learn(self, fname):
        """Train the ensemble of classifiers from a labelled CSV file.

        The first CSV row is treated as the header. In every data row the
        first column is the class label (y) and the remaining columns are
        features (x). Non-numeric feature values are mapped to stable
        integer ids via ``self.naming``.

        Parameters
        ----------
        fname : str
            Path to the training CSV file.
        """
        t = time.time()
        # Load the CSV file, converting values to floats and interning
        # non-numeric strings as integer ids.
        self.header = []
        rows = []
        naming_num = 0
        with open(fname, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for i, row in enumerate(reader):
                if i == 0:
                    self.header = row
                else:
                    for j, val in enumerate(row):
                        if val == '':
                            row[j] = 0
                            continue
                        try:
                            row[j] = float(val)
                        except ValueError:
                            # Not numeric: assign (or reuse) an integer id.
                            if val not in self.naming['from']:
                                self.naming['from'][val] = naming_num
                                self.naming['to'][naming_num] = val
                                naming_num += 1
                            row[j] = self.naming['from'][val]
                    rows.append(row)

        # First column in each row is the classification target, y.
        y = numpy.zeros(len(rows))
        x = numpy.zeros((len(rows), len(rows[0]) - 1))

        # Shuffle the record order for training.
        record_range = list(range(len(rows)))
        shuffle(record_range)
        for i in record_range:
            y[i] = rows[i][0]
            x[i, :] = numpy.array(rows[i][1:])

        names = [
            "Nearest Neighbors",
            "Linear SVM",
            "RBF SVM",
            # "Gaussian Process",  # disabled: takes too long
            "Decision Tree",
            "Random Forest",
            "Neural Net",
            "AdaBoost",
            "Naive Bayes",
            "QDA"
        ]
        classifiers = [
            KNeighborsClassifier(3),
            SVC(kernel="linear", C=0.025, probability=True),
            SVC(gamma=2, C=1, probability=True),
            # GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
            DecisionTreeClassifier(max_depth=5),
            RandomForestClassifier(max_depth=5,
                                   n_estimators=10,
                                   max_features=1),
            MLPClassifier(alpha=1),
            AdaBoostClassifier(),
            GaussianNB(),
            QuadraticDiscriminantAnalysis()
        ]
        self.algorithms = {}
        for name, clf in zip(names, classifiers):
            t2 = time.time()
            self.algorithms[name] = clf
            try:
                self.algorithms[name].fit(x, y)
            except Exception:
                # Best-effort: some classifiers cannot fit this data; keep
                # going with the rest of the ensemble.
                pass
            # BUGFIX: elapsed time was computed as (start - now), which is
            # always negative; measure (now - start) instead.
            self.logger.debug("learned {}, {:d} ms".format(
                name, int(1000 * (time.time() - t2))))

        t2 = time.time()
        name = "Extended Naive Bayes"
        clf = ExtendedNaiveBayes(self.family, path_to_data=self.path_to_data)
        clf.fit(fname)
        self.logger.debug("learned {}, {:d} ms".format(
            name, int(1000 * (time.time() - t2))))

        # BUGFIX: total duration, same sign error as above.
        self.logger.debug("{:d} ms".format(int(1000 * (time.time() - t))))
예제 #32
0
def main():
    """Benchmark several classifiers across all folds of the HASY dataset.

    Writes a per-classifier fitting report to ``classifier-comp.md`` and
    prints a summary table at the end.
    """
    # Candidate models, evaluated in order. (A number of experimental
    # configurations — RBM pipelines, SVMs, k-NN, neural nets, gradient
    # boosting — were previously listed here but are disabled.)
    classifiers = [
        ('Decision Tree', DecisionTreeClassifier(max_depth=5)),
        ('Random Forest',
         RandomForestClassifier(n_estimators=50, n_jobs=10, max_features=50)),
        ('AdaBoost', AdaBoostClassifier()),
        ('Naive Bayes', GaussianNB()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('QDA', QuadraticDiscriminantAnalysis()),
    ]

    data = get_data('hasy')

    # Per-classifier list of per-fold timing/accuracy records.
    classifier_data = {}
    with open('classifier-comp.md', 'w') as report:
        for clf_name, clf in classifiers:
            print(clf_name)
            classifier_data[clf_name] = []
            report.write("#" * 80)
            report.write("\n")
            report.write("Start fitting '%s' classifier.\n" % clf_name)
            for fold in range(len(data)):
                fold_data = data[fold]
                print(fold_data['test']['X'].shape)
                print(fold_data['test']['y'].shape)

                print("Got %i training samples and %i test samples." % (len(
                    fold_data['train']['X']), len(fold_data['test']['X'])))
                # Effectively "use every sample" — the slice never truncates.
                examples = 10**9
                t0 = time.time()
                clf.fit(fold_data['train']['X'][:examples],
                        fold_data['train']['y'][:examples])
                t1 = time.time()
                an_data = analyze(clf,
                                  fold_data,
                                  t1 - t0,
                                  clf_name=clf_name,
                                  handle=report)
                classifier_data[clf_name].append({
                    'training_time': t1 - t0,
                    'testing_time': an_data['testing_time'],
                    'accuracy': an_data['accuracy'],
                })

    pretty_print(classifier_data)
예제 #33
0
    splot.set_xticks(())
    splot.set_yticks(())


def plot_lda_cov(lda, splot):
    """Draw the shared LDA covariance ellipse for both class means."""
    for mean, color in ((lda.means_[0], 'red'), (lda.means_[1], 'blue')):
        plot_ellipse(splot, mean, lda.covariance_, color)


def plot_qda_cov(qda, splot):
    """Draw a per-class covariance ellipse for a fitted QDA model."""
    for idx, color in enumerate(('red', 'blue')):
        plot_ellipse(splot, qda.means_[idx], qda.covariances_[idx], color)

###############################################################################
# Fit LDA and QDA on both synthetic datasets and plot the decision regions
# plus covariance ellipses side by side.
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # Linear Discriminant Analysis (keep the fitted covariance for plotting)
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    # NOTE(review): passing store_covariances to fit() only works on very old
    # scikit-learn; modern versions take store_covariance (singular) in the
    # QuadraticDiscriminantAnalysis constructor — verify the pinned version.
    qda = QuadraticDiscriminantAnalysis()
    y_pred = qda.fit(X, y, store_covariances=True).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis')
plt.show()
예제 #34
0
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

# The set of classifiers to compare, all with (mostly) default settings.
classifier = [  KNeighborsClassifier(3), SVC(probability= True), DecisionTreeClassifier(), RandomForestClassifier(),
                AdaBoostClassifier(), GradientBoostingClassifier(), GaussianNB(), LinearDiscriminantAnalysis(),
                QuadraticDiscriminantAnalysis(), LogisticRegression()]

# Accumulates one (classifier name, accuracy) row per model.
log = pd.DataFrame(columns=["Classifier", "Accuracy"])

# 10 stratified shuffles, 10% held out each time; fixed seed for repeatability.
stshsp =    StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

# NOTE(review): `train` is defined elsewhere; this slicing assumes a 2-D array
# with the label in column 0 and features in the remaining columns — confirm.
a = train[0::, 1::]
b = train[0::, 0]

acct_dict = {}

# Per-split train/test partitions; the evaluation body is truncated in this
# fragment (continues beyond this excerpt).
for train_index, test_index in stshsp.split(a,b):
    a_train, a_test = a[train_index], a[test_index]
    b_train, b_test = b[train_index], b[test_index]
    
예제 #35
0
파일: _classif.py 프로젝트: dmalt/brainpipe
    def __new__(self,
                y,
                clf='lda',
                kern='rbf',
                n_knn=10,
                n_tree=100,
                priors=False,
                **kwargs):
        """Build and return a configured scikit-learn classifier object.

        `clf` selects the classifier either by name ('lda', 'svm', ...) or by
        index (0-8); any other object is treated as a ready-made classifier
        and passed through. The returned object is annotated with `lgStr`
        (long description) and `shStr` (short tag) attributes for display.
        """
        # Use a pre-defined classifier :
        if isinstance(clf, (str, int)):
            # Default value for priors : uniform over the classes present in y.
            # NOTE(review): this unconditionally overwrites the `priors`
            # argument, so a caller-supplied value is never honoured (and the
            # computed priors are only forwarded to the LDA branch) — confirm
            # whether that is intentional.
            priors = np.array([1 / len(np.unique(y))] * len(np.unique(y)))

            # Selection by name is case-insensitive.
            if isinstance(clf, str):
                clf = clf.lower()

            # LDA :
            if clf == 'lda' or clf == 0:
                clfObj = LinearDiscriminantAnalysis(priors=priors, **kwargs)
                clfObj.lgStr = 'Linear Discriminant Analysis'
                clfObj.shStr = 'LDA'

            # SVM : kern may be 'linear', 'poly', 'rbf', 'sigmoid',
            # 'precomputed' or a callable (forwarded to SVC's kernel).
            elif clf == 'svm' or clf == 1:
                clfObj = SVC(kernel=kern, probability=True, **kwargs)
                clfObj.lgStr = 'Support Vector Machine (kernel=' + kern + ')'
                clfObj.shStr = 'SVM-' + kern

            # Linear SVM:
            elif clf == 'linearsvm' or clf == 2:
                clfObj = LinearSVC(**kwargs)
                clfObj.lgStr = 'Linear Support Vector Machine'
                clfObj.shStr = 'LSVM'

            # Nu SVM :
            elif clf == 'nusvm' or clf == 3:
                clfObj = NuSVC(**kwargs)
                clfObj.lgStr = 'Nu Support Vector Machine'
                clfObj.shStr = 'NuSVM'

            # Naive Bayesian :
            elif clf == 'nb' or clf == 4:
                clfObj = GaussianNB(**kwargs)
                # NOTE(review): 'Baysian' looks like a typo for 'Bayesian' in
                # this display string (left unchanged here).
                clfObj.lgStr = 'Naive Baysian'
                clfObj.shStr = 'NB'

            # KNN :
            elif clf == 'knn' or clf == 5:
                clfObj = KNeighborsClassifier(n_neighbors=n_knn, **kwargs)
                clfObj.lgStr = 'k-Nearest Neighbor (neighbor=' + str(
                    n_knn) + ')'
                clfObj.shStr = 'KNN-' + str(n_knn)

            # Random forest :
            elif clf == 'rf' or clf == 6:
                clfObj = RandomForestClassifier(n_estimators=n_tree, **kwargs)
                clfObj.lgStr = 'Random Forest (tree=' + str(n_tree) + ')'
                clfObj.shStr = 'RF-' + str(n_tree)

            # Logistic regression :
            elif clf == 'lr' or clf == 7:
                clfObj = LogisticRegression(**kwargs)
                clfObj.lgStr = 'Logistic Regression'
                clfObj.shStr = 'LogReg'

            # QDA :
            elif clf == 'qda' or clf == 8:
                clfObj = QuadraticDiscriminantAnalysis(**kwargs)
                clfObj.lgStr = 'Quadratic Discriminant Analysis'
                clfObj.shStr = 'QDA'

            else:
                raise ValueError('No classifier "' + str(clf) + '"" found')

        # Use a custom classifier : pass the object through, tagged as custom.
        else:
            clfObj = clf
            clfObj.shStr = 'custom'
            clfObj.lgStr = 'Custom classifier'

        return clfObj
예제 #36
0
        log_model = lr_model
    if log == 'nb':
        from sklearn.naive_bayes import GaussianNB
        lr_model = GaussianNB()
        log_model = lr_model
    if log == 'knn':
        from sklearn.neighbors import KNeighborsClassifier
        lr_model = KNeighborsClassifier(n_neighbors=35, weights='distance')
        log_model = lr_model
    if log == 'lda':
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        lr_model = LinearDiscriminantAnalysis()
        log_model = lr_model
    if log == 'qda':
        from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
        lr_model = QuadraticDiscriminantAnalysis()
        log_model = lr_model
    # Run CV

    for i, (train_index, test_index) in enumerate(kf.split(X)):

        # Create data for this fold
        y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
        X_train, X_valid = X.iloc[train_index, :].copy(), X.iloc[
            test_index, :].copy()
        X_test = test_df.copy()
        print("\nFold ", i)

        # Run model for this fold
        fit_model = log_model.fit(X_train, y_train)
예제 #37
0
def save_qda(X, y):
    """Fit QDA on (X, y), predict on the training data itself, and write
    the resulting model report to ./results/Qda.txt.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels.
    """
    qda = QDA().fit(X, y)
    y_pred = qda.predict(X)
    # BUGFIX: the file handle was never closed; use a context manager so the
    # report is flushed and the descriptor released even on error.
    with open("./results/Qda.txt", "w") as f:
        f.write(get_model_report(y, y_pred))
예제 #38
0
    "Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
    "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes",
    "QDA", "Logistic Regression", "Logistic Regression CV"
]

# Classifier instances, one per entry in the parallel `names` list above
# (same order); commented entries there must stay in sync with this list.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, probability=True),
    SVC(gamma=2, C=1, probability=True),
    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(),
    LogisticRegressionCV()
]


def merge_row(nparr1, nparr2):
    """Concatenate two arrays column-wise (along axis 1)."""
    return np.concatenate((nparr1, nparr2), axis=1)


def average(nparr1, nparr2, r=(1.0, 1.0)):
    """Weighted element-wise combination: r[0]*nparr1 + r[1]*nparr2."""
    scaled_first = r[0] * nparr1
    scaled_second = r[1] * nparr2
    return scaled_first + scaled_second


# Parallel lists: human-readable merge-mode names and their implementations.
merge_types = ["merge", "average"]
merge_funcs = [merge_row, average]
예제 #39
0
def qda_classifier(Xtrain, ytrain, Xtest, ytest):
    """Train QDA on the training split and score its test-set predictions."""
    model = QuadraticDiscriminantAnalysis()
    model.fit(Xtrain, ytrain)
    predictions = model.predict(Xtest)
    return evaluator('QDA', predictions, ytest)
# Drop class 0 and shift the remaining labels down to {0, 1}.
X = X[y > 0]
y = y[y > 0]
y -= 1
target_names = iris.target_names[1:]

################################################################################
# LDA
lda = LDA()
# NOTE(review): leftover debug prints ("JJJJ"/"UUUUU") — candidates for removal.
print(X,"JJJJ")
print(y,"UUUUU")
# NOTE(review): fit() returns the estimator itself, not predictions, so
# `y_pred` here is the fitted LDA object (the QDA line below chains .predict).
y_pred = lda.fit(X, y)
print(lda.coef_)
print(lda.intercept_)

# QDA
qda = QDA()
y_pred = qda.fit(X, y).predict(X)

###############################################################################
# Plot results

def plot_ellipse(splot, mean, cov, color):
    """Build a filled 2-sigma covariance ellipse for a 2-D Gaussian.

    The ellipse axes come from the eigendecomposition of `cov`; its
    orientation is the angle of the first eigenvector.
    """
    v, w = linalg.eigh(cov)
    u = w[0] / linalg.norm(w[0])
    angle = np.arctan(u[1]/u[0])
    angle = 180 * angle / np.pi # convert to degrees
    # filled gaussian at 2 standard deviation
    ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5,
                                            180 + angle, color=color)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(0.5)
    # NOTE(review): the ellipse is never added to the axes here (the upstream
    # version of this example follows with splot.add_artist(ell)); this
    # fragment appears truncated — confirm against the original example.
예제 #41
0
def discriminatePlot(X, y, cVal, titleStr=''):
    """Robust wrapper for discriminant analysis: LDA, QDA and Random Forest.

    Cleans the data (drops classes with fewer than MINCOUNT samples),
    projects onto a PCA subspace small enough for covariance fitting,
    estimates cross-validated performance of the three classifiers,
    refits on all data and plots each classifier's decision surface in
    the first two discriminant (DFA) dimensions.

    Arguments:
        X        -- np array, n rows (data points) x p columns (features)
        y        -- group label for each of the n rows
        cVal     -- RGB color code per data point; points belonging to the
                    same group share the same color
        titleStr -- title prefix used on the plots

    Returns:
        ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE,
        nClasses -- mean cross-validated accuracy (in %) and its std for
        each classifier plus the number of discriminated classes, or
        seven -1 values when fewer than two classes have enough data.

    NOTE(review): written for Python 2 (print statements) and an old
    scikit-learn API (cross_validation.StratifiedKFold, removed in 0.20);
    this function cannot run unmodified on Python 3 / modern scikit-learn.
    """

    # Global Parameters
    CVFOLDS = 10          # requested number of cross-validation folds
    MINCOUNT = 10         # minimum samples per class to keep the class at all
    MINCOUNTTRAINING = 5  # minimum samples per class within a training fold

    # Initialize Variables and clean up data: keep only the rows whose
    # class has at least MINCOUNT samples.
    classes, classesCount = np.unique(y, return_counts = True)  # Classes to be discriminated should be same as ldaMod.classes_
    goodIndClasses = np.array([n >= MINCOUNT for n in classesCount])
    goodInd = np.array([b in classes[goodIndClasses] for b in y])
    yGood = y[goodInd]
    XGood = X[goodInd]
    cValGood = cVal[goodInd]


    classes, classesCount = np.unique(yGood, return_counts = True) 
    nClasses = classes.size         # Number of classes or groups  

    # Do we have enough data?  
    if (nClasses < 2):
        print 'Error in ldaPLot: Insufficient classes with minimun data (%d) for discrimination analysis' % (MINCOUNT)
        return -1, -1, -1, -1 , -1, -1, -1
    # Cannot stratify into more folds than the smallest class has samples.
    cvFolds = min(min(classesCount), CVFOLDS)
    if (cvFolds < CVFOLDS):
        print 'Warning in ldaPlot: Cross-validation performed with %d folds (instead of %d)' % (cvFolds, CVFOLDS)
   
    # Data size and color values   
    nD = XGood.shape[1]                 # number of features in X
    nX = XGood.shape[0]                 # number of data points in X
    cClasses = []   # Color code for each class
    for cl in classes:
        # The first data point of each class supplies that class's RGB
        # color; an alpha channel of 1.0 is appended to make it RGBA.
        icl = (yGood == cl).nonzero()[0][0]
        cClasses.append(np.append(cValGood[icl],1.0))
    cClasses = np.asarray(cClasses)
    myPrior = np.ones(nClasses)*(1.0/nClasses)   # flat prior over classes

    # Perform a PCA for dimensionality reduction so that the covariance matrix can be fitted.
    # Heuristic: keep at most sqrt(nX/5) dimensions, i.e. roughly 5
    # samples per covariance parameter.
    nDmax = int(np.fix(np.sqrt(nX/5)))
    if nDmax < nD:
        print 'Warning: Insufficient data for', nD, 'parameters. PCA projection to', nDmax, 'dimensions.' 
    nDmax = min(nD, nDmax)
    pca = PCA(n_components=nDmax)
    Xr = pca.fit_transform(XGood)
    print 'Variance explained is %.2f%%' % (sum(pca.explained_variance_ratio_)*100.0)
    
    
    # Initialise Classifiers  
    ldaMod = LDA(n_components = min(nDmax,nClasses-1), priors = myPrior, shrinkage = None, solver = 'svd') 
    qdaMod = QDA(priors = myPrior)
    rfMod = RF()   # by default assumes equal weights

        
    # Perform CVFOLDS fold cross-validation to get performance of classifiers.
    ldaScores = np.zeros(cvFolds)
    qdaScores = np.zeros(cvFolds)
    rfScores = np.zeros(cvFolds)
    skf = cross_validation.StratifiedKFold(yGood, cvFolds)  # NOTE(review): pre-0.18 sklearn API
    iskf = 0
    
    for train, test in skf:
        
        # Enforce the MINCOUNT in each class for Training
        trainClasses, trainCount = np.unique(yGood[train], return_counts=True)
        goodIndClasses = np.array([n >= MINCOUNTTRAINING for n in trainCount])
        goodIndTrain = np.array([b in trainClasses[goodIndClasses] for b in yGood[train]])

        # Specity the training data set, the number of groups and priors
        yTrain = yGood[train[goodIndTrain]]
        XrTrain = Xr[train[goodIndTrain]]

        trainClasses, trainCount = np.unique(yTrain, return_counts=True) 
        ntrainClasses = trainClasses.size
        
        # Skip this cross-validation fold because of insufficient data
        if ntrainClasses < 2:
            continue
        # Keep only test points whose class is represented in training.
        goodInd = np.array([b in trainClasses for b in yGood[test]])    
        if (goodInd.size == 0):  # NOTE(review): goodInd is a boolean mask over the test set, so
            continue             # .size is 0 only when the test set itself is empty; goodInd.sum() == 0 was probably intended
           
        # Fit the data
        trainPriors = np.ones(ntrainClasses)*(1.0/ntrainClasses)
        ldaMod.priors = trainPriors
        qdaMod.priors = trainPriors
        ldaMod.fit(XrTrain, yTrain)
        qdaMod.fit(XrTrain, yTrain)        
        rfMod.fit(XrTrain, yTrain)
        

        ldaScores[iskf] = ldaMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        qdaScores[iskf] = qdaMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        rfScores[iskf] = rfMod.score(Xr[test[goodInd]], yGood[test[goodInd]])

        iskf += 1
     
    # If some folds were skipped, shrink the score arrays accordingly.
    if (iskf !=  cvFolds):
        cvFolds = iskf
        ldaScores.reshape(cvFolds)  # NOTE(review): ndarray.reshape returns a new array; these three
        qdaScores.reshape(cvFolds)  # results are discarded, so trailing zeros from skipped folds
        rfScores.reshape(cvFolds)   # still bias the means below -- ldaScores = ldaScores[:cvFolds] was probably intended
      
# Refit with all the data  for the plots
        
    ldaMod.priors = myPrior
    qdaMod.priors = myPrior
    Xrr = ldaMod.fit_transform(Xr, yGood)  # project into discriminant (DFA) space
    # Check labels
    for a, b in zip(classes, ldaMod.classes_):
        if a != b:
            print 'Error in ldaPlot: labels do not match'
  
    # Print the coefficients of first 3 DFA 
    print 'LDA Weights:'
    print 'DFA1:', ldaMod.coef_[0,:]
    if nClasses > 2:
        print 'DFA2:', ldaMod.coef_[1,:] 
    if nClasses > 3:
        print 'DFA3:', ldaMod.coef_[2,:] 
        
    # Obtain fits in this rotated space for display purposes   
    ldaMod.fit(Xrr, yGood)    
    qdaMod.fit(Xrr, yGood)
    rfMod.fit(Xrr, yGood)
    
    XrrMean = Xrr.mean(0)
                
    # Make a mesh for plotting
    x1, x2 = np.meshgrid(np.arange(-6.0, 6.0, 0.1), np.arange(-6.0, 6.0, 0.1))
    xm1 = np.reshape(x1, -1)
    xm2 = np.reshape(x2, -1)
    nxm = np.size(xm1)
    Xm = np.zeros((nxm, Xrr.shape[1]))
    Xm[:,0] = xm1
    if Xrr.shape[1] > 1 :
        Xm[:,1] = xm2
        
    # Any discriminant dimensions beyond the first two are frozen at
    # their mean value for the mesh predictions.
    for ix in range(2,Xrr.shape[1]):
        Xm[:,ix] = np.squeeze(np.ones((nxm,1)))*XrrMean[ix]
        
    XmcLDA = np.zeros((nxm, 4))  # RGBA values for color for LDA
    XmcQDA = np.zeros((nxm, 4))  # RGBA values for color for QDA
    XmcRF = np.zeros((nxm, 4))  # RGBA values for color for RF

    
    # Predict values on mesh for plotting based on the first two DFs     
    yPredLDA = ldaMod.predict_proba(Xm) 
    yPredQDA = qdaMod.predict_proba(Xm) 
    yPredRF = rfMod.predict_proba(Xm)

    
    # Transform the predictions in color codes: the winning class sets
    # the hue, the normalized winning probability sets the alpha.
    maxLDA = yPredLDA.max()
    for ix in range(nxm) :
        cWeight = yPredLDA[ix,:]                               # Prob for all classes
        cWinner = ((cWeight == cWeight.max()).astype('float')) # Winner takes all 
        # XmcLDA[ix,:] = np.dot(cWeight, cClasses)/nClasses
        XmcLDA[ix,:] = np.dot(cWinner, cClasses)
        XmcLDA[ix,3] = cWeight.max()/maxLDA
    
    # Plot the surface of probability    
    plt.figure(facecolor='white', figsize=(10,3))
    plt.subplot(131)
    Zplot = XmcLDA.reshape(np.shape(x1)[0], np.shape(x1)[1],4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        # Two-class case: presumably a single discriminant dimension, so
        # the second axis is random jitter for display -- TODO confirm.
        plt.scatter(Xrr,(np.random.rand(Xrr.size)-0.5)*12.0 , c=cValGood, s=40, zorder=1) 
    plt.title('%s: LDA pC %.0f %%' % (titleStr, (ldaScores.mean()*100.0)))
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))    
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')

    
    # Transform the predictions in color codes
    maxQDA = yPredQDA.max()
    for ix in range(nxm) :
        cWeight = yPredQDA[ix,:]                               # Prob for all classes
        cWinner = ((cWeight == cWeight.max()).astype('float')) # Winner takes all 
        # XmcLDA[ix,:] = np.dot(cWeight, cClasses)/nClasses
        XmcQDA[ix,:] = np.dot(cWinner, cClasses)
        XmcQDA[ix,3] = cWeight.max()/maxQDA
    
    # Plot the surface of probability    
    plt.subplot(132)
    Zplot = XmcQDA.reshape(np.shape(x1)[0], np.shape(x1)[1],4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        plt.scatter(Xrr,(np.random.rand(Xrr.size)-0.5)*12.0 , c=cValGood, s=40, zorder=1) 
    plt.title('%s: QDA pC %.0f %%' % (titleStr, (qdaScores.mean()*100.0)))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))
    
    
    # Transform the predictions in color codes
    maxRF = yPredRF.max()
    for ix in range(nxm) :
        cWeight = yPredRF[ix,:]           # Prob for all classes
        cWinner = ((cWeight == cWeight.max()).astype('float')) # Winner takes all 
        # XmcLDA[ix,:] = np.dot(cWeight, cClasses)/nClasses  # Weighted colors does not work
        XmcRF[ix,:] = np.dot(cWinner, cClasses)
        XmcRF[ix,3] = cWeight.max()/maxRF
    
    # Plot the surface of probability    
    plt.subplot(133)
    Zplot = XmcRF.reshape(np.shape(x1)[0], np.shape(x1)[1],4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:    
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        plt.scatter(Xrr,(np.random.rand(Xrr.size)-0.5)*12.0 , c=cValGood, s=40, zorder=1) 
    plt.title('%s: RF pC %.0f %%' % (titleStr, (rfScores.mean()*100.0)))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))
    
    plt.show()


    # Results: per-fold accuracy mean and std, expressed in percent.
    ldaScore = ldaScores.mean()*100.0
    qdaScore = qdaScores.mean()*100.0
    rfScore = rfScores.mean()*100.0
    ldaScoreSE = ldaScores.std() * 100.0
    qdaScoreSE = qdaScores.std() * 100.0 
    rfScoreSE = rfScores.std() * 100.0 
    
    print ("Number of classes %d. Chance level %.2f %%") % (nClasses, 100.0/nClasses)
    print ("%s LDA: %.2f (+/- %0.2f) %%") % (titleStr, ldaScore, ldaScoreSE)
    print ("%s QDA: %.2f (+/- %0.2f) %%") % (titleStr, qdaScore, qdaScoreSE)
    print ("%s RF: %.2f (+/- %0.2f) %%") % (titleStr, rfScore, rfScoreSE)
    return ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses
def analysis_results(options):
    """
    Analyzes the results of the comparisons
    """

    # Start marker for time measure
    start = time.time()

    print(
        "\n\t\t------------------------------------------------------------------------------------------------------------------------\n"
    )
    print(
        "\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Analysis by targets\n"
    )
    print(
        "\t\t------------------------------------------------------------------------------------------------------------------------\n"
    )

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles, comparisons and analysis
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    analysis_dir = os.path.join(options.workspace, "analysis")
    check_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    if options.consider_se:
        consider_se = True
    else:
        consider_se = False

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)

    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = cPickle.load(open(pair2comb_file))

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    if not fileExist(output_dataframe):

        # Create a data frame to store the results
        df = pd.DataFrame(columns=columns)

        # Obtain all the results subfolders of the results main folder
        results_dir_list = [
            f for f in os.listdir(results_dir)
            if os.path.isdir(os.path.join(results_dir, f))
        ]

        for comparison in results_dir_list:

            drug_id1, drug_id2 = comparison.split('---')
            comparison_dir = os.path.join(results_dir, comparison)
            results_table = os.path.join(comparison_dir, 'results_table.tsv')

            # Add the Comb field (if it is drug combination or not)
            drug1 = drug_id1.split('_')[0].upper()
            drug2 = drug_id2.split('_')[0].upper()
            comparison_without_id = '{}---{}'.format(drug1, drug2)
            if comparison_without_id in pair2comb:
                combination_field = pair2comb[comparison_without_id]
            else:
                print(
                    'The comparison {} is not in the pair2comb dictionary!\n'.
                    format(comparison_without_id))
                print(pair2comb)
                sys.exit(10)

            if not fileExist(results_table):
                print('The comparison {} has not been executed properly!\n'.
                      format(comparison))
                sys.exit(10)

            results = get_results_from_table(results_table, columns,
                                             combination_field)

            df2 = pd.DataFrame([results], columns=columns, index=[comparison])
            # Add the information to the main data frame
            df = df.append(df2)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)

    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the None values in dcstructure by nan
    if 'None' in df['dcstructure']:
        df = df.replace(to_replace={'dcstructure': {'None': np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing missing values:\t{}\n'.
          format(num_dc))
    print(
        'Number of non-drug combinations after removing missing values:\t{}\n'.
        format(num_ndc))

    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir,
                                           'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir,
                                               'me_too_drug_comb_pairs.pcl')

    if not fileExist(me_too_drug_pairs_file) or not fileExist(
            me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(
            df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w'))
        cPickle.dump(me_too_drug_comb_pairs,
                     open(me_too_drug_comb_pairs_file, 'w'))

    else:

        me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file))
        me_too_drug_comb_pairs = cPickle.load(
            open(me_too_drug_comb_pairs_file))

    # Process me-too drug combination pairs
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[
                drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs which appear in me-too pairs of drug pairs more times
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print(
        'Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'
        .format(num_dc))
    print(
        'Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'
        .format(num_ndc))

    #-------------------------------------#
    #   EVALUATE PERFORMANCE BY TARGETS   #
    #-------------------------------------#

    img_dir = os.path.join(analysis_dir, 'figures')
    create_directory(img_dir)
    fig_format = 'png'

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)

    # Number of targets
    num_targets = [1, 3, 5, 7, 9]

    # Names of the methods
    if consider_se:
        if options.different_atc:
            types_analysis = [
                'dctargets', 'dcguild', 'dcstructure', 'dcse', 'random'
            ]
            types_analysis2 = ['dctargets', 'dcguild', 'dcstructure',
                               'dcse']  # Without random!!
            #types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'dcSE', 'Random']
            types_analysis_labels = [
                'Target', 'PPI', 'Structure', 'Side Effects', 'Random'
            ]
        else:
            types_analysis = [
                'dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse',
                'random'
            ]
            types_analysis2 = [
                'dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse'
            ]  # Without random!!
            #types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'dcATC', 'dcSE', 'Random']
            types_analysis_labels = [
                'Target', 'PPI', 'Structure', 'ATC', 'Side Effects', 'Random'
            ]
    else:
        types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'random']
        types_analysis2 = ['dctargets', 'dcguild',
                           'dcstructure']  # Without random!!
        #types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'Random']
        types_analysis_labels = ['Target', 'PPI', 'Structure', 'Random']

    # Machine learning parameters
    repetitions = 25  # Number of repetititons
    n_fold = 2  # Number of folds
    min_num_dc_group = 10
    greater_or_smaller = 'greater'
    classifier = 'SVC best 1'
    classifiers = {
        'KNeighbors':
        KNeighborsClassifier(3),
        'SVC':
        SVC(probability=True),
        'SVC linear':
        SVC(kernel="linear", C=0.025),
        'SVC rbf':
        SVC(gamma=2, C=1),
        'DecisionTree':
        DecisionTreeClassifier(max_depth=5),
        'RandomForest':
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'MLP':
        MLPClassifier(alpha=1),
        'AdaBoost':
        AdaBoostClassifier(),
        'GaussianNB':
        GaussianNB(),
        'QuadraticDiscr.':
        QuadraticDiscriminantAnalysis(),
        'SVC best 1':
        SVC(kernel="rbf", gamma=0.01, C=100, probability=True),
        'SVC best 2':
        SVC(kernel="rbf", gamma=0.1, C=1.0, probability=True)
    }

    if options.pca:
        pca_str = '_withPCA'
    else:
        pca_str = '_withoutPCA'
    if options.different_atc:
        atc_str = '_diff_ATC'
    else:
        atc_str = ''

    # Plot of distributions of AUC
    plot_auc_distribution = os.path.join(
        img_dir,
        'numtargets_auc_distribution{}{}.{}'.format(atc_str, pca_str,
                                                    fig_format))

    # Plot of accuracy/sensitivity name
    acc_sens_dctargets = os.path.join(
        img_dir,
        'numtargets_accsens_dctargets{}{}.{}'.format(atc_str, pca_str,
                                                     fig_format))
    acc_sens_dcguild = os.path.join(
        img_dir,
        'numtargets_accsens_dcguild{}{}.{}'.format(atc_str, pca_str,
                                                   fig_format))
    acc_sens_dcstructure = os.path.join(
        img_dir,
        'numtargets_accsens_dcstructure{}{}.{}'.format(atc_str, pca_str,
                                                       fig_format))
    acc_sens_dcatc = os.path.join(
        img_dir,
        'numtargets_accsens_dcatc{}{}.{}'.format(atc_str, pca_str, fig_format))
    acc_sens_dcse = os.path.join(
        img_dir,
        'numtargets_accsens_dcse{}{}.{}'.format(atc_str, pca_str, fig_format))

    # Results table
    results_table = os.path.join(
        tables_dir, 'numtargets_auc_table{}{}.txt'.format(atc_str, pca_str))

    # Accuracy/Sensitivity results table
    prec_rec_table = os.path.join(
        tables_dir,
        'numtargets_accsens_table{}{}.txt'.format(atc_str, pca_str))

    # File with results of Mann Whitney tests
    mannwhitney_file = os.path.join(
        tables_dir, 'numtargets_mannwhitney{}{}.txt'.format(atc_str, pca_str))

    # Get the targets file
    drugbank_to_targets_file = os.path.join(toolbox_dir,
                                            'drugbank_to_targets.pcl')
    drugbank_to_targets = cPickle.load(open(drugbank_to_targets_file))

    # Get the ATC file
    drugbank_to_atcs_file = os.path.join(toolbox_dir, 'drugbank_to_atcs.pcl')
    drugbank_to_atcs = cPickle.load(open(drugbank_to_atcs_file))

    # Get the DIANA IDs file
    diana_id_to_drugbank_file = os.path.join(toolbox_dir,
                                             'diana_id_to_drugbank.pcl')
    diana_id_to_drugbank = cPickle.load(open(diana_id_to_drugbank_file))

    #-------------------------------------------------#
    #   SELECT DRUG COMBINATIONS WITH DIFFERENT ATC   #
    #-------------------------------------------------#

    if options.different_atc:

        selected_rows = []
        for index, row in df.iterrows():
            (drug_id1, drug_id2) = index.split('---')
            drug1 = diana_id_to_drugbank[drug_id1].upper()
            drug2 = diana_id_to_drugbank[drug_id2].upper()

            atcs_drug1 = set([atc[0] for atc in drugbank_to_atcs[drug1]])
            atcs_drug2 = set([atc[0] for atc in drugbank_to_atcs[drug2]])
            intersection = atcs_drug1 & atcs_drug2
            if len(intersection) == 0:
                selected_rows.append(index)

        df = df.ix[selected_rows]
        dc_data = df[df['combination'] == 1]
        ndc_data = df[df['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)
        print(
            'Num drug combinations after removing the ones with same ATC in training: {}'
            .format(num_dc))
        print(
            'Num non-drug combinations after removing the ones with same ATC in training: {}'
            .format(num_ndc))

    analysis_results = {
    }  # Defining the dictionary that will store the results

    if consider_se:
        dct_columns, dcg_columns, dcs_columns, dcatc_columns, dcse_columns = diana_analysis.obtain_method_to_columns(
            threshold_list, ATC_SE=consider_se)
    else:
        dct_columns, dcg_columns, dcs_columns = diana_analysis.obtain_method_to_columns(
            threshold_list, ATC_SE=consider_se)

    for num in num_targets:

        selected_rows = []

        for index, row in df.iterrows():

            (drug_id1, drug_id2) = index.split('---')
            drug1 = diana_id_to_drugbank[drug_id1].upper()
            drug2 = diana_id_to_drugbank[drug_id2].upper()

            if greater_or_smaller == 'greater':
                if len(drugbank_to_targets[drug1]) >= num and len(
                        drugbank_to_targets[drug2]) >= num:
                    selected_rows.append(index)
            elif greater_or_smaller == 'smaller':
                if len(drugbank_to_targets[drug1]) <= num and len(
                        drugbank_to_targets[drug2]) <= num:
                    selected_rows.append(index)
            else:
                print(
                    '\nERROR: Please, for the parameter greater_or_smaller, select \'greater\' or \'smaller\'\n'
                )
                sys.exit(10)

        df_tar = df.ix[selected_rows]

        if consider_se:
            if options.different_atc:
                list_methods = [['dctargets', dct_columns],
                                ['dcguild', dcg_columns],
                                ['dcstructure', dcs_columns],
                                ['dcse', dcse_columns], ['random', columns]]
            else:
                list_methods = [['dctargets', dct_columns],
                                ['dcguild', dcg_columns],
                                ['dcstructure', dcs_columns],
                                ['dcatc', dcatc_columns],
                                ['dcse', dcse_columns], ['random', columns]]
        else:
            list_methods = [['dctargets', dct_columns],
                            ['dcguild', dcg_columns],
                            ['dcstructure', dcs_columns], ['random', columns]]

        for method, columns_method in list_methods:

            print('Evaluating {} targets with method {}\n'.format(num, method))

            #------------------------------------------------------------------#
            #   SELECT RELEVANT FEATURES / REDUCE DIMENSIONALITY OF THE DATA   #
            #------------------------------------------------------------------#

            if options.pca:

                variance_cut_off = 0.01
                num_components = 0
                df_method = df_tar[columns_method]
                df_raw = df_method.drop('combination', axis=1)
                raw_columns = copy.copy(columns_method)
                raw_columns.remove('combination')
                pca = PCA(n_components=None)
                pca.fit(df_raw)
                values_trans = pca.transform(df_raw)
                explained_variance = pca.explained_variance_ratio_
                for column, var in sorted(zip(raw_columns, explained_variance),
                                          key=lambda x: x[1],
                                          reverse=True):
                    #print(column, var)
                    if var > variance_cut_off:
                        num_components += 1

                if num_components < len(raw_columns):

                    print('Number of features:\t{}\n'.format(len(raw_columns)))
                    print(
                        'Reduction to {} components\n'.format(num_components))

                    pca = PCA(n_components=num_components)
                    pca.fit(df_raw)
                    values_trans = pca.transform(df_raw)
                    indexes = df_method.index.values
                    df_trans = pd.DataFrame.from_records(values_trans,
                                                         index=indexes)
                    df_comb = df_method[['combination']]
                    df_new = pd.concat([df_trans, df_comb], axis=1)
                    df_method = df_new

            else:

                # Manually introduced features
                guild_thresholds = [1, 5]
                rank_scoring = ['spearman', 'dot_product']
                list_scoring = ['jaccard']
                if method == 'Combination' or method == 'random':
                    selected_columns = diana_analysis.obtain_columns_best_features(
                        guild_thresholds,
                        rank_scoring,
                        list_scoring,
                        ATC_SE=consider_se)
                else:
                    selected_columns = diana_analysis.obtain_columns_best_features_for_specific_method(
                        method, guild_thresholds, rank_scoring, list_scoring)

                # Remove ATC columns if different ATC
                if options.different_atc and consider_se:
                    selected_columns = [
                        col for col in selected_columns
                        if col not in dcatc_columns or col == 'combination'
                    ]

                print('Selected columns: {}\n'.format(
                    ', '.join(selected_columns)))
                print('Number of selected features: {}\n'.format(
                    len(selected_columns) -
                    1))  # We take away the combinations column

                # Define the new table with the selected columns
                df_method = df_tar[selected_columns]
                dc_data = df_method[df_method['combination'] == 1]
                ndc_data = df_method[df_method['combination'] == 0]
                num_dc = len(dc_data.index)
                num_ndc = len(ndc_data.index)

            #------------------------------------------------------------------#

            dc_data = df_method[df_method['combination'] == 1]
            ndc_data = df_method[df_method['combination'] == 0]
            num_dc = len(dc_data.index)
            num_ndc = len(ndc_data.index)

            print(
                'Building {} repetition groups of {} (same) DC and {} (different) non-DC'
                .format(repetitions, num_dc, num_dc))
            ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(
                ndc_data, repetitions, num_dc
            )  # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times

            mean_aucs = [
            ]  # Here we will store the means of AUCs from the cross-validations
            std_aucs = [
            ]  # Here we will store the standard deviations of the AUCs from the cross-validations
            all_aucs = []  # Here we will store ALL the AUCs
            all_probs = []  # Here we store all the probabilities and labels

            num_repetitions = 0
            for ndc_data_equal in ndc_repetitions:

                num_repetitions += 1
                num_items_group = int(
                    float(num_dc) / float(n_fold)
                )  # Calculate the number of items in each group of the cross-validation
                if num_repetitions == 1:
                    print(
                        'Building {} fold groups of {} DC and {} non-DC x {} repetitions'
                        .format(n_fold, num_items_group, num_items_group,
                                repetitions))

                dc_groups = diana_analysis.obtain_n_groups_of_k_length(
                    dc_data, n_fold, num_items_group, me_too_drug_combinations
                )  # Defining the drug combination groups in each cross-validation step
                ndc_groups = diana_analysis.obtain_n_groups_of_k_length(
                    ndc_data_equal, n_fold, num_items_group,
                    me_too_drug_combinations
                )  # Defining the non-drug combination groups in each cross-validation step
                merged_groups = [
                    pd.concat([x, y]) for x, y in zip(dc_groups, ndc_groups)
                ]

                if method == 'random':
                    #mean, var, std, list_auc = run_nfold_crossvalidation_random(n_fold, merged_groups, classifiers[classifier])
                    mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_dummy(
                        n_fold, merged_groups, classifiers[classifier])
                else:
                    mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob(
                        n_fold, merged_groups, classifiers[classifier])

                mean_aucs.append(mean)
                std_aucs.append(std)
                all_aucs = all_aucs + list_auc
                all_probs = all_probs + list_prob

            final_mean = np.mean(all_aucs)
            #final_mean = np.mean(mean_aucs)
            std = np.std(all_aucs)
            mean_std = np.mean(std_aucs)
            std_means = np.std(mean_aucs)
            print('FINAL MEAN: {}'.format(final_mean))
            print('STD: {}\n'.format(std))
            #print('MEAN of STD: {}'.format(mean_std))

            # Store the distribution of AUCs in the dictionary
            analysis_results.setdefault(num, {})
            analysis_results[num].setdefault(method, {})
            analysis_results[num][method]['all_aucs'] = all_aucs
            analysis_results[num][method]['all_probs'] = all_probs
            analysis_results[num][method]['mean'] = final_mean
            analysis_results[num][method]['std'] = std
            analysis_results[num][method]['num_dc'] = num_dc

    #------------------------------------#
    #   PLOT PRECISION VS. SENSITIVITY   #
    #------------------------------------#

    analysis_results = plot_precision_sensitivity(analysis_results,
                                                  'dctargets', num_targets,
                                                  acc_sens_dctargets)
    analysis_results = plot_precision_sensitivity(analysis_results, 'dcguild',
                                                  num_targets,
                                                  acc_sens_dcguild)
    analysis_results = plot_precision_sensitivity(analysis_results,
                                                  'dcstructure', num_targets,
                                                  acc_sens_dcstructure)
    if consider_se:
        if options.different_atc:
            analysis_results = plot_precision_sensitivity(
                analysis_results, 'dcse', num_targets, acc_sens_dcse)
        else:
            analysis_results = plot_precision_sensitivity(
                analysis_results, 'dcse', num_targets, acc_sens_dcse)
            analysis_results = plot_precision_sensitivity(
                analysis_results, 'dcatc', num_targets, acc_sens_dcatc)

    #----------------------------------------------------#
    #   PLOT DISTRIBUTION OF AUC PER NUMBER OF TARGETS   #
    #----------------------------------------------------#

    plot_auc_distributions(analysis_results,
                           num_targets,
                           types_analysis,
                           types_analysis_labels,
                           plot_auc_distribution,
                           fig_format=fig_format,
                           consider_se=consider_se,
                           different_atc=options.different_atc)
    #plot_violin(analysis_results, num_targets, types_analysis, types_analysis_labels, plot_auc_distribution, fig_format=fig_format, consider_se=consider_se)

    #--------------------------------------------------------#
    #   TABLE OF DISTRIBUTION OF AUC PER NUMBER OF TARGETS   #
    #--------------------------------------------------------#

    with open(results_table, 'w') as results_table_fd:

        # Header
        results_table_fd.write(' ')
        for method in types_analysis_labels:
            results_table_fd.write('\t{}\t \t '.format(method))
        results_table_fd.write('\n')

        for num in num_targets:
            results_table_fd.write('{}'.format(num))
            for method in types_analysis:
                mean = analysis_results[num][method]['mean']
                std = analysis_results[num][method]['std']
                num_dc = analysis_results[num][method]['num_dc']
                results_table_fd.write('\t{}\t{}\t{}'.format(
                    mean, std, num_dc))
            results_table_fd.write('\n')

    #----------------------------------------#
    #   TABLE OF PRECISION VS. SENSITIVITY   #
    #----------------------------------------#

    with open(prec_rec_table, 'w') as prec_rec_table_fd:

        # Header
        prec_rec_table_fd.write(' ')
        for method in types_analysis2:
            prec_rec_table_fd.write('\t{}\t '.format(method))
        prec_rec_table_fd.write('\n')

        for num in num_targets:
            prec_rec_table_fd.write('{}'.format(num))
            for method in types_analysis2:
                cut_off = analysis_results[num][method]['cut_off']
                value = analysis_results[num][method]['value']
                prec_rec_table_fd.write('\t{}\t{}'.format(cut_off, value))
            prec_rec_table_fd.write('\n')

    #-------------------------------------------------------------------#
    #   TABLE OF COMPARISON OF AUC DISTRIBUTIONS USING MANN WHITNEY U   #
    #-------------------------------------------------------------------#

    with open(mannwhitney_file, 'w') as mannwhitney_fd:

        mann_results = {}

        mannwhitney_fd.write(' \t ')
        for method in types_analysis_labels:
            mannwhitney_fd.write('\t{}'.format(method))
        mannwhitney_fd.write('\n')

        # Perform the comparisons
        for num in num_targets:
            mann_results.setdefault(num, {})
            for method1 in types_analysis:
                mann_results[num].setdefault(method1, {})
                for method2 in types_analysis:
                    if method1 == method2:
                        mann_results[num][method1][method2] = '-'
                    else:
                        method1_dist = analysis_results[num][method1][
                            'all_aucs']
                        method2_dist = analysis_results[num][method2][
                            'all_aucs']
                        stat, pval = scipy.stats.mannwhitneyu(
                            method1_dist, method2_dist)
                        mann_results[num][method1][method2] = [stat, pval]

        # Write the table of crossings
        for num in num_targets:
            for method1 in types_analysis:
                mannwhitney_fd.write('{}\t{}'.format(num, method1))
                for method2 in types_analysis:
                    if method1 == method2:
                        mannwhitney_fd.write('\t-')
                    else:
                        stat, pval = mann_results[num][method1][method2]
                        mannwhitney_fd.write('\t{}, {:.2e}'.format(stat, pval))
                mannwhitney_fd.write('\n')

    # End marker for time
    end = time.time()
    print(
        '\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'
        .format(end - start, (end - start) / 60))

    return
예제 #43
0
# Encode string class labels as integers.
le = LabelEncoder()

# Using only petal length and width as features.
X = dataset[:, 2:4]
Y = dataset[:, -1]

# Encode class labels as numbers.
Y = le.fit_transform(Y)

# Convert features to float as they are read as strings.
# BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin float is the documented replacement (same dtype: float64).
X = X.astype(float)

# Initialising the classifiers.
lda = LDA()
qda = QDA()

# Fit the data to the classifiers.
lda.fit(X, Y)
qda.fit(X, Y)

# Build a grid of points covering the range of the training data (padded by
# 0.5 on each side) for decision-boundary plotting.
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.03), np.arange(y_min, y_max, 0.01))

# Predict a class for every grid point with both models.
Z = lda.predict(np.c_[xx.ravel(), yy.ravel()])
Zq = qda.predict(np.c_[xx.ravel(), yy.ravel()])

# Reshape the QDA predictions back to the grid layout for contour plotting.
Zq = Zq.reshape(xx.shape)
예제 #44
0
    clf.score(X_test, y_test)))

#%% Test with QDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

# Split the data into a training set and randomized test set with labels.
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    classes,
                                                    test_size=0.2)

# Scale the data: fit the scaler on the training split only, then apply the
# same transform to the test split (avoids leaking test statistics).
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# BUG FIX: removed a stray `2` literal that was left here by a bad paste --
# it was a no-op expression statement.
clf = QDA()
clf.fit(X_train, y_train)
print('Accuracy of QDA classifier on training set: {:.2f}'.format(
    clf.score(X_train, y_train)))
print('Accuracy of QDA classifier on test set: {:.2f}'.format(
    clf.score(X_test, y_test)))

#%% Test with Naive Bayes
from sklearn.naive_bayes import GaussianNB as GNB
# Split the data into a fresh training set and randomized test set.
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    classes,
                                                    test_size=0.2)

# Scale the data for the Naive Bayes run as well.
sc = StandardScaler()
예제 #45
0
as_4 = 0.0
as_5 = 0.0

max_value = 300

for random_state in np.arange(1, max_value):
	# Confusion Matrix
	# First, split data in training and validation sets
	X_train, X_test, y_train, y_test = train_test_split(X, class_label, random_state=random_state)

	# Initialize Classificators
	clf_1 = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
	clf_2 = AdaBoostClassifier(n_estimators=100)
	clf_3 = linear_model.LinearRegression()
	clf_4 = LinearDiscriminantAnalysis()
	clf_5 = QuadraticDiscriminantAnalysis()

	# Test classificator with training and validation data
	y_pred_1 = clf_1.fit(X_train, y_train).predict(X_test)
	y_pred_2 = clf_2.fit(X_train, y_train).predict(X_test)
	y_pred_3 = clf_3.fit(X_train, y_train).predict(X_test)
	y_pred_4 = clf_4.fit(X_train, y_train).predict(X_test)
	y_pred_5 = clf_5.fit(X_train, y_train).predict(X_test)

	'''
	Z_1 = clf_1.decision_function(np.c_[xx.ravel(), yy.ravel()])
	Z_2 = clf_2.decision_function(np.c_[xx.ravel(), yy.ravel()])
	Z_3 = clf_3.decision_function(np.c_[xx.ravel(), yy.ravel()])
	Z_3 = Z_3.reshape(xx.shape)
	Z_4 = clf_4.decision_function(np.c_[xx.ravel(), yy.ravel()])
	Z_5 = clf_5.decision_function(np.c_[xx.ravel(), yy.ravel()])
def test_qda_regularization():
    """QDA needs reg_param > 0 when the data contains a constant feature or
    a class with fewer samples than features."""
    # Default reg_param=0.: the constant variable breaks the fit, so the
    # predictions cannot all be correct.
    unregularized = QuadraticDiscriminantAnalysis()
    with ignore_warnings():
        predictions = unregularized.fit(X2, y6).predict(X2)
    assert np.any(predictions != y6)

    # A small amount of regularization is enough to recover the labels.
    lightly_regularized = QuadraticDiscriminantAnalysis(reg_param=0.01)
    with ignore_warnings():
        lightly_regularized.fit(X2, y6)
    assert_array_equal(lightly_regularized.predict(X2), y6)

    # Case n_samples_in_a_class < n_features: stronger regularization works.
    strongly_regularized = QuadraticDiscriminantAnalysis(reg_param=0.1)
    with ignore_warnings():
        strongly_regularized.fit(X5, y5)
    assert_array_equal(strongly_regularized.predict(X5), y5)
예제 #47
0
class road_estimation:
    """Road-surface classification over per-superpixel image features.

    Wraps several scikit-learn classifiers behind a common train/test
    interface; the model is chosen with a short string key ("svm", "nb",
    "knn", "ada", "rf", "qda").
    """

    def __init__(self, model_selection):
        # data_load() is expected to return the six train/valid/test arrays.
        self._train_data, self._train_targets, self._valid_data, self._valid_targets, self._test_data, self._test_targets = (
            data_load()
        )

        self._model_selection = model_selection
        # Placeholder until train() builds the real estimator.
        self._classifier = []

    def train(self):
        """Build the classifier named by self._model_selection, fit it on the
        training split, and print a classification report on the validation
        split."""
        if self._model_selection == "svm":
            # selected the svc in svm
            self._classifier = svm.SVC()
        elif self._model_selection == "nb":
            self._classifier = GaussianNB()
        elif self._model_selection == "knn":
            # parameter n_jobs can be set to -1 to enable parallel calculating
            self._classifier = KNeighborsClassifier(n_neighbors=7)
        elif self._model_selection == "ada":
            # Bunch of parameters, n_estimators, learning_rate
            self._classifier = AdaBoostClassifier()
        elif self._model_selection == "rf":
            # many parameters including n_jobs
            self._classifier = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
        elif self._model_selection == "qda":
            # complicated array like parameters, perhaps leave it default
            self._classifier = QuadraticDiscriminantAnalysis()
        else:
            # BUG FIX: was a Python 2 print statement (syntax error on Py3).
            print("Please refer to one classifier")

        self._classifier.fit(self._train_data, self._train_targets)
        # predict on valid data
        prediction_valid = self._classifier.predict(self._valid_data)
        # print validation result for selected model.
        print(
            "Classification report for classifier %s on valid_data:\n%s\n"
            % (self._model_selection, metrics.classification_report(self._valid_targets, prediction_valid))
        )

    def test(self):
        """Print a classification report for the held-out test split."""
        # BUG FIX: was self.test_data -- no such attribute; __init__ stores
        # the array as self._test_data.
        prediction_test = self._classifier.predict(self._test_data)
        # print test result for selected model.
        print(
            "Classification report for classifier %s on test_data:\n%s\n"
            % (self._model_selection, metrics.classification_report(self._test_targets, prediction_test))
        )

    def showPredictionImage(self):
        """Predict per-superpixel labels for one sample image and display the
        painted result, then the superpixel boundaries."""
        f = Feature()
        f.loadImage("um_000000.png")
        f.extractFeatures()
        fea_matrix = f.getFeaturesVectors()

        predict = self._classifier.predict(fea_matrix)
        image = np.copy(f.image)

        num_superpixels = np.max(f.superpixel) + 1
        # BUG FIX: xrange is Python 2 only; range is the Py3 equivalent.
        for i in range(0, num_superpixels):
            indices = np.where(f.superpixel == i)
            if predict[i] == 1:
                # Paint superpixels predicted as road (class 1) yellow.
                image[indices[0], indices[1], 0] = 1
                image[indices[0], indices[1], 1] = 1
                image[indices[0], indices[1], 2] = 0
        plt.imshow(image)
        plt.show()
        # show prediction image with superpixels
        # BUG FIX: `superpixels` was an undefined name; the segmentation map
        # lives on the Feature object.
        plt.imshow(mark_boundaries(image, f.superpixel))
        plt.show()

    def fit(self, _X_train, _y_train, _X_test, _y_test):
        """Extract text features from the 'Statement' column, encode the
        six-level truthfulness labels as 0..5, train a suite of classifiers,
        and keep the best scorer in self.clf.

        NOTE(review): this method calls self.extract(), which is not defined
        on this class -- presumably supplied elsewhere; confirm.
        """
        if 'Statement' in _X_train.columns and 'Statement' in _X_test.columns:
            X_train = _X_train.copy()
            X_test = _X_test.copy()
        elif 'Statement' in _X_train.columns:
            print('Statement column not found in the testing data.')
            return
        elif 'Statement' in _X_test.columns:
            print('Statement column not found in the training data.')
            return
        else:
            # BUG FIX: without this branch, X_train below was unbound
            # (NameError) when neither frame has a 'Statement' column.
            print('Statement column not found in the training and testing data.')
            return

        X_train = self.extract(X_train)
        X_test = self.extract(X_test)

        # Map the ordinal truthfulness labels onto 0..5.
        y_train = _y_train.replace('pants-fire', 0).replace(
            'false', 1).replace('barely-true',
                                2).replace('half-true',
                                           3).replace('mostly-true',
                                                      4).replace('true', 5)
        y_test = _y_test.replace('pants-fire', 0).replace('false', 1).replace(
            'barely-true', 2).replace('half-true',
                                      3).replace('mostly-true',
                                                 4).replace('true', 5)

        names = [
            "Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
            "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes", "QDA"
        ]

        classifiers = [
            KNeighborsClassifier(2),
            SVC(kernel="linear", C=0.025, probability=True, random_state=0),
            SVC(gamma=2, C=1, probability=True, random_state=0),
            DecisionTreeClassifier(max_depth=5, random_state=0),
            RandomForestClassifier(max_depth=5,
                                   n_estimators=10,
                                   max_features=1,
                                   random_state=0),
            MLPClassifier(alpha=1, max_iter=1000, random_state=0),
            AdaBoostClassifier(random_state=0),
            GaussianNB(),
            QuadraticDiscriminantAnalysis()
        ]

        # Fit every classifier, report its test accuracy and training time,
        # and remember the best one.
        max_score = 0.0
        max_class = ''
        for name, clf in zip(names, classifiers):
            start_time = time()
            clf.fit(X_train, y_train)
            score = 100.0 * clf.score(X_test, y_test)
            print(
                'Classifier = %s, Score (test, accuracy) = %.2f,' %
                (name, score),
                'Training time = %.2f seconds' % (time() - start_time))

            if score > max_score:
                self.clf = clf
                max_score = score
                max_class = name

        print(80 * '-')
        print('Best --> Classifier = %s, Score (test, accuracy) = %.2f' %
              (max_class, max_score))
예제 #49
0
File: LDA.py  Project: flyxu/scikit-learn
    splot.add_artist(ell)
    splot.set_xticks(())
    splot.set_yticks(())


def plot_lda_cov(lda, splot):
    """Draw the shared LDA covariance ellipse for the two class means."""
    # LDA stores a single pooled covariance, reused for both classes.
    for idx, color in ((0, 'red'), (1, 'blue')):
        plot_ellipse(splot, lda.means_[idx], lda.covariance_, color)


def plot_qda_cov(qda, splot):
    """Draw the per-class QDA covariance ellipses for the two class means.

    Compatible with both attribute spellings: scikit-learn renamed the fitted
    attribute `covariances_` to `covariance_` (the plural form was removed in
    0.21).
    """
    covariances = getattr(qda, 'covariance_', None)
    if covariances is None:
        covariances = qda.covariances_
    plot_ellipse(splot, qda.means_[0], covariances[0], 'red')
    plot_ellipse(splot, qda.means_[1], covariances[1], 'blue')

###############################################################################
# Fit LDA and QDA on the two synthetic datasets (shared vs. differing class
# covariances) and plot each model's predictions and covariance ellipses.
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    # BUG FIX: the keyword was renamed store_covariance (singular) in
    # scikit-learn 0.19; store_covariances was removed in 0.21.
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis')
plt.show()
예제 #50
0
def main():
    """Run the full analysis pipeline on the CSV named in sys.argv[1].

    Steps: summarise the data, decide MCAR vs NMAR with a chi-square test on
    class means (impute or drop missing values accordingly), prune highly
    correlated features, chi^2-select predictors, benchmark a suite of
    classifiers with 10-fold CV, combine them in a voting ensemble, and plot
    the ensemble's validation misclassifications.

    Relies on the module-level colNames/colNums column descriptors and the
    generateSummary() helper.
    """
    df = pandas.read_csv(sys.argv[1])
    generateSummary(df)
    df.columns = colNames
    # Class means computed while the data still contains NaNs.
    means_with_nan = df.groupby(
        colNames[colNums[4]]).apply(lambda x: x.mean())
    # BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0 -- pandas.concat is the documented replacement (3 occurrences).
    means_with_nan = pandas.concat([means_with_nan, pandas.DataFrame(
        means_with_nan.mean(numeric_only=True)).T])
    means_with_nan.rename(index={0: 'mean'}, inplace=True)
    print("\nMeans with NaN:")
    print(means_with_nan)
    means_with_nan = means_with_nan.values.flatten()
    # Class means after dropping every row that contains a NaN.
    means_without_nan = df.dropna().groupby(
        colNames[colNums[4]]).apply(lambda x: x.mean())
    means_without_nan = pandas.concat([means_without_nan, pandas.DataFrame(
        means_without_nan.mean(numeric_only=True)).T])
    means_without_nan.rename(index={0: 'mean'}, inplace=True)
    print("\nMeans without NaN:")
    print(means_without_nan)
    means_without_nan = means_without_nan.values.flatten()
    # MCAR decision: compare the two mean vectors with a chi-square test.
    cs = chisquare(means_with_nan, means_without_nan)
    print("\nChi-square of (means_with_nan,means_without_nan) p-value:", cs.pvalue)
    if(cs.pvalue > 0.05):
        # CMAR - replace NaN by the per-class mean of each control variable.
        print("P-value was not significant, data is MCAR. Imputing missing values")
        df[colNames[colNums[0]]] = df.groupby(colNames[colNums[4]])[
            colNames[colNums[0]]].apply(lambda x: x.fillna(x.mean()))
        df[colNames[colNums[1]]] = df.groupby(colNames[colNums[4]])[
            colNames[colNums[1]]].apply(lambda x: x.fillna(x.mean()))
        df[colNames[colNums[2]]] = df.groupby(colNames[colNums[4]])[
            colNames[colNums[2]]].apply(lambda x: x.fillna(x.mean()))
        df[colNames[colNums[3]]] = df.groupby(colNames[colNums[4]])[
            colNames[colNums[3]]].apply(lambda x: x.fillna(x.mean()))
        print("\nMeans after imputation:")
        means_after_impute = df.groupby(
            colNames[colNums[4]]).apply(lambda x: x.mean())
        means_after_impute = pandas.concat([means_after_impute,
            pandas.DataFrame(means_after_impute.mean(numeric_only=True)).T])
        means_after_impute.rename(index={0: 'mean'}, inplace=True)
        print(means_after_impute)
    else:
        # NMAR - drop rows with NaN
        print("P-value was significant, data is NMAR. Dropping rows with missing values")
        df.dropna(inplace=True)
    # Drop highly correlated control variables.
    corr_matrix = df.corr().abs()
    # BUG FIX: numpy.bool was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin bool is the documented replacement.
    upper_triangle = corr_matrix.where(numpy.triu(
        numpy.ones(corr_matrix.shape), k=1).astype(bool))
    print("\nCalculating correlation between control variables:")
    print(upper_triangle)
    to_drop = [column for column in upper_triangle.columns if any(
        upper_triangle[column] > 0.95)]
    print("\nDropping variable with high multicollinearity: ", to_drop)
    df = df.drop(to_drop, axis=1)
    # BUG FIX: the original iterated colNames while calling colNames.remove()
    # on it, which silently skips elements; iterate a copy instead.
    for x in list(colNames):
        if x in to_drop:
            colNames.remove(x)
            # NOTE(review): this removes the *value* len(colNums)-1 from
            # colNums, not the element at the last position -- looks
            # suspicious, confirm intent.
            colNums.remove(len(colNums)-1)
    # Create A/B split of data.
    array = df.values
    X = array[:, 0:3]
    Y = array[:, 3]
    validation_size = 0.20
    seed = 1
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed)
    # Select the two best predictor variables by chi-square score.
    chi2_selector = SelectKBest(score_func=chi2, k=2)
    chi2_selector.fit(X_train, Y_train)
    chi2_selector.fit_transform(X_train, Y_train)
    print("\nFeature Importance (Chi^2): " + str(chi2_selector.scores_))
    feature_names = df.iloc[:, chi2_selector.get_support()].columns.values
    print("Selecting features: " + str(feature_names))
    X = chi2_selector.transform(X)
    X_train = chi2_selector.transform(X_train)
    X_validation = chi2_selector.transform(X_validation)
    # Instantiate the model list.
    scoring = 'accuracy'
    models = []
    models.append(('LR', LogisticRegression(
        solver='liblinear', multi_class='ovr')))
    models.append(('LDA', LinearDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier()))
    models.append(('CART', DecisionTreeClassifier()))
    models.append(('NB', GaussianNB()))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('MLP', MLPClassifier(solver='lbfgs')))
    models.append(('RFC', RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1)))
    models.append(('GPC', GaussianProcessClassifier(1.0 * RBF(1.0))))
    models.append(('ABC', AdaBoostClassifier()))
    models.append(('QDA', QuadraticDiscriminantAnalysis()))
    models.append(('SDG', SGDClassifier(max_iter=1000, tol=0.05)))
    models.append(('GBC', GradientBoostingClassifier()))
    models.append(('NSVC', NuSVC(probability=True, gamma='auto')))
    # Evaluate each model in turn with 10-fold cross-validation.
    results = []
    names = []
    print("\nEvaluating classifiers:")
    print("name,accuracy,std_dev")
    classifier_performance_dict = {}
    for name, model in models:
        # BUG FIX: KFold only accepts random_state together with
        # shuffle=True (scikit-learn >= 0.24 raises ValueError otherwise).
        kfold = model_selection.KFold(n_splits=10, shuffle=True,
                                      random_state=seed)
        cv_results = model_selection.cross_val_score(
            model, X_train, Y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s,%f,%f" % (name, cv_results.mean(), cv_results.std())
        classifier_performance_dict[name] = cv_results.mean()
        print(msg)
    # Combine classifiers using a vote.
    classifier = VotingClassifier(models)
    classifier.fit(X_train, Y_train)
    # Note: deliberately reuses the kfold left over from the final loop turn.
    results = model_selection.cross_val_score(
        classifier, X_train, Y_train, cv=kfold)
    # Execute some tests on the held-out validation split.
    predictions = classifier.predict(X_validation)
    joined_testdata = numpy.concatenate(
        (X_validation, numpy.reshape(Y_validation, (-1, 1))), axis=1)
    joined_testdata_w_predictions = numpy.concatenate(
        (joined_testdata, numpy.reshape(predictions, (-1, 1))), axis=1)
    print("\nEnselble (vote) classifier validation test results:")
    print(feature_names[0]+","+feature_names[1]+",real-class,predicted-class")
    for row in joined_testdata_w_predictions:
        if row[2] != row[3]:
            print(row, " <== Misclassified")
        else:
            print(row)
    print("Accuracy: " + str(accuracy_score(Y_validation, predictions)))
    print(classification_report(Y_validation, predictions))
    # Visualise results: colour by true class, mark misclassifications.
    colors = {'virginica': 'red',
              'setosa': 'blue', 'versicolor': 'green'}
    plt.scatter(df[feature_names[0]], df[feature_names[1]],
                c=df[colNames[colNums[3]]].apply(lambda x: colors[x]))
    for row in joined_testdata_w_predictions:
        if row[2] != row[3]:
            plt.scatter(row[0], row[1], c='black', marker='x')
    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[1])
    plt.title("Iris dataset")
    plt.show()

# Fit five classifiers on the same train/test split and record each one's
# test accuracy in its own module-level variable.  NOTE(review): y_pred is
# overwritten by each block; after this section it holds the LDA predictions.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)
y_pred_train = logreg.predict(X_train)
log_acc =  accuracy_score(y_pred, y_test) #0.64 highest

clf = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf_acc =  accuracy_score(y_pred, y_test) #0.61 

neigh = KNeighborsClassifier(n_neighbors=13).fit(X_train, y_train)
y_pred = neigh.predict(X_test)
nn_acc =  accuracy_score(y_pred, y_test) #0.61

quad = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
y_pred = quad.predict(X_test)
quad_acc = accuracy_score(y_pred, y_test) # 0.19 very low

ldaC = LDA(solver='lsqr', shrinkage='auto').fit(X_train, y_train) #LDA with shrinkage
y_pred = ldaC.predict(X_test)
lda_acc = accuracy_score(y_pred, y_test) #0.58

#########################################
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
import matplotlib.pyplot as plt

def calc_params(X, y, clf, param_values, param_name, K, metric = 'accuracy'):
    '''This function takes the classfier, the training data and labels, the name of the
    parameter to vary, a list of values to vary by, and a number of folds needed for 
    '_delta_sum.npy', '_abspwr.npy', '_abspwr_sum.npy', '_cog.npy', '_spec.npy'
]
""" -----------0 ------------------1 ------------2--------------3-----------4---------5---------------6--------------7-----------8--------------9-----------------10------------11-------------12---------------13"""
# p indexes the module-level `feature` list of filename suffixes (e.g.
# '_abspwr.npy') and selects which feature set to load below.
p = 1
knn = 5
#clf = KNeighborsClassifier(n_neighbors=knn)

#clf = KNeighborsClassifier(n_neighbors=3)
#
#clf = KNeighborsClassifier(n_neighbors=7)
#
#clf = svm.SVC(kernel = 'rbf', C =3.0, decision_function_shape='ovr')
#
#clf = naive_bayes.GaussianNB()
#
# Active classifier choice (the commented alternatives above are kept for
# quick swapping between experiments).
clf = QuadraticDiscriminantAnalysis()
#
#clf = svm.SVC(kernel = 'poly', C =1.0, degree = 5, decision_function_shape='ovr')

# Load per-recording feature arrays into the h/c/a dicts, keyed "happy{i}",
# "calm{j}", "angry{k}".  NOTE(review): x, y, z and s1_happy/s1_calm/s1_angry
# are defined elsewhere -- presumably counts and path prefixes; confirm.
for i in range(0, x):
    h["happy{0}".format(i)] = np.load(s1_happy[i] + feature[p])
# Get Calm (the original comment here said "Sad")
for j in range(0, y):
    c["calm{0}".format(j)] = np.load(s1_calm[j] + feature[p])

for k in range(0, z):
    a["angry{0}".format(k)] = np.load(s1_angry[k] + feature[p])
"""-----------------Stack the Data ---------------------------------------------"""

# Training matrix starts from the first two "angry" recordings.
train = np.vstack((a['angry0'], a['angry1']))
def train_QDA(X_train, Y_train, X_test, reg_param):
    """Fit a regularized QDA model on the training split and return its
    predictions for X_test."""
    model = QuadraticDiscriminantAnalysis(reg_param=reg_param)
    return model.fit(X_train, Y_train).predict(X_test)
예제 #54
0
# Train LDA on the k-best-selected features and report its accuracy.
# NOTE(review): kbest_train_numpy / kbest_test_numpy are built elsewhere --
# presumably SelectKBest-transformed train/test matrices; confirm.
lda = LinearDiscriminantAnalysis()
lda.fit(kbest_train_numpy, y_train)
#lda.fit(x_train, y_train)
#myprediction1 =  lda.predict(x_test)
predict_1 = lda.predict(kbest_test_numpy)

# Accuracy of the LDA model on the test split.
print("Linear Discriminant Analysis Accuracy...")
# score = lda.score(x_test, y_test)
score = accuracy_score(y_test, predict_1)

print(score * 100, "%")
##################################################################
# Quadratic Discriminant Analysis on the same feature matrices.
print("Quadratic Discriminant Analysis")
qda = QuadraticDiscriminantAnalysis()
qda.fit(kbest_train_numpy, y_train)
# qda.fit(x_train, y_train)
#predict_2 = qda.predict(x_test)
predict_2 = qda.predict(kbest_test_numpy)
# print myprediction2

# Accuracy of the QDA model on the test split (reuses `score`).
print("Quadratic Discriminant Analysis Accuracy...")
# score = qda.score(x_test, y_test)
score = accuracy_score(y_test, predict_2)
print(score * 100, "%")

##################################################################
# Logistic Regression section begins here.
print("Logistic Regression")
예제 #55
0
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Define feature matrix X and target vector y ('state' is the label column).
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# SMOTEENN: oversample the minority class, then clean with edited nearest
# neighbours.  NOTE(review): fit_sample was renamed fit_resample in
# imbalanced-learn 0.4 -- confirm the installed version.
sme = SMOTEENN(random_state=42)
os_X,os_y = sme.fit_sample(X_train,y_train)

# QDA.  BUG FIX: the keyword is store_covariance (singular) since
# scikit-learn 0.19; store_covariances was removed in 0.21.
clf_QDA = QuadraticDiscriminantAnalysis(store_covariance=True)
clf_QDA.fit(os_X, os_y)
y_true, y_pred = y_test, clf_QDA.predict(X_test)

# F1 score, precision, recall, specificity, G score.
# BUG FIX: Python 2 print statements converted to the print() function so the
# block is valid Python 3 (the rest of the file already uses print()).
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Compute confusion matrix; specificity = TN / (TN + FP) from its first row.
cnf_matrix = confusion_matrix(y_test,y_pred)
np.set_printoptions(precision=2)
print("Specifity: " , float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1]))
specifity = float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1])
# BUG FIX: the G-mean is sqrt(recall * specificity); the original divided.
print("G score: " , math.sqrt(recall * specifity))
# Drop the already-one-hot-encoded source columns from the training frame.
newdf.drop('protocol_type', axis=1, inplace=True)
newdf.drop('service', axis=1, inplace=True)

# Assemble the test frame the same way and drop its categorical sources.
newdf_test = df_test.join(testdf_cat_data)
for source_column in ('flag', 'protocol_type', 'service'):
    newdf_test.drop(source_column, axis=1, inplace=True)

print(newdf_test['label'].value_counts())

# Numeric feature matrices and label vectors for both splits.
features = newdf[final_columns].astype(float)
features1 = newdf_test[final_columns].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Train the classifier and time the fit.
clf = QuadraticDiscriminantAnalysis()
t0 = time()
clf.fit(features, lab)
tt = time() - t0
print("Classifier trained in {} seconds.".format(round(tt, 3)))

# Predict on the test split and time it as well.
t0 = time()
pred = clf.predict(features1)
tt = time() - t0
print("Predicted in {} seconds".format(round(tt, 3)))

from sklearn.metrics import accuracy_score

# Report accuracy and the predicted-vs-actual cross-tabulation.
acc = accuracy_score(pred, lab1)
print("Accuracy is {}.".format(round(acc, 4)))
print(pd.crosstab(lab1, pred, rownames=['Actual attacks'], colnames=['Predicted attacks']))

#Classifier trained in 4.315 seconds.
#Predicted in 0.349 seconds
예제 #57
0
    labels = []
    
    for i in range(0,9):
        labels.append(1)
    for i in range(9,18):
        labels.append(2)
    for i in range(18, 27):
        labels.append(3)
    '''
    # Creation of random labels
    for i in range(0,27):
        labels.append(int(random.random() * 3) + 1)
    print (labels)
    '''
    # QDA model
    qda = QuadraticDiscriminantAnalysis()
    qda.fit(comps, labels)

    # MCC Calculation
    y_pred = qda.predict(comps)
    #print(labels)
    #print(y_pred)
    mcc = multimcc(labels,y_pred)
    print("MCC="+str(mcc))

    '''
    # Plotting QDA contour
    nx, ny = 200, 100
    x_min, x_max = np.amin(comps[:,0]), np.amax(comps[:,0])
    y_min, y_max = np.amin(comps[:,1]), np.amax(comps[:,1])
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),np.linspace(y_min, y_max, ny))
예제 #58
0
def QuadDA(X_train, y_train, X_test, y_test):
    """Train a QDA classifier on the training split and return its mean
    accuracy on the test split."""
    # fit() returns the estimator, so the whole evaluation chains.
    return QDA().fit(X_train, y_train).score(X_test, y_test)
예제 #59
0
def vis(short_patient, thresh_file=None):
    """
    Visualize patient classification results for different classifiers

    Computes (or loads from a cached file) one precision/recall point per
    substring-matching threshold, then overlays precision-recall curves
    from several scikit-learn classifiers trained on AU data — one saved
    figure per emotion.

    :param short_patient: Patient to visualize
    :type short_patient: str
    :param thresh_file: Optional, txt file containing thresh data
    :type thresh_file: str
    :returns None, saves output in the format short_patient + '_{0}_pr_with_ML_and_pose'.format(emotion)
    """

    # Default to the patient's cached threshold file; load it when present.
    if not thresh_file:
        thresh_file = short_patient + '_threshes.txt'
    thresh_dict = json.load(
        open(thresh_file)) if os.path.exists(thresh_file) else {}

    # No cache: sweep 100 thresholds in a process pool, draining each
    # worker's per-threshold counts from the shared queue as results
    # arrive, then persist the dict so later runs skip this step.
    if not thresh_dict:
        out_q = multiprocessing.Manager().Queue()
        threshes = np.linspace(0, 1.5, 100)
        bar = ProgressBar(max_value=len(threshes))
        f = functools.partial(thresh_calc, out_q, short_patient)

        for i, _ in enumerate(Pool().imap(f, threshes, chunksize=10)):
            while not out_q.empty():
                thresh_dict.update(out_q.get())
            bar.update(i)
        json.dump(thresh_dict, open(thresh_file, 'w'))

    for emotion in ['Happy', 'Angry', 'Sad', 'Disgust']:
        # precision-recall
        out_vals = {}

        # One [precision, recall] pair per threshold with data for this
        # emotion; thresholds whose denominators would be zero are skipped.
        # NOTE(review): keys loaded from JSON are strings, so sorted()
        # orders them lexicographically, not numerically — confirm key
        # type against what thresh_calc emits.
        for thresh in sorted(thresh_dict.keys()):
            if emotion in thresh_dict[thresh]:
                curr_emote_dict = thresh_dict[thresh][emotion]
                false_pos = curr_emote_dict['false_pos']
                true_pos = curr_emote_dict['true_pos']
                false_neg = curr_emote_dict['false_neg']
                total_pos = true_pos + false_neg

                if total_pos and (false_pos + true_pos):
                    precision = true_pos / (false_pos + true_pos)
                    recall = true_pos / total_pos
                    out_vals[thresh] = [precision, recall]
        x_vals = [out_vals[thresh][0] for thresh in sorted(out_vals.keys())]
        y_vals = [out_vals[thresh][1] for thresh in sorted(out_vals.keys())]
        # NOTE(review): z_vals is computed but never used below.
        z_vals = [float(x) for x in sorted(out_vals.keys())]

        if x_vals and y_vals and len(x_vals) == len(y_vals):
            fig = plt.figure()
            ax = fig.gca()
            ax.plot(x_vals, y_vals, label='Substring')

            # NOTE(review): chdir happens inside the emotion loop and is
            # never undone; after the first iteration every relative path
            # (including plt.savefig below) resolves under OpenDir.
            OpenDir = sys.argv[sys.argv.index('-d') + 1]
            os.chdir(OpenDir)
            au_train, au_test, target_train, target_test = make_emotion_data(
                emotion, short_patient)

            # Maps classifier instance -> legend label for the overlay.
            classifier_dict = {
                KNeighborsClassifier():
                'KNeighbors',
                SVC(kernel='linear', probability=True):
                'SVCLinear',
                SVC(probability=True):
                'SVC',
                # GaussianProcessClassifier(),
                # DecisionTreeClassifier(),
                RandomForestClassifier():
                'RandomForest',
                ExtraTreesClassifier():
                'ExtraTrees',
                MLPClassifier():
                'MLP',
                AdaBoostClassifier():
                'AdaBoost',
                GaussianNB():
                'GaussianNB',
                QuadraticDiscriminantAnalysis():
                'QuadraticDiscriminantAnalysis',
                BernoulliNB():
                'BernoulliNB'
            }

            # Fit each classifier and add its precision-recall curve.
            for classifier in classifier_dict.keys():
                expected, decision_function = use_classifier(
                    classifier, au_train, au_test, target_train, target_test)
                precision, recall, thresholds = precision_recall_curve(
                    expected, decision_function)
                ax.plot(precision, recall, label=classifier_dict[classifier])

            ax.set_title('Performance of Different Methods for' + "\' " +
                         emotion + " \'" + 'Recognition from Continuous AUs')
            ax.set_xlabel('Precision')
            ax.set_ylabel('Recall')
            ax.legend()
            fig.tight_layout()
            plt.savefig(short_patient +
                        '_{0}_pr_with_ML_and_pose'.format(emotion))
            plt.close()
예제 #60
0
def MyQDA():
    """Factory: build and return a default-configured QDA classifier."""
    classifier = QuadraticDiscriminantAnalysis()
    return classifier