Example #1
    def test_all_methods(self):
        x_cols = ["Lag2"]
        formula = "Direction~Lag2"
        # print self.df.shape[0]
        train_data = self.df.loc[(self.df["Year"] >= 1990) & (self.df["Year"] <= 2008), :]
        # print train_data.shape[0]
        """ (d) logistic"""
        model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
        result = model.fit()
        test_data = self.df.loc[self.df["Year"] > 2008, :]
        probs = Series(result.predict(sm.add_constant(test_data[["Lag2"]])))
        pred_values = probs.map(lambda x: "Down" if x > 0.5 else "Up")
        tp.output_table(pred_values.values, test_data[self.y_col].values)

        train_X = train_data[x_cols].values
        train_y = train_data[self.y_col].values
        test_X = test_data[x_cols].values
        test_y = test_data[self.y_col].values
        """ (e) LDA """
        lda_res = LDA().fit(train_X, train_y)
        pred_y = lda_res.predict(test_X)
        tp.output_table(pred_y, test_y)
        """ (f) QDA """
        qda_res = QDA().fit(train_X, train_y)
        pred_y = qda_res.predict(test_X)
        tp.output_table(pred_y, test_y)
        """ (g) KNN """
        clf = neighbors.KNeighborsClassifier(1, weights="uniform")
        clf.fit(train_X, train_y)
        pred_y = clf.predict(test_X)
        tp.output_table(pred_y, test_y)
        """ (h) logistic and LDA """
        """ (i) Is the purpose of the last question going through all methods with no direction?"""
Example #3
import numpy as np
from sklearn.qda import QDA  # sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis in newer versions


class RegularizedQDA:
  """
    Three types of regularization are possible:
    - regularize the covariance of a class toward the
      average variance within that class
    - regularize the covariance of a class toward the
      pooled covariance across all classes
    - add some constant amount of variance to each feature
  """
  def __init__(self, avg_weight = 0.1, pooled_weight = 0, extra_variance = 0):
    self.avg_weight = avg_weight
    self.pooled_weight = pooled_weight
    self.extra_variance = extra_variance 
    self.model = QDA()
    
  def fit(self, X, Y):
    self.model.fit(X,Y)
    I = np.eye(X.shape[1])
    a = self.avg_weight
    p = self.pooled_weight
    ev = self.extra_variance 
    original_weight = 1.0 - a - p
    scaled_pooled_cov = p * np.cov(X.T)
    assert scaled_pooled_cov.shape == I.shape
    # note: newer scikit-learn fits expose this attribute as `rotations_`
    assert all([C.shape == I.shape for C in self.model.rotations])
    self.model.rotations = \
      [original_weight * C + \
       a * np.mean(np.diag(C)) * I + \
       scaled_pooled_cov + ev * I \
       for C in self.model.rotations]
      
  def predict(self, X):
    return self.model.predict(X)
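For the third kind of regularization (a constant ridge toward the identity), plain QDA already exposes a comparable knob: reg_param shrinks each class covariance as (1 - reg_param) * Sigma_k + reg_param * I. A minimal sketch on synthetic data:

import numpy as np
from sklearn.qda import QDA  # QuadraticDiscriminantAnalysis in newer sklearn

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(50, 3), rng.randn(50, 3) + 2.0])
Y = np.array([0] * 50 + [1] * 50)

clf = QDA(reg_param=0.1)  # ridge amount toward the identity
clf.fit(X, Y)
print(clf.predict(X[:5]))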
Example #4
def QDA_onNonDynamicData():
    #Parsing Full training dataset
    XFull = common.parseFile('../UCI HAR Dataset/train/X_train.txt')
    YFull = common.parseFile('../UCI HAR Dataset/train/y_train.txt')

    #Parsing Full testing dataset
    XFullTest = common.parseFile('../UCI HAR Dataset/test/X_test.txt')
    YFullTest = common.parseFile('../UCI HAR Dataset/test/y_test.txt')

    #Getting the dataset associated with Non-Dynamic Activities on training
    X_NonDynamic, Y_NonDynamic = common.getDataSubset(XFull, YFull.flatten(),
                                                      [4, 5, 6])
    #Getting the dataset associated with Non-Dynamic Activities on testing
    X_NonDynamicTest, Y_NonDynamicTest = common.getDataSubset(
        XFullTest, YFullTest.flatten(), [4, 5, 6])

    #Fitting data using QDA classifier

    clf = QDA()
    clf.fit(X_NonDynamic, Y_NonDynamic.flatten())

    precision, recall, fscore = common.checkAccuracy(
        clf.predict(X_NonDynamicTest), Y_NonDynamicTest, [4, 5, 6])
    common.createConfusionMatrix(
        clf.predict(X_NonDynamicTest).flatten(), Y_NonDynamicTest.flatten(),
        [4, 5, 6])
    print fscore

    #Getting the dataset associated with Dynamic Activities on training
    X_Dynamic, Y_Dynamic = common.getDataSubset(XFull, YFull.flatten(),
                                                [1, 2, 3])
    #Getting the dataset associated with Dynamic Activities on testing
    X_DynamicTest, Y_DynamicTest = common.getDataSubset(
        XFullTest, YFullTest.flatten(), [1, 2, 3])
    print len(X_DynamicTest), len(Y_DynamicTest)

    #Fitting data using QDA classifier
    clf = QDA()
    clf.fit(X_Dynamic, Y_Dynamic.flatten())

    precision, recall, fscore = common.checkAccuracy(
        clf.predict(X_DynamicTest), Y_DynamicTest, [1, 2, 3])
    common.createConfusionMatrix(
        clf.predict(X_DynamicTest).flatten(), Y_DynamicTest.flatten(),
        [1, 2, 3])

    print fscore
Example #5
File: snp.py Project: fagan2888/PyStuff
class SNPForecastingStrategy(Strategy):
    """    
    Requires:
    symbol - A stock symbol on which to form a strategy on.
    bars - A DataFrame of bars for the above symbol."""

    def __init__(self, symbol, bars):
        self.symbol = symbol
        self.bars = bars
        self.create_periods()
        self.fit_model()

    def create_periods(self):
        """Create training/test periods."""
        self.start_train = datetime.datetime(2001,1,10)
        self.start_test = datetime.datetime(2005,1,1)
        self.end_period = datetime.datetime(2005,12,31)

    def fit_model(self):
        """Fits a Quadratic Discriminant Analyser to the
        US stock market index (^GPSC in Yahoo)."""
        # Create a lagged series of the S&P500 US stock market index
        snpret = create_lagged_series(self.symbol, self.start_train, 
                                      self.end_period, lags=5) 

        # Use the prior two days of returns as 
        # predictor values, with direction as the response
        X = snpret[["Lag1","Lag2"]]
        y = snpret["Direction"]

        # Create training and test sets
        X_train = X[X.index < self.start_test]
        y_train = y[y.index < self.start_test]

        # Create the predicting factors for use 
        # in direction forecasting
        self.predictors = X[X.index >= self.start_test]

        # Create the Quadratic Discriminant Analysis model
        # and the forecasting strategy
        self.model = QDA()
        self.model.fit(X_train, y_train)

    def generate_signals(self):
        """Returns the DataFrame of symbols containing the signals
        to go long, short or hold (1, -1 or 0)."""
        signals = pd.DataFrame(index=self.bars.index)
        signals['signal'] = 0.0       

        # Predict the subsequent period with the QDA model
        signals['signal'] = self.model.predict(self.predictors)

        # Remove the first five signal entries to eliminate
        # NaN issues with the signals DataFrame
        signals['signal'][0:5] = 0.0
        signals['positions'] = signals['signal'].diff() 

        return signals
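A hedged usage sketch, assuming bars is a DataFrame of daily bars for ^GSPC indexed by date and covering the 2005 test window (create_lagged_series and the Strategy base class come from the surrounding project):

# Hypothetical driver: build the strategy and inspect its signals.
strategy = SNPForecastingStrategy("^GSPC", bars)
signals = strategy.generate_signals()
print(signals.head())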
def qda(data,labels,n,v_type):
	train_data,train_labels,test_data,test_labels = split_data(data,labels,v_type)

	clf = QDA()
	clf.fit(train_data, train_labels)
	y_pred = clf.predict(test_data)
	pure_accuracy_rate = len([y_pred[x] for x in range(len(y_pred)) if y_pred[x] == test_labels[x]])/float(len(test_labels))
	report = classification_report(test_labels, y_pred, target_names=rock_names)  # (y_true, y_pred) order
	cm = confusion_matrix(test_labels, y_pred)
	return pure_accuracy_rate,report,y_pred,test_labels,test_data,clf,cm,"QDA"
Example #9
def QDA_onFullDataset():
    #Parsing Full training dataset
    XFull = common.parseFile('../UCI HAR Dataset/train/X_train.txt')
    YFull = common.parseFile('../UCI HAR Dataset/train/y_train.txt')

    #Parsing Full testing dataset
    XFullTest = common.parseFile('../UCI HAR Dataset/test/X_test.txt')
    YFullTest = common.parseFile('../UCI HAR Dataset/test/y_test.txt')

    #Fitting data using QDA classifier
    clf = QDA()
    clf.fit(XFull, YFull.flatten())

    #Testing the results
    precision,recall,fscore = common.checkAccuracy(clf.predict(XFullTest),YFullTest,[1,2,3,4,5,6])
    print fscore
Example #11
def runQDA(fileNamaParam, trainizingSizeParam):
    # fraction of the data held out for testing
    testSplitSize = 1.0 - trainizingSizeParam
    testAndTrainData = IO_.giveTestAndTrainingData(fileNamaParam)
    trainData = testAndTrainData[0]
    testData = testAndTrainData[1]
    ### classification
    ## get the test and training sets
    featureSpace_train, featureSpace_test, vScore_train, vScore_test = cross_validation.train_test_split(
        trainData, testData, test_size=testSplitSize, random_state=0)
    ## fire up the model
    theQDAModel = QDA()
    theQDAModel.fit(featureSpace_train, vScore_train)
    thePredictedScores = theQDAModel.predict(featureSpace_test)
    #print "The original vector: "
    #print vScore_test
    #print "The predicted score vector: "
    #print thePredictedScores
    evalClassifier(vScore_test, thePredictedScores)
def qda(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans qda")
    try:
        ncol=tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        qda=QDA()
        qda.fit(X,y)
        y_pred = qda.predict(X)
        print "#########################################################################################################\n"
        print "Quadratic Discriminant Analysis Accuracy "
        print "classification accuracy:", metrics.accuracy_score(y, y_pred)
        print "precision:", metrics.precision_score(y, y_pred)
        print "recall:", metrics.recall_score(y, y_pred)
        print "f1 score:", metrics.f1_score(y, y_pred)
        print "\n"
        print "#########################################################################################################\n"
        results = Output+"QDA_metrics.txt"
        file = open(results, "w")
        file.write("Quadratic Discriminant Analaysis estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
        file.write("\n")
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y)):
            file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
        file.close()
        title = "QDA"
        save = Output + "QDA_confusion_matrix.png"
        plot_confusion_matrix(y, y_pred,title,save)
    except (AttributeError):
        if configuration.normalization == 'normalize':
            results = Output+"Multinomial_NB_metrics.txt"
            file = open(results, "w")
            file.write("In configuration.py file, normalization='normalize' -- Input Values must be superior to 0\n")
            file.close()
    lvltrace.lvltrace("LVLSortie dans qda")
def qda(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans qda split_test")
    try:
        ncol=tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        n_samples, n_features = X.shape
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        print X_train.shape, X_test.shape
        qda = QDA()  # a QDA model, so name it accordingly
        qda.fit(X_train, y_train)
        y_pred = qda.predict(X_test)
        print "Quadratic Discriminant Analysis Accuracy "
        print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
        print "precision:", metrics.precision_score(y_test, y_pred)
        print "recall:", metrics.recall_score(y_test, y_pred)
        print "f1 score:", metrics.f1_score(y_test, y_pred)
        #LVLprint "\n"
        results = Output+"QDA_metrics_test.txt"
        file = open(results, "w")
        file.write("Quadratic Discriminant Analaysis estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
        file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
        file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
        file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
        file.write("\n")
        file.write("True Value, Predicted Value, Iteration\n")
        for n in xrange(len(y_test)):
            file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
        file.close()
        title = "QDA %f"%test_size
        save = Output + "QDA_confusion_matrix"+"_%s.png"%test_size
        plot_confusion_matrix(y_test, y_pred,title,save)
    except (AttributeError):
        if configuration.normalization == 'normalize':
            results = Output+"Multinomial_NB_metrics_test.txt"
            file = open(results, "w")
            file.write("In configuration.py file, normalization='normalize' -- Input Values must be superior to 0\n")
            file.close()
    lvltrace.lvltrace("LVLSortie dans qda split_test")
Example #14
class QDAClassifier(Classifier):
    '''Quadratic Discriminant Analysis classifier'''
    def __init__(self):
        super(QDAClassifier, self).__init__()
        self.fig = 20
        self.is_trainable = True
        self.is_trained = False

    def train(self, classification_data, indices=None, settings_name=None, **kwargs):
        super(QDAClassifier, self).train(classification_data, indices, settings_name, **kwargs)
        indices = self.settings['indices']

        self.qda = QDA(**self.classifier_kwargs)

        self.qda.fit(classification_data.data[:, indices], classification_data.are_hurr_actual)
        return self

    def classify(self, classification_data):
        super(QDAClassifier, self).classify(classification_data)
        indices = self.settings['indices']

        self.are_hurr_pred = self.qda.predict(classification_data.data[:, indices])
        return self.are_hurr_pred
# Quadratic Discriminant Analysis
from sklearn import datasets
from sklearn import metrics
from sklearn.qda import QDA
# load the iris datasets
dataset = datasets.load_iris()
# fit a QDA model to the data
model = QDA()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
Example #16
# Preprocessing: binarize each row against its own mean
# (values above the row mean map to 0., the rest to 1.)
for ind in range(X.shape[0]):
    X[ind, :] = where(X[ind, :] > mean(X[ind, :]), 0., 1.)

# PCA
pca = PCA(50)
pca.fit(X)
X50 = pca.transform(X)

# QDA
qda = QDA()
qda.fit(X50, y)

# Accuracy score
p = qda.predict(X50)
print 'Accuracy score = ', accuracy_score(p, y)

# Read in test data
test = pd.read_csv('test.csv')
test = test.values

# Preprocessing
for ind in range(test.shape[0]):
    test[ind, :] = where(test[ind, :] > mean(test[ind, :]), 0., 1.)

# Transform data
test = pca.transform(test)

# Predict
p = qda.predict(test)
E_neigh = accuracy_score(y_test, yhat_neigh, normalize=True)
print E_neigh
# We get E_neigh ~ 0.95

# II/3 prediction using Linear Discriminant Analysis (LDA)
lda = LDA()  # avoid rebinding the class name LDA
lda.fit(X_train, y_train)
yhat_LDA = lda.predict(X_test)
E_LDA = accuracy_score(y_test, yhat_LDA, normalize=True)
print E_LDA
# KNN better than LDA (E_LDA ~ 0.86)

# II/3 prediction using Quadratic Discriminant Analysis (QDA)
qda = QDA()  # avoid rebinding the class name QDA
qda.fit(X_train, y_train)
yhat_QDA = qda.predict(X_test)
E_QDA = accuracy_score(y_test, yhat_QDA, normalize=True)
print E_QDA
# see the problem of collinearity between the variables

# II/4 prediction using Random Forests (RF)
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
yhat_RF = RF.predict(X_test)
E_RF = accuracy_score(y_test, yhat_RF, normalize=True)
print E_RF
# We get E_RF ~ 0.91
# Let's analyse the RF parameter settings more closely to improve this result
''' N.B.: to analyse the results really carefully and obtain robust
results, we would need to repeat the random draws and the steps above
at least 200 times and average the outcomes '''
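A minimal sketch of that averaging idea, assuming X and y hold the full design matrix and labels (hypothetical names) and reusing the classifiers imported above:

import numpy as np
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer versions

scores = []
for seed in range(200):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=seed)
    rf = RandomForestClassifier().fit(X_tr, y_tr)
    scores.append(accuracy_score(y_te, rf.predict(X_te)))
print("mean accuracy %.3f +/- %.3f" % (np.mean(scores), np.std(scores)))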
Example #18
# classifier regularization parameter
p_best = 0
count_best = 0

# begin training model
print 'Training model in progress...'

for j in range(200):
    p = 0.02*j
    clf = QDA(reg_param=p)
    clf.fit(X_train, y_train)
    count = 0
    
    # score on the held-out set
    for i in range(len(y_cross)):
        a = clf.predict(X_cross[i])  # newer sklearn expects X_cross[i].reshape(1, -1)
        b = y_cross[i]
        if (a == b):
            count += 1

    # update the regularization parameter
    if count > count_best:
        count_best = count
        p_best = p

    print "Progress at %.1f%%" % (j / 2.0)  # float division, otherwise progress truncates

print 'Training model completed' 

# test the model
clf = QDA(reg_param=p_best)
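The same search can be written more compactly with a vectorized validation score per candidate; a sketch assuming the X_train/y_train and X_cross/y_cross arrays used above:

import numpy as np

best_p, best_acc = 0.0, -1.0
for p in 0.02 * np.arange(200):
    clf = QDA(reg_param=p).fit(X_train, y_train)
    acc = np.mean(clf.predict(X_cross) == y_cross)  # validation accuracy in one call
    if acc > best_acc:
        best_p, best_acc = p, acc
print("best reg_param %.2f (validation accuracy %.3f)" % (best_p, best_acc))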
Example #19
lda.fit(train_weekly_x, train_weekly_y)
lda_preds = lda.predict(test_weekly_x)
lda_score = lda.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, lda_preds)

print "\nLDA Results"
print "Confusion Matrix:"
print conf_matrix
print "Fraction of Correct Predictions: " + str(lda_score)

#%% QDA using sklearn
from sklearn.qda import QDA

qda = QDA()
qda.fit(train_weekly_x, train_weekly_y)
qda_preds = qda.predict(test_weekly_x)
qda_score = qda.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, qda_preds)

print "\nQDA Results"
print "Confusion Matrix:"
print conf_matrix
print "Fraction of Correct Predictions: " + str(qda_score)

#%% KNN using sklearn
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train_weekly_x, train_weekly_y)
knn_preds = knn.predict(test_weekly_x)
knn_score = knn.score(test_weekly_x, test_weekly_y)
Example #20
        remove = remove.union(redundant)
    
    print("For correlation coefficient = ", coefficient)
    #print(remove)
    #print(add)

    train_data = pd.DataFrame(data=train_data_g, columns=df.columns)[df.columns.difference(remove)].values
    test_data = pd.DataFrame(data=test_data_g, columns=df.columns)[df.columns.difference(remove)].values
    print("num of features = ", train_data.shape[1])

    clf = QDA()

    # This gets the time in ipython shell.
    print("Modelling time:")
    %time clf.fit(train_data, train_labels)
    print("Modelling time ends")

    print("prediction time starts:")
    %time predicted_labels = clf.predict(test_data)
    print("prediction time ends")
    #print(classification_report(test_labels, clf.predict(test_data)))
    print(classification_report(test_labels, predicted_labels))

    print("num of featurs = ", train_data.shape[1])
    y_true = test_labels;
    y_pred_proba = clf.predict_proba(test_data);
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba[:, 1])
    roc_auc = auc(fpr, tpr)
    print("ROC AUC =", roc_auc)
    print("\n\n\n")
Example #21
error(y_test, y_predict_g, 'Gaussian Naive Bayes')
error6(y_test, y_predict_g, 'Gaussian Naive Bayes')

###############################################################################
# 3 LDA  
lda = LDA()  # avoid rebinding the class name LDA
lda.fit(X_train, y_train)
y_predict_lda = lda.predict(X_test)
error(y_test, y_predict_lda, 'LDA')
error6(y_test, y_predict_lda, 'LDA')

###############################################################################
# 4 QDA  
qda = QDA()  # avoid rebinding the class name QDA
qda.fit(X_train, y_train)
y_predict_qda = qda.predict(X_test)
error(y_test, y_predict_qda, 'QDA')
error6(y_test, y_predict_qda, 'QDA')

###############################################################################
# 5 Logistic Regression 
LR = LogisticRegression()
LR.fit(X_train, y_train)
y_predict_lr = LR.predict(X_test)
error(y_test, y_predict_lr, 'Logistic Regression')
error6(y_test, y_predict_lr, 'Logistic Regression')

###############################################################################
# 6 K-Neighbors Classifier 
KNC = KNeighborsClassifier(8)
KNC.fit(X_train, y_train)
                 'PitchesVar[6]','PitchesVar[7]','PitchesVar[8]','PitchesVar[9]','PitchesVar[10]','PitchesVar[11]',

                 'TimbreMean[0]','TimbreMean[1]','TimbreMean[2]','TimbreMean[3]','TimbreMean[4]','TimbreMean[5]',
                 'TimbreMean[6]','TimbreMean[7]','TimbreMean[8]','TimbreMean[9]','TimbreMean[10]','TimbreMean[11]',

                 'TimbreVar[0]','TimbreVar[1]','TimbreVar[2]','TimbreVar[3]','TimbreVar[4]','TimbreVar[5]',
                 'TimbreVar[6]','TimbreVar[7]','TimbreVar[8]','TimbreVar[9]','TimbreVar[10]','TimbreVar[11]']

    df_input = pandas.read_csv('pandas_merged_output_cleaned_None.csv',
                               header=None, delimiter="|", names=col_input)
    df_input = df_input.dropna()

    #df_input = df_input[df_input['Year'] != 0][df_input['genre'] != 'CLASSICAL']
    #df_input = df_input[df_input['Year'] != 0][df_input['Year'] < 1992][df_input['genre'] != 'CLASSICAL']
    df_input = df_input[df_input['Year'] != 0][df_input['Year'] >= 1992][df_input['genre'] != 'CLASSICAL']

    df_input_target = df_input[list(range(0, 1))].as_matrix()
    df_input_data = df_input[list(range(1, 70))].as_matrix()

    # splitting the data into training and testing sets
    from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(df_input_data, df_input_target.tolist())

    # Start QDA Classification
    from sklearn.qda import QDA
    clf = QDA(priors=None, reg_param=0.001).fit(X_train, np.ravel(y_train[:]))
    predicted = clf.predict(X_test)
    matches = (predicted == [item for sublist in y_test for item in sublist])
    print "Accuracy : ", (matches.sum() / float(len(matches)))

    else:
        y_home.append(1)

    all_data.append(data_instance)

# --- QDA Classification ---
print 'Fitting data'
home_qda = QDA()
home_qda.fit(all_data, y_home)

correct = 0
print 'Predicting results: '
for i in range(training_size, size, 2):
    inst = x[i].reshape(1, -1)

    ph = home_qda.predict(inst)

    if ph == 0:
        if y[i] > y[i + 1]:
            correct += 1

    elif ph == 2:
        if y[i] == y[i + 1]:
            correct += 1
    else:
        if y[i] < y[i + 1]:
            correct += 1

success_rate = (correct * 1.0) / ((size - training_size) / 2)
print 'Success rate for QDA: ', success_rate
Example #24
# np.concatenate, np.hstack and np.insert don't work directly here because the new column is 1-D;
# with those you would need np.expand_dims first. np.column_stack promotes 1-D inputs itself.
X = np.column_stack((X, training_zeros))
lda = LDA(n_components=1)
lda = lda.fit(X, y)
predict_LDA= lda.predict(testSet)
res = pd.crosstab(predict_LDA, actual_testSet)
print(res)
correct_rate= np.mean(actual_testSet == predict_LDA)
print('overall fraction of correct predictions: {correct_rate}'.format(correct_rate=correct_rate))

#Question F
print('(f) ')
qda = QDA()
qda = qda.fit(X, y)
predict_QDA = qda.predict(testSet)
res = pd.crosstab(predict_QDA, actual_testSet)
print(res)
correct_rate= np.mean(actual_testSet == predict_QDA)
print('overall fraction of correct predictions: {correct_rate}'.format(correct_rate=correct_rate))

#Question G
print('(g) ')
knn = KNeighborsClassifier(n_neighbors=1)
knn = knn.fit(X, y)
predict_KNN = knn.predict(testSet)
res = pd.crosstab(predict_KNN, actual_testSet)
print(res)
correct_rate= np.mean(actual_testSet == predict_KNN)
print('overall fraction of correct predictions: {correct_rate}'.format(correct_rate=correct_rate))
Example #26
        test.append(temp)
    for line in trainData:
        temp=[]
        line = line.split(',')
        if line[0]=='x.1':
            continue
        for i in range(0,len(line)):
            if i<0:  # never true here, so trainTarget stays empty; targets presumably come from fn4
                trainTarget.append(float(line[i].strip()))
            else:
                temp.append(float(line[i].strip()))
        train.append(temp)
    return train,test,trainTarget,testTarget
if __name__ =='__main__':
    fn1='./test.data'
    fn2='./train.data'
    fn3='./testTarget.data'
    fn4='./trainTarget.data'
    train,test,trainTarget,testTarget=InitData(fn1,fn2,fn3,fn4)
    clf = QDA()
    clf.fit(train,trainTarget)
    y_pred=clf.predict(test)
    print "error"
    print 1-np.mean(y_pred==testTarget)
Example #27
    def classifierTrainTest(score, diagn, real_art, cvPartition, classifier,
                            subjIndex, preAccMatrix, preInstOrder):
        x = 0
        iteration = 0
        idx = 0
        PCNo = len(score[0])
        subAccMatrix = 0
        # FIX: replicate MATLAB's test() method on a cvpartition object here
        #idx = numpy.random.rand(cvPartition, iteration)
        #idx_test = numpy.where(idx == 1)
        #idx_train = numpy.where(idx != 1)
        print("cvPartition:")
        print(cvPartition)

        # QUESTION: cvPartition is an iterable of (train, test) index pairs, not a scalar
        # there must be at least 2 folds
        for idx_train, idx_test in cvPartition:
            #change idx to boolean array
            idx = numpy.zeros((len(score), 1), dtype=bool)
            for index in idx_test:
                idx[index] = True

            #for testing purposes
            #idx = numpy.zeros((len(score), 1), dtype=bool)
            #idx[47] = True

            #idx is all training in MATLAB implementation?
            cvTEST = numpy.zeros((sum(idx), PCNo))
            diagnTEST = numpy.zeros((sum(idx), 1))
            real_artTEST = numpy.zeros((sum(idx), 1))
            instIndexTEST = numpy.zeros((sum(idx), 1))

            cvTRAIN = numpy.zeros((len(idx) - sum(idx), PCNo))
            diagnTRAIN = numpy.zeros((len(idx) - sum(idx), 1))
            real_artTRAIN = numpy.zeros((len(idx) - sum(idx), 1))

            k = 0
            m = 0

            for j in range(len(idx)):
                if idx[j] == 1:
                    cvTEST[k, :] = score[j, :]
                    diagnTEST[k] = diagn[j]
                    real_artTEST[k] = real_art[j]
                    instIndexTEST[k] = subjIndex[j]
                    k = k + 1
                else:
                    cvTRAIN[m, :] = score[j, :]
                    diagnTRAIN[m] = diagn[j]
                    real_artTRAIN[m] = real_art[j]
                    m = m + 1

            # FIX: use scikit-learn for classifiers and predictions
            if classifier == "lda":
                #ldaModel = LDA()
                priorsArrays = numpy.array((.5, .5))
                ldaModel = LDA(solver='eigen',
                               priors=priorsArrays,
                               shrinkage=1.00)
                #ldaModel = LDA()
                ldaModel.fit(cvTRAIN, diagnTRAIN)
                label = ldaModel.predict(cvTEST)
            elif classifier == 'qda':
                # training a quadratic discriminant classifier to the data
                # (QDA accepts no solver/shrinkage options; those belong to LDA)
                priorsArrays = numpy.array((.5, .5))
                qdaModel = QDA(priors=priorsArrays)
                qdaModel.fit(cvTRAIN, diagnTRAIN)
                label = qdaModel.predict(cvTEST)
            elif classifier == 'tree':
                # training a decision tree to the data
                treeModel = tree()  # assumes tree is bound to e.g. DecisionTreeClassifier
                treeModel.fit(cvTRAIN, diagnTRAIN)
                label = treeModel.predict(cvTEST)
            elif classifier == 'svm':
                # training a support vector machine to the data
                svmModel = SVC()
                svmModel.fit(cvTRAIN, diagnTRAIN)
                label = svmModel.predict(cvTEST)

            trueClassLabel = diagnTEST
            predictedClassLabel = label

            #from former loop

            subAccMatrix = numpy.column_stack(
                (trueClassLabel, predictedClassLabel, real_artTEST))
            preAccMatrix[x:x + len(subAccMatrix[:, 0]), :] = subAccMatrix
            preInstOrder[x:x + len(instIndexTEST[:, 0])] = instIndexTEST

            x = x + len(subAccMatrix[:, 0])

            #for testing purposes
            #break
        # create dictionary for return values
        return {
            'cvTEST': cvTEST,
            'diagnTEST': diagnTEST,
            'real_artTEST': real_artTEST,
            'instIndexTEST': instIndexTEST,
            'cvTRAIN': cvTRAIN,
            'diagnTRAIN': diagnTRAIN,
            'real_artTRAIN': real_artTRAIN,
            'trueClassLabel': trueClassLabel,
            'predictedClassLabel': predictedClassLabel,
            'idx': idx,
            'subAccMatrix': subAccMatrix,
            'preAccMatrix': preAccMatrix,
            'preInstOrder': preInstOrder
        }
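The loop above expects cvPartition to yield (idx_train, idx_test) pairs; a minimal sketch of building one with scikit-learn, where StratifiedKFold is an assumption standing in for MATLAB's cvpartition:

import numpy
from sklearn.model_selection import StratifiedKFold  # sklearn.cross_validation in older versions

diagn_flat = numpy.asarray(diagn).ravel()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cvPartition = list(skf.split(score, diagn_flat))  # [(idx_train, idx_test), ...]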
Example #29
			Test.append(1)
		if(i.split(',')[4]=='Iris-virginica'):
			Test.append(2)
	except:
		pass
h=0.02
Y=Test
X=numpy.transpose(Data)
#clf=LDA()
#clf=QDA()
clf=QDA(reg_param=0.3)
x=clf.fit(Data,Test)
x_min, x_max = X[0].min() - .5, X[0].max() + .5
y_min, y_max = X[1].min() - .5, X[1].max() + .5
xx, yy = numpy.meshgrid(numpy.arange(x_min, x_max, h), numpy.arange(y_min, y_max, h))
Z = clf.predict(numpy.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Plot also the training points
plt.scatter(X[0], X[1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
Example #30
File: QDA.py Project: pyAddict/Pvt-Repo
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
data_nonLinear = np.genfromtxt('exp2d2.txt', delimiter=',')
temp = pd.DataFrame(data_nonLinear)
#Data Visualization
temp.head()
x = data_nonLinear[:, 0:2]
y = data_nonLinear[:, np.newaxis, 2]
y = y.ravel()
x_train, x_test, y_train, y_test = train_test_split(x, y)
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.qda import QDA
qdd = QDA()
#ldd=LinearDiscriminantAnalysis()
qdd.fit(x_train, y_train)
y_pred = qdd.predict(x_test)
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred))
#Plotting
import matplotlib.pyplot as plt
x1 = x_test[:, 0]
x2 = x_test[:, 1]
# color points by true class; plt.hold was removed from matplotlib, and
# repeated scatter calls accumulate on the current axes by default
colors = ['red' if label == 1 else 'blue' for label in y_test]
plt.scatter(x1, x2, color=colors)
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()
Example #31
for i in range(1,11):
	sklearn_pca = PCA(n_components=i)
	Xred_pca = sklearn_pca.fit_transform(X_std)
	Xred_pca_test = sklearn_pca.transform(X_std_test)

	lda_model = LDA()
	lda_model.fit(Xred_pca,y)
	yhat_train = lda_model.predict(Xred_pca)
	lda_train.append(zero_one_loss(y, yhat_train)) 
	yhat_test = lda_model.predict(Xred_pca_test)
	lda_test.append(zero_one_loss(ytest, yhat_test)) 

	qda_model = QDA()
	qda_model.fit(Xred_pca,y)
	yhat_train = qda_model.predict(Xred_pca)
	qda_train.append(zero_one_loss(y, yhat_train)) 
	yhat_test = qda_model.predict(Xred_pca_test)
	qda_test.append(zero_one_loss(ytest, yhat_test)) 

	knn_model = KNeighborsClassifier(n_neighbors=7)
	knn_model.fit(Xred_pca,y)
	yhat_train = knn_model.predict(Xred_pca)
	knn_train.append(zero_one_loss(y, yhat_train)) 
	yhat_test = knn_model.predict(Xred_pca_test)
	knn_test.append(zero_one_loss(ytest, yhat_test)) 

plt.figure(figsize=(12, 8))
plt.plot(lda_train, label="Training set")
plt.plot(lda_test, label="Test set")
class SNPForecastingStrategy(Strategy):
  """Requires:
    symbol - A stock symbol to form a strategy.
    bars - A df of bars for the above symbol."""
    
  def __init__(self, symbol, bars):
    self.symbol = symbol
    self.bars = bars
    self.create_periods()
    self.fit_model()
    
  def create_periods(self):
    """Create training/test periods."""
    self.start_train = datetime.datetime(2001,1,10)
    self.start_test = datetime.datetime(2005,1,1)
    self.end_period = datetime.datetime(2005,12,31)
    
  def fit_model(self):
    """Fits a Quadratic Discriminant Analyser to the US
    stock market index (^GSPC in Yahoo)."""
    # Create a lagged series of the S&P500 US stock market index
    
    snpret =  create_lagged_series(self.symbol, self.start_train,
                self.end_period, lags=5)
    
    # Use the prior two days of returns as 
    # predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2"]]
    y = snpret["Direction"]
    
    # Create training and test sets
    X_train = X[X.index < self.start_test]
    y_train = y[y.index < self.start_test]
    
    # Create the predicting factors for use
    # in direction forecasting.
    self.predictors = X[X.index >= self.start_test]
    
    # Create the Quadratic Discriminant Analysis model
    # and the forecasting strategy
    self.model = QDA()
    self.model.fit(X_train, y_train)
    
  def generate_signals(self):
    """Returns the df of symbols containing the signals
    to go long, short, or hold (1, -1, 0)."""
    signals = pd.DataFrame(index=self.bars.index)
    signals['signal'] = 0.0
    
    # predict the subsequent period with the QDA model
    signals['signal'] = self.model.predict(self.predictors)
    
    # Remove the first five signals entries to eliminate
    # NaN issues with the signal df
    signals['signal'][0:5] = 0.0
    signals['positions'] = signals['signal'].diff()
    
    return signals
    
class MarketingIntradayPortfolio(Portfolio):
  """Buys or sells 500 shares of an asset at the opening price
  of every bar, depending upon the direction of the forecast,
  closing out the trade at the close of the bar.

  Requires:
  symbol - A stock symbol which forms the basis of the portfolio.
  bars - A df of bars for a symbol set.
  signals - A df of signals (1, 0, -1) for each symbol.
  initial_capital - The amount in cash at the start of the portfolio."""

  def __init__(self, symbol, bars, signals, initial_capital=100000.0):
    self.symbol = symbol
    self.bars = bars
    self.signals = signals
    self.initial_capital = float(initial_capital)
    self.positions = self.generate_positions()

  def generate_positions(self):
    """Generate the positions df, based on the signals provided by the 'signals' df."""
    positions = pd.DataFrame(index=self.signals.index).fillna(0.0)

    # Long or short 500 shares of SPY based on directional signal every day
    positions[self.symbol] = 500*self.signals['signal']
    return positions

  def backtest_portfolio(self):
    """Backtest the portfolio and return a df containing
    the equity curve and the percentage returns."""

    # Set the portfolio object to have the same time period
    # as the positions df.
    portfolio = pd.DataFrame(index=self.positions.index)
    pos_diff = self.positions.diff()

    # Work out the intraday profit of the difference
    # in open and closing prices and then determine
    # the daily profit by longing if an up day is predicted
    # and shorting if a down day is predicted

    portfolio['price_diff'] = self.bars['Close'] - self.bars['Open']
    portfolio['price_diff'][0:5] = 0.0
    portfolio['profit'] = self.positions[self.symbol] * portfolio['price_diff']

    # Generate the equity curve and percentage returns
    portfolio['total'] = self.initial_capital + portfolio['profit'].cumsum()
    portfolio['returns'] = portfolio['total'].pct_change()
    return portfolio
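A hedged end-to-end sketch wiring the two classes together, assuming bars holds daily Open/Close bars over the test window:

# Hypothetical driver: forecast direction, then backtest intraday positions.
strategy = SNPForecastingStrategy("^GSPC", bars)
signals = strategy.generate_signals()
portfolio = MarketingIntradayPortfolio("SPY", bars, signals)
returns = portfolio.backtest_portfolio()
print(returns.tail())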
Example #33
recall_lda_50 = pre_rec(cmatrix_lda_50, y_test_count)
precision_lda_50 = pre_rec(cmatrix_lda_50, y_pred_count_lda_50)
accuracy_lda_50 = overall_accuracy(cmatrix_lda_50, y_test)

print precision_lda_50
print recall_lda_50
print accuracy_lda_50
print "####################################################################"
                
###############################################################################
###############################################################################
# 3 QDA

qda = QDA()
qda.fit(X_train, y_train)
y_predict_qda = qda.predict(X_test)

y_pred_count_qda = total_count(y_predict_qda)
cmatrix_qda = confusion_matrix(y_test, y_predict_qda)

print "\nQDA:"
print cmatrix_qda
print ""

recall_qda = pre_rec(cmatrix_qda, y_test_count)
precision_qda = pre_rec(cmatrix_qda, y_pred_count_qda)
accuracy_qda = overall_accuracy(cmatrix_qda, y_test)

print precision_qda
print recall_qda
print accuracy_qda
Example #34
#Question D
print('(d) ')
lda = LDA()
lda = lda.fit(trainingData, trainingResponse)
predict_LDA = lda.predict(testData)
res = pd.crosstab(predict_LDA, testResponse.values)
print(res)
error_rate = np.mean(predict_LDA != testResponse.values)
print('LDA test error is {error_rate}'.format(error_rate=error_rate))

#Question E
print('(e) ')
qda = QDA()
qda = qda.fit(trainingData, trainingResponse)
predict_QDA = qda.predict(testData)
res = pd.crosstab(predict_QDA, testResponse.values)
print(res)
error_rate = np.mean(predict_QDA != testResponse.values)
print('QDA test error is {error_rate}'.format(error_rate=error_rate))

#Question F
print('(f) ')
lr = LogisticRegression()
lr = lr.fit(trainingData, trainingResponse)
predict_LR = lr.predict(testData)
res = pd.crosstab(predict_LR, testResponse.values)
print(res)
error_rate = np.mean(predict_LR != testResponse.values)
print('Logistic Regression test error is {error_rate}'.format(error_rate=error_rate))
Example #36
File: sk_qda.py Project: Ramlinbird/prml
# Similar to LDA, but does not assume the same covariance across classes.
from sklearn.qda import QDA
import numpy as np

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

# Visualization
import matplotlib.pyplot as plt

plt.figure(1)
plt.scatter(X[y == 1, 0], X[y == 1, 1], color="g")
plt.scatter(X[y == 2, 0], X[y == 2, 1], color="b")
plt.title("X Data Set Visualization")

# Classification
clf = QDA()
clf = clf.fit(X, y)

print(clf.predict([[-0.8, -1]]))
plt.show()
def quadraticDiscriminantAnalysis(dataFile, outputFolder, regParam,parameters):
	inputData = yaml.safe_load(open(dataFile))
	trainingSet = inputData['training']
	testingSet = inputData['testing']
	inputFile = inputData['inputFile']
	label = inputData['label']
	resultSet = []
	modelset = []
	importanceset = []
	
	if not os.path.exists(outputFolder):
		try:
			os.makedirs(outputFolder)
		except OSError as exc:
			if exc.errno != errno.EEXIST:
				raise exc
			pass

	modelsfolder = outputFolder + "/models/"
	if not os.path.exists(modelsfolder):
		try:
			os.makedirs(modelsfolder)
		except OSError as exc:
			if exc.errno != errno.EEXIST:
				raise exc
			pass

	importancefolder = outputFolder + "/FeatureImportance/"
	if not os.path.exists(importancefolder):
		try:
			os.makedirs(importancefolder)
		except OSError as exc:
			if exc.errno != errno.EEXIST:
				raise exc
			pass


	for i in range(len(trainingSet)):
		train_df = pd.read_csv(trainingSet[i])
		train_labels = train_df[label]
		train_features = train_df.drop(label,axis=1)
		test_df = pd.read_csv(testingSet[i])
		test_predictions = pd.DataFrame(test_df[label])
		test_features = test_df.drop(label,axis=1)

		qda = QDA(reg_param=regParam)
		qda.fit(train_features, train_labels)

		modelFile = modelsfolder + "QuadraticDiscriminantAnalysisModel" + str(i+1) + ".pkl"
		with open(modelFile, 'wb') as fd:
			pickle.dump(qda, fd)
		modelset.append(modelFile)

		fd.close()

		importanceFile = calculateFeatureImportance(train_features,qda,importancefolder,i)
		importanceset.append(importanceFile)

		test_predictions['predictions'] = qda.predict(test_features)

		resultFile = outputFolder + '/result' + str(i + 1) + '.csv'

		test_predictions.to_csv(resultFile,index=False)
		resultSet.append(resultFile)

	resultDict = dict()

	resultDict['results'] = resultSet
	resultDict['models'] = modelset
	resultDict['featureimportance'] = importanceset
	resultDict['label'] = label

	if not parameters:
		parameters['parameter']='default'
	resultDict['algo_params'] = parameters
	resultDict['split_params'] = inputData['split_params']
	if 'feature_selection_parameters' in inputData:
		resultDict['feature_selection_parameters'] = inputData['feature_selection_parameters']
		resultDict['feature_selection_algorithm'] = inputData['feature_selection_algorithm']
	if 'feature_extraction_parameters' in inputData:
		resultDict['feature_extraction_parameters'] = inputData['feature_extraction_parameters']
		resultDict['feature_extraction_algorithm'] = inputData['feature_extraction_algorithm']
	if 'preprocessing_params' in inputData:
		resultDict['preprocessing_params'] = inputData['preprocessing_params']
	resultDict['inputFile'] = inputFile
	resultDict['algorithm'] = "QuadraticDiscriminantAnalysis"
	yaml.dump(resultDict, open(outputFolder + '/results.yaml', 'w'))
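A hedged invocation sketch with hypothetical paths; the YAML file must supply the training/testing CSV lists, inputFile, label and split_params read above:

quadraticDiscriminantAnalysis('input.yaml', 'qda_output', 0.1, {'parameter.p': '0.1'})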
Example #38
std = np.std(dataset[:,0])
dataset[:,0] = (dataset[:,0] - mean) / std

for i in range(len(dataset)):
	if dataset[i,10] == 2:
		dataset[i, 10] = 0

target = dataset[:,-1]
data = dataset[:,:14]
expected = target

#QDA

clf = QDA()
clf.fit(data, target)
predicted = clf.predict(data)

#LDA

clf = LDA()
clf.fit(data, target)
predicted = clf.predict(data)

#Gaussian

model = GaussianNB()
model.fit(data, target)
print(model)

#data for prediction:
Example #39
# KNN
knn1 = KNeighborsClassifier(n_neighbors=2)
knn1 = knn1.fit(X_train, y_train)
knn1.score(X_train, y_train)
knnpredict = knn1.predict(X_test)
print knnpredict
confusion_matrix(y_test, knnpredict)
print metrics.accuracy_score(y_test, knnpredict)
knn2 = KNeighborsClassifier(n_neighbors=10)
knn2 = knn2.fit(X_train, y_train)
knn2.score(X_train, y_train)
knnpredict1 = knn2.predict(X_test)
print knnpredict1
confusion_matrix(y_test, knnpredict1)
print metrics.accuracy_score(y_test, knnpredict1)

# QDA

qda1 = QDA()
qda1 = qda1.fit(X_train, y_train)
qda1.score(X_train, y_train)
qdapredict = qda1.predict(X_test)
print qdapredict
confusion_matrix(y_test, qdapredict)
print metrics.accuracy_score(y_test, qdapredict)

# Strategies to improve the test accuracy
# We can tune the algorithms: machine learning algorithms are driven by hyperparameters, and these influence the outcome of learning. The objective is to find the optimum value of each parameter, so we should check the impact of each of them on the model.
# Applying ensemble methods such as bagging and boosting can also improve the accuracy of the model. A sketch of both ideas follows.
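A minimal sketch of both ideas on the split above (GridSearchCV and BaggingClassifier are standard scikit-learn tools; the parameter grid is an assumption):

from sklearn.model_selection import GridSearchCV  # sklearn.grid_search in older versions
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameter tuning: search over n_neighbors for KNN.
grid = GridSearchCV(KNeighborsClassifier(), {"n_neighbors": [1, 2, 5, 10, 20]}, cv=5)
grid.fit(X_train, y_train)
print("best params:", grid.best_params_, "test accuracy:", grid.score(X_test, y_test))

# Ensembling: bag many KNN models trained on bootstrap samples.
bag = BaggingClassifier(KNeighborsClassifier(n_neighbors=2), n_estimators=25)
bag.fit(X_train, y_train)
print("bagged accuracy:", bag.score(X_test, y_test))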
Example #40
    def qda_fit(self):
        qda_res = QDA().fit(self.train_X.values, self.train_y.values)
        pred_y = qda_res.predict(self.test_X.values)
        tp.output_table(pred_y, self.test_y.values)
def quadraticDiscriminantAnalysis(dataFile, outputFolder, regParam,parameters):
	inputData = yaml.safe_load(open(dataFile))
	trainingSet = inputData['training']
	testingSet = inputData['testing']
	inputFile = inputData['inputFile']
	label = inputData['label']
	resultSet = []
	if not os.path.exists(outputFolder):
		try:
			os.makedirs(outputFolder)
		except OSError as exc:
			if exc.errno != errno.EEXIST:
				raise exc
			pass
	for i in range(len(trainingSet)):
		"""testPredictions = []
		trainLabels = []
		trainFeatures = []
		trainDataSet = arff.load(trainingSet[i])
		for row in trainDataSet:
			content = list(row)
			trainFeatures.append(content[0:len(content)-1])
			trainLabels.append(content[len(content)-1])
		testFeatures = []
		testLabels = []
		testDataSet = arff.load(testingSet[i])
		for row in testDataSet:
			content = list(row)
			testFeatures.append(content[0:len(content)-1])
			testLabels.append(content[len(content)-1])"""
		train_df = pd.read_csv(trainingSet[i])
		train_labels = train_df[label]
		train_features = train_df.drop(label,axis=1)
		test_df = pd.read_csv(testingSet[i])
		test_predictions = pd.DataFrame(test_df[label])
		test_features = test_df.drop(label,axis=1)

		qda = QDA(reg_param=regParam)
		qda.fit(train_features, train_labels)
		test_predictions['predictions'] = qda.predict(test_features)
		#testPredictions = np.array(qda.predict(testFeatures)).tolist()
		resultFile = outputFolder + '/result' + str(i + 1) + '.csv'
		"""with open(resultFile,'w') as outfile:
			outfile.write('predictions:\n')
			outfile.write(yaml.dump(testPredictions, default_flow_style=False))
			outfile.write('true_labels:\n')
			outfile.write(yaml.dump(testLabels, default_flow_style=False))"""
		test_predictions.to_csv(resultFile,index=False)
		resultSet.append(resultFile)
	resultDict = dict()
	#parameters = dict()
	resultDict['results'] = resultSet
	resultDict['label'] = label
	#parameters['parameter.p'] = regParam
	if not parameters:
		parameters['parameter']='default'
	resultDict['algo_params'] = parameters
	resultDict['split_params'] = inputData['split_params']
	if 'feature_selection_parameters' in inputData:
		resultDict['feature_selection_parameters'] = inputData['feature_selection_parameters']
		resultDict['feature_selection_algorithm'] = inputData['feature_selection_algorithm']
	if 'feature_extraction_parameters' in inputData:
		resultDict['feature_extraction_parameters'] = inputData['feature_extraction_parameters']
		resultDict['feature_extraction_algorithm'] = inputData['feature_extraction_algorithm']
	if 'preprocessing_params' in inputData:
		resultDict['preprocessing_params'] = inputData['preprocessing_params']
	resultDict['inputFile'] = inputFile
	resultDict['algorithm'] = "QuadraticDiscriminantAnalysis"
	yaml.dump(resultDict, open(outputFolder + '/results.yaml', 'w'))

def main(args):
	inputFile = ''
	outputFolder = ''
	parameters=dict()
	regParam = 0.0 #float; regularizes the covariance estimate as [(1-reg_param)*Sigma + reg_param*np.eye(n_features)]
	try:
		opts,args = getopt.getopt(args, "i:o:p:", [])
	except getopt.GetoptError:
		print 'QuadraticDiscriminantAnalysis.py -i <inputFile> -o <outputFolder> -p <regParam>'
		sys.exit(2)
	for opt,arg in opts:
		if opt == '-i':
			inputFile = arg
		elif opt == '-o':
			outputFolder = arg
		elif opt == '-p':
			regParam = float(arg)
			parameters['parameter.p']=arg
	quadraticDiscriminantAnalysis(inputFile, outputFolder, regParam,parameters)
if __name__ == "__main__":
   main(sys.argv[1:])
Example #43
    N_st = np.sum(y == 0)
    N_rr = N_tot - N_st
    N_train = len(y_train)
    N_test = len(y_test)
    N_plot = 5000 + N_rr

    #----------------------------------------------------------------------
    # perform QDA
    classifiers = []
    predictions = []
    Ncolors = np.arange(1, X.shape[1] + 1)

    for nc in Ncolors:
        clf = QDA()
        clf.fit(X_train[:, :nc], y_train)
        y_pred = clf.predict(X_test[:, :nc])

        classifiers.append(clf)
        predictions.append(y_pred)

    predictions = np.array(predictions)

    completeness, contamination = completeness_contamination(
        predictions, y_test)

    print "completeness", completeness
    print "contamination", contamination

    #------------------------------------------------------------
    # Compute the decision boundary
    clf = classifiers[1]
Example #44
File: qda.py Project: Banaei/ces-ds
display_1 = [2, 2]
display_2 = [3, 1]
display_3 = [2.5, 2.5]

# newer sklearn expects 2-D input to predict_log_proba, hence the extra brackets
values_proba_qda_1 = np.exp(clf.predict_log_proba([display_1]))[0]
values_proba_qda_2 = np.exp(clf.predict_log_proba([display_2]))[0]
values_proba_qda_3 = np.exp(clf.predict_log_proba([display_3]))[0]

fig3 = plt.figure()
plot_2d(X, y)

resolution_param = 500  # 500 for nice plotting, 50 for fast version
color_text = '#ff8101'

frontiere(lambda xx: clf.predict(xx), X, step=resolution_param)

plt.annotate(r'' + '(%.2f' % values_proba_qda_1[0] + ', %.2f'
             % values_proba_qda_1[1] + ', %.2f)' % values_proba_qda_1[2],
             xy=(display_1[0], display_1[1]), xycoords='data',
             color=color_text, xytext=(-150, +100), textcoords='offset points',
             fontsize=12, arrowprops=dict(arrowstyle="->",
             connectionstyle="arc3,rad=.2", color=color_text))

plt.plot(display_1[0], display_1[1], 'o', color=color_text, markersize=12)
plt.annotate(r'' + '(%.2f' % values_proba_qda_2[0] + ', %.2f'
             % values_proba_qda_2[1] + ', %.2f)' % values_proba_qda_2[2],
             xy=(display_2[0], display_2[1]), xycoords='data',
             color=color_text, xytext=(-150, -40), textcoords='offset points',
             fontsize=12, arrowprops=dict(arrowstyle="->",
             connectionstyle="arc3,rad=.2", color=color_text))
Example #46
model = LogisticRegression(multi_class='multinomial',
                           solver='newton-cg',
                           C=100)
#create extended features
xfeatures = np.concatenate((features, features[:, 0:1] * features[:, 1:2]), 1)
y_pred = model.fit(xfeatures, labels[:, 0]).predict(xfeatures)
y_pred = y_pred[:, np.newaxis]
aux = (y_pred != labels)
aux = np.sum(aux.astype(float), 0)
misclassificationRate = aux / labels.size
print(misclassificationRate)

print('Nearest Neighbour')
model = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
model.fit(features, labels[:, 0])
y_pred = model.predict(features)
y_pred = y_pred[:, np.newaxis]
aux = (y_pred != labels)
aux = np.sum(aux.astype(float), 0)
misclassificationRate = aux / labels.size
print(misclassificationRate)

print('Support Vector Machine')
model = SVC(kernel='poly', degree=2, coef0=1.0, C=100)
y_pred = model.fit(features, labels[:, 0]).predict(features)
y_pred = y_pred[:, np.newaxis]
aux = (y_pred != labels)
aux = np.sum(aux.astype(float), 0)
misclassificationRate = aux / labels.size
print(misclassificationRate)
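The same three-line misclassification computation repeats after every model above; a small helper makes the intent explicit (a sketch using only numpy and the names from this snippet):

import numpy as np

def misclassification_rate(y_pred, labels):
    # fraction of mismatches per label column
    y_pred = y_pred[:, np.newaxis]
    return np.sum((y_pred != labels).astype(float), 0) / labels.size

print(misclassification_rate(model.predict(features), labels))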