예제 #1
0
		# NOTE(review): this fragment begins mid-loop -- the enclosing `for` over k
		# and the definitions of X, Xtrain, ytrain, cv, J, count and F1max are not
		# visible in this chunk.  Python 2 syntax (print statements); indentation
		# mixes tabs and spaces and will not compile as-is.
		#for x in X:
		 #  print x
		model = LogisticRegression()
		#from sklearn.svm import SVC
		#model=SVC(kernel='linear')
		#rfe = RFE(model, k)
		#rfe = rfe.fit(Xtrain, ytrain)
                clf=SelectKBest(chi2, k=k).fit_transform(Xtrain, ytrain)
		#print(rfe.support_)
		#print(rfe.ranking_) This is one of the results expected
		# Cross-validation fold: column 1 holds labels, columns 2+ hold features.
		Xcv=cv[:,2:]
		ycv=cv[:,1]
		p=int(0)
		n=int(0)
		#pred=rfe.predict(Xcv)
		# NOTE(review): `clf` above is the transformed feature MATRIX returned by
		# fit_transform, not a fitted estimator -- this `clf.predict(...)` call
		# cannot work as written; the commented-out `rfe.predict` was presumably
		# the working path.  TODO confirm intended estimator.
		pred=clf.predict(Xcv)
		#print "pred set before matrix %s" % str(np.shape(pred))
		#print "ycv set before matrix %s" % str(np.shape(ycv))
		J+=float(f1_score(ycv,pred))
		count+=1
        f1score=float(J/count)
	print "No of features %d f1_score %f" % (k,f1score)
        if f1score>F1max:
		F1max=f1score
		n_features=k
# Report the best feature count found by the sweep (Python 2 print statement).
print n_features,F1max
# Refit RFE on the full data set with the selected number of features;
# `f` is defined earlier in the file (not visible here).
Xtrain=f[:,2:]
ytrain=f[:,1]
model = LogisticRegression()
rfe = RFE(model, n_features)
rfe = rfe.fit(Xtrain, ytrain)
예제 #2
0
def acc(classifier, mdict, splits=10, fselect='', nfeat=100, fmin=0, fmax=1000, a=.05, thresh=0):
    """Mean train/test accuracy of ``classifier`` over pre-split folds.

    Labels are produced by self-labelling: the classifier is first fit on 4
    randomly chosen rows with dummy labels 0..3, and its own predictions on
    each fold then serve as the "ground truth".  The reported accuracy is
    therefore a self-consistency measure, not accuracy against external
    labels.

    Parameters
    ----------
    classifier : estimator with ``fit``/``predict``.
    mdict : mapping with keys ``'phi'`` (train folds) and ``'testPhi'``
        (test folds), each indexable by the tuple ``(i, 0)`` for fold ``i``.
    splits : number of folds to evaluate.
    fselect : substring flags -- ``'min'``/``'max'`` (document-frequency
        bounds), ``'thresh'`` (zero out small values), then at most one of
        ``'MI'``/``'PCA'``/``'reg'``/``'kbest'`` (sklearn selectors).
    nfeat, fmin, fmax, a, thresh : parameters for the filters above.

    Returns
    -------
    tuple
        ``(mean test acc, std test acc, p_val, mean train acc,
        std train acc, p_val_tr)`` where the p-values come from one-sample
        t-tests against the base rate 755/2126.
    """
    test_accs = []
    train_accs = []

    # load data
    phi = mdict.get('phi')
    testPhi = mdict.get('testPhi')

    for i in range(splits):
        # Flatten each fold to 2-D (samples x features).
        X = phi[(i, 0)]
        s = X.shape
        if len(s) == 3:
            X = np.reshape(X, [s[0], s[1] * s[2]])
        else:
            X = np.reshape(X, [s[0], s[1]])

        # Self-labelling: fit on 4 random rows with dummy labels 0..3, then
        # use the classifier's own predictions as this fold's labels.
        rows = random.sample(range(0, s[0]), 4)
        rcts = X[rows, ]
        rlabs = range(0, 4)
        classifier.fit(rcts, rlabs)
        y = classifier.predict(X)
        y = np.reshape(y, s[0])

        X_test = testPhi[(i, 0)]
        s = X_test.shape
        if len(s) == 3:
            X_test = np.reshape(X_test, [s[0], s[1] * s[2]])
        else:
            X_test = np.reshape(X_test, [s[0], s[1]])
        y_test = classifier.predict(X_test)
        y_test = np.reshape(y_test, s[0])

        # subset features (document-frequency bounds / hard threshold)
        if 'min' in fselect:
            cols = X.astype(bool).sum(axis=0) > fmin
            X = X[:, cols]
            X_test = X_test[:, cols]
        if 'max' in fselect:
            cols = X.astype(bool).sum(axis=0) < fmax
            X = X[:, cols]
            X_test = X_test[:, cols]
        if 'thresh' in fselect:
            X[X < thresh] = 0
            X_test[X_test < thresh] = 0

        # supervised / unsupervised feature selection (sklearn; mutually
        # exclusive branches)
        if 'MI' in fselect:
            model = SelectKBest(mutual_info_classif, k=nfeat).fit(X, y)
            X = model.transform(X)
            X_test = model.transform(X_test)
        elif 'PCA' in fselect:
            model = PCA(n_components=nfeat).fit(X)
            X = model.transform(X)
            X_test = model.transform(X_test)
        elif 'reg' in fselect:
            model = SelectFpr(f_classif, alpha=a).fit(X, y)
            X = model.transform(X)
            X_test = model.transform(X_test)
        elif 'kbest' in fselect:
            model = SelectKBest(f_classif, k=nfeat).fit(X, y)
            X = model.transform(X)
            X_test = model.transform(X_test)

        # fit model on the (possibly reduced) training fold
        model = classifier.fit(X, y)

        # Compute accuracy for validation set
        y_hat = model.predict(X_test)
        test_accs.append(sum(y_hat == y_test) / len(y_test))

        # Compute accuracy for training set
        y_hat = model.predict(X)
        train_accs.append(sum(y_hat == y) / len(y))

    # One-sample t-tests of the per-fold accuracies against 755/2126
    # (presumably the majority-class base rate -- TODO confirm).
    p_val = stats.ttest_1samp(test_accs, popmean=755 / 2126)[1]
    p_val_tr = stats.ttest_1samp(train_accs, popmean=755 / 2126)[1]

    return (np.mean(test_accs), np.std(test_accs), p_val,
            np.mean(train_accs), np.std(train_accs), p_val_tr)
예제 #3
0
def acc(classifier,
        fname,
        yfname=None,
        root='./data/',
        fselect='min',
        nfeat=100,
        fmin=0,
        fmax=1000,
        a=.05,
        thresh=0):
    """Train/test accuracy of ``classifier`` on a single MATLAB data set.

    Loads ``root + fname`` via ``scipy.io.loadmat``; labels come from
    ``yfname`` when given, otherwise from the same file.  Expected keys:
    ``'phi'`` / ``'testPhi'`` (features) and ``'asd'`` / ``'asdTe'``
    (labels).

    Parameters
    ----------
    classifier : estimator with ``fit``/``predict``.
    fname, yfname, root : locations of the ``.mat`` inputs on disk.
    fselect : substring flags -- ``'min'``/``'max'`` (document-frequency
        bounds), ``'thresh'`` (zero out small values), then at most one of
        ``'MI'``/``'PCA'``/``'reg'``/``'kbest'`` (sklearn selectors).
    nfeat, fmin, fmax, a, thresh : parameters for the filters above.

    Returns
    -------
    tuple
        ``(test_accuracy, train_accuracy)`` as fractions in [0, 1].
    """

    def _densify(m):
        # MATLAB sparse entries arrive as an np.void triple
        # (indices, values, shape): rebuild through a torch sparse tensor,
        # then flatten to (n_samples, -1).  MATLAB indices are 1-based,
        # hence the "- 1".
        shape = m[2][0]
        t = torch.sparse.FloatTensor(
            torch.from_numpy(m[0].astype(dtype='float32') - 1).t().type(
                torch.LongTensor),
            torch.from_numpy(m[1][:, 0].astype(dtype='float32')),
            torch.Size(tuple(shape)))
        return t.to_dense().reshape(shape[0], -1).numpy(), shape

    # load data
    mdict = scipy.io.loadmat(root + fname)  # import dataset from matlab
    if yfname is None:
        ymdict = mdict
    else:
        ymdict = scipy.io.loadmat(root + yfname)  # import dataset from matlab
    phi = mdict.get('phi')
    testPhi = mdict.get('testPhi')
    asd = ymdict.get('asd')
    testASD = ymdict.get('asdTe')

    X = phi[(0, 0)]
    if isinstance(X, np.void):
        X, s = _densify(X)
    else:
        # NOTE(review): the dense layout falls back to the whole 'phi' array
        # rather than the (0, 0) entry -- presumably how plain (non-struct)
        # .mat files arrive; confirm against the data files.
        X = phi
        s = X.shape
    y = np.reshape(asd, s[0])

    Xt = testPhi[(0, 0)]
    if isinstance(Xt, np.void):
        X_test, s = _densify(Xt)
    else:
        X_test = testPhi
        s = X_test.shape
    y_test = np.reshape(testASD, s[0])

    # subset features (document-frequency bounds / hard threshold)
    if 'min' in fselect:
        cols = X.astype(bool).sum(axis=0) > fmin
        X = X[:, cols]
        X_test = X_test[:, cols]
    if 'max' in fselect:
        cols = X.astype(bool).sum(axis=0) < fmax
        X = X[:, cols]
        X_test = X_test[:, cols]
    if 'thresh' in fselect:
        X[X < thresh] = 0
        X_test[X_test < thresh] = 0

    # rescale (sparse input cannot be mean-centred)
    if sparse.issparse(X):
        scaler = StandardScaler(with_mean=False)
    else:
        scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)

    # supervised / unsupervised feature selection (sklearn; mutually
    # exclusive branches)
    if 'MI' in fselect:
        model = SelectKBest(mutual_info_classif, k=nfeat).fit(X, y)
        X = model.transform(X)
        X_test = model.transform(X_test)
    elif 'PCA' in fselect:
        model = PCA(n_components=nfeat).fit(X)
        X = model.transform(X)
        X_test = model.transform(X_test)
    elif 'reg' in fselect:
        model = SelectFpr(f_classif, alpha=a).fit(X, y)
        X = model.transform(X)
        X_test = model.transform(X_test)
    elif 'kbest' in fselect:
        model = SelectKBest(f_classif, k=nfeat).fit(X, y)
        X = model.transform(X)
        X_test = model.transform(X_test)

    # fit model
    model = classifier.fit(X, y)

    # Compute accuracy for validation set
    y_hat = model.predict(X_test)
    test_acc = sum(y_hat == y_test) / len(y_test)

    # Compute accuracy for training set
    y_hat = model.predict(X)
    train_acc = sum(y_hat == y) / len(y)

    return test_acc, train_acc
예제 #4
0
# Cluster the rows into 3 groups and record the assignment as a new column.
# NOTE(review): `data`, `result`, `classlabel`, `pcadata` and `imp` are
# defined earlier in the file (not visible in this fragment).
kmean = KMeans(n_clusters=3,max_iter = 1000).fit(data)
cluster = kmean.predict(data)
data['Cluster'] = cluster

# using my algorithm (chi method) to find the best 5 features
# NOTE(review): k=3 below keeps 3 features, not 5 -- the comment and the
# code disagree; confirm which was intended.
y = result['Life expectancy at birth (years)']
x = data
new = SelectKBest(chi2, k=3).fit_transform(x,y)
new = pd.DataFrame(new)
a = new
b = classlabel
# Train the knn model and produce the accuracy score
X_train, X_test, y_train, y_test = train_test_split(new,classlabel, train_size=(2/3), test_size=(1/3), random_state=100)
# NOTE(review): `new` is rebound here from the selected-feature DataFrame to
# the classifier object; the original frame is still reachable via `a`.
new = neighbors.KNeighborsClassifier(n_neighbors=5)
new.fit(X_train, y_train)
new_pred=new.predict(X_test)
print("Accuracy of feature engineering: "+ str(round(accuracy_score(y_test, new_pred)*100,3))+"%")

################# PCA #############################

# Separate the source (features) and target into two tables: the first key
# is treated as the target, the remaining keys as features.
pcakeys = [str(result) for result in pcadata.keys()]
pcafeacture = pcadata[pcakeys[1:]]
pcatarget = pcadata[pcakeys[0]]

# PCA normalization, since PCA can be inflated by scale: impute missing
# values, then standardize.  The fitted `scaler` is only fit here --
# presumably transformed later in the file (fragment ends truncated).
idf=pd.DataFrame(imp.fit_transform(pcafeacture))
idf.columns=pcafeacture.columns
idf.index=pcafeacture.index
pcafeacture = idf
scaler = preprocessing.StandardScaler().fit(pcafeacture)
예제 #5
0
def acc(classifier,
        fname,
        yfname=None,
        splits=10,
        fselect='min',
        root='./data/',
        nfeat=100,
        fmin=0,
        fmax=1000,
        a=.05,
        thresh=0):
    """Cross-validated train/test accuracy of ``classifier`` on MATLAB folds.

    Loads ``root + fname`` via ``scipy.io.loadmat``; labels come from
    ``yfname`` when given, otherwise from the same file.  Expected keys:
    ``'phi'`` / ``'testPhi'`` (per-fold features) and ``'cvTrainASD'`` /
    ``'cvTestASD'`` (per-fold labels), each indexable by ``(fold, 0)``.

    Parameters
    ----------
    classifier : estimator with ``fit``/``predict``.
    fname, yfname, root : locations of the ``.mat`` inputs on disk.
    splits : number of CV folds to evaluate.
    fselect : substring flags -- ``'min'``/``'max'`` (document-frequency
        bounds), ``'thresh'`` (zero out small values), then at most one of
        ``'MI'``/``'PCA'``/``'reg'``/``'kbest'`` (sklearn selectors).
    nfeat, fmin, fmax, a, thresh : parameters for the filters above.

    Returns
    -------
    tuple
        ``(mean test acc, std test acc, p_val, mean train acc,
        std train acc, p_val_tr)`` where the p-values come from one-sample
        t-tests against the base rate 755/2126.
    """
    test_accs = []
    train_accs = []

    # load data
    mdict = scipy.io.loadmat(root + fname)  # import dataset from matlab
    if yfname is None:
        ymdict = mdict
    else:
        ymdict = scipy.io.loadmat(root + yfname)  # import dataset from matlab
    phi = mdict.get('phi')
    testPhi = mdict.get('testPhi')
    asd = ymdict.get('cvTrainASD')
    testASD = ymdict.get('cvTestASD')

    for i in range(splits):
        # Flatten each fold to 2-D (samples x features).
        X = phi[(i, 0)]
        s = X.shape
        if len(s) == 3:
            X = np.reshape(X, [s[0], s[1] * s[2]])
        else:
            X = np.reshape(X, [s[0], s[1]])
        y = asd[(i, 0)]
        y = np.reshape(y, s[0])
        X_test = testPhi[(i, 0)]
        s = X_test.shape
        if len(s) == 3:
            X_test = np.reshape(X_test, [s[0], s[1] * s[2]])
        else:
            X_test = np.reshape(X_test, [s[0], s[1]])
        y_test = testASD[(i, 0)]
        y_test = np.reshape(y_test, s[0])

        # add zero columns if the test fold has fewer features than the
        # train fold, so the column counts match
        dif = X.shape[1] - X_test.shape[1]
        if dif > 0:
            z = np.zeros((X_test.shape[0], dif))
            X_test = np.append(X_test, z, 1)

        # subset features (document-frequency bounds / hard threshold)
        if 'min' in fselect:
            cols = X.astype(bool).sum(axis=0) > fmin
            X = X[:, cols]
            X_test = X_test[:, cols]
        if 'max' in fselect:
            cols = X.astype(bool).sum(axis=0) < fmax
            X = X[:, cols]
            X_test = X_test[:, cols]
        if 'thresh' in fselect:
            X[X < thresh] = 0
            X_test[X_test < thresh] = 0

        # rescale (sparse input cannot be mean-centred)
        if sparse.issparse(X):
            scaler = StandardScaler(with_mean=False)
        else:
            scaler = StandardScaler()
        scaler.fit(X)
        X = scaler.transform(X)
        X_test = scaler.transform(X_test)

        # supervised / unsupervised feature selection (sklearn; mutually
        # exclusive branches)
        if 'MI' in fselect:
            model = SelectKBest(mutual_info_classif, k=nfeat).fit(X, y)
            X = model.transform(X)
            X_test = model.transform(X_test)
        elif 'PCA' in fselect:
            model = PCA(n_components=nfeat).fit(X)
            X = model.transform(X)
            X_test = model.transform(X_test)
        elif 'reg' in fselect:
            model = SelectFpr(f_classif, alpha=a).fit(X, y)
            X = model.transform(X)
            X_test = model.transform(X_test)
        elif 'kbest' in fselect:
            model = SelectKBest(f_classif, k=nfeat).fit(X, y)
            X = model.transform(X)
            X_test = model.transform(X_test)

        # fit model on the (possibly reduced) training fold
        model = classifier.fit(X, y)

        # Compute accuracy for validation set
        y_hat = model.predict(X_test)
        test_accs.append(sum(y_hat == y_test) / len(y_test))

        # Compute accuracy for training set
        y_hat = model.predict(X)
        train_accs.append(sum(y_hat == y) / len(y))

    # One-sample t-tests of the per-fold accuracies against 755/2126
    # (presumably the majority-class base rate -- TODO confirm).
    p_val = stats.ttest_1samp(test_accs, popmean=755 / 2126)[1]
    p_val_tr = stats.ttest_1samp(train_accs, popmean=755 / 2126)[1]

    return (np.mean(test_accs), np.std(test_accs), p_val,
            np.mean(train_accs), np.std(train_accs), p_val_tr)
#f regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
# Score each candidate feature against Sales with the univariate F-test;
# `df` is defined earlier in the file (not visible in this fragment).
x = df.drop(['Sales', 'Sales_Bin'], axis='columns')
y = df.Sales
model = SelectKBest(score_func=f_regression, k=5)
results = model.fit(x, y)
# Bare expressions below are notebook-cell outputs (scores and p-values).
results.scores_
scores = pd.DataFrame(results.scores_, index=x.columns)
results.pvalues_
scores.sort_values(by=0, ascending=True)
'''
최종적으로 r squared 가 0.915인 모델로 선정
'''
# (Korean note above: "the model with R-squared 0.915 was finally chosen.")
#residual plot
# NOTE(review): `model` is still the SelectKBest selector here, which has no
# `.predict`; a fitted regression model was presumably assigned in a missing
# notebook cell between these lines -- confirm before running.
y_pred = model.predict(x)
residual = y - y_pred
std_residual = residual / np.std(residual)
plt.scatter(y_pred, std_residual)
plt.grid()

#remove index 9 record
df = pd.read_excel('cravens.xlsx')
df = df.drop(9)

# Refit OLS on the five retained predictors; sm.add_constant supplies the
# intercept column.
x = df[['Time', 'Poten', 'AdvExp', 'Share', 'Change']]
y = df.Sales
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
model.summary()
# NOTE(review): the triple-quote below opens a string that continues past
# the end of this fragment.
'''