Пример #1
0
def svd_dimensionality_reduction_TruncatedSVD(A, dimension=2):
    """Project ``A`` onto its leading truncated-SVD components.

    Args:
        A: Data matrix handed unchanged to scikit-learn's ``TruncatedSVD``.
        dimension: Number of components to keep (passed as ``n_components``).

    Returns:
        The output of ``TruncatedSVD.transform(A)`` for a model fitted on
        ``A`` itself.
    """
    print("********** scikit-learn provides a TruncatedSVD class *************")
    # Function-scope import: scikit-learn is only required when this runs.
    from sklearn.decomposition import TruncatedSVD

    reducer = TruncatedSVD(n_components=dimension)
    # fit() returns the estimator, so fitting and projecting can be chained.
    return reducer.fit(A).transform(A)
def tSNE(fileName):
    """Reduce a CSV dataset with truncated SVD and visualise it with t-SNE.

    Pipeline:
      1. Load ``fileName``, normalise the column names (lower-case,
         whitespace runs replaced by ``_``) and drop the ``orig_set`` column.
      2. Round-trip the cleaned frame through the ``df_clean`` table of the
         SQLite database ``data/10Feature.db``.
      3. Standardise every column from the third one onwards.
      4. Project the standardised data onto 19 truncated-SVD components,
         plot the cumulative explained-variance ratio and store the
         projection in the ``df_svd`` table.
      5. Draw a pair plot of the first 8 components coloured by ``class``.
      6. Embed the SVD projection in 2-D with t-SNE and draw two scatter
         plots (uncoloured, then coloured by ``class``).

    Args:
        fileName: Path of the CSV file to load. After column-name
            normalisation it must contain a ``class`` column.

    Returns:
        None. The function prints shapes/previews, replaces the ``df_clean``
        and ``df_svd`` tables and shows figures through ``plt.show()``.
    """
    ncomps = 19      # SVD components kept before running t-SNE
    plotdims = 8     # leading SVD components shown in the pair plot
    ploteorows = 1   # row stride for the pair plot (1 == every row)

    cnx = sqlite3.connect('data/10Feature.db')
    try:
        dforigtrain = pd.read_csv(fileName)
        print(dforigtrain.shape)
        print(dforigtrain.head())

        # "Some Column " -> "some_column"
        dforigtrain.rename(
            columns=lambda name: '_'.join(name.lower().split()),
            inplace=True)
        df = dforigtrain[[c for c in dforigtrain.columns if c != 'orig_set']]
        print(df.shape)

        # Write to DB to allow easier loading later
        df.to_sql('df_clean', cnx, if_exists='replace', index=False)
        df = pd.read_sql('select * from df_clean', cnx)
        print(df.shape)

        # The first two columns are left out of scaling.
        # NOTE(review): presumably an identifier and the label - confirm
        # against the CSV layout.
        features = df.iloc[:, 2:]
        dfs = pd.DataFrame(StandardScaler().fit_transform(features),
                           index=df.index,
                           columns=features.columns)

        # During exploration a full SVD of `dfs` was used to plot the
        # singular-value spectrum and count NaN / near-zero singular values.
        #
        # Truncated SVD is mostly helpful for data such as images, where
        # dropping dimensions is usually possible. In this data set every
        # feature was contributing, hence no real truncation is possible.
        svd = TruncatedSVD(algorithm='randomized', n_components=ncomps)
        # A single fit_transform is enough; fitting first and then calling
        # fit_transform would only fit the model twice.
        Y = svd.fit_transform(dfs)

        cumulative_variance = svd.explained_variance_ratio_.cumsum()
        pd.Series(cumulative_variance).plot(kind='line', figsize=(10, 3))
        print('Variance preserved by first {} components == {:.2%}'.format(
            ncomps, cumulative_variance[-1]))
        plt.show()

        dfsvd = pd.DataFrame(Y,
                             columns=['c{}'.format(c) for c in range(ncomps)],
                             index=df.index)
        dfsvd.to_sql('df_svd', cnx, if_exists='replace', index=False)
        dfsvd = pd.read_sql('select * from df_svd', cnx)
        print(dfsvd.shape)

        svdcols = [c for c in dfsvd.columns if c.startswith('c')]

        print(dfsvd.shape)
        print(dfsvd.head())

        # Copy before adding the label column so the assignment does not
        # target a slice of `dfsvd`.
        dfsvdplot = dfsvd[svdcols].iloc[:, :plotdims].copy()
        dfsvdplot['class'] = df['class']

        # NOTE(review): newer seaborn releases renamed `size` to `height`;
        # switch the keyword once the project's seaborn version is confirmed.
        sns.pairplot(dfsvdplot.iloc[::ploteorows, :], hue='class', size=1.8)
        plt.show()

        # A timing experiment over growing row subsets (10 ... 1900 rows)
        # was run here during exploration; the full data set is used now.
        tsne = TSNE(n_components=2, random_state=0)
        Z = tsne.fit_transform(dfsvd[svdcols])
        dftsne = pd.DataFrame(Z, columns=['x', 'y'], index=dfsvd.index)

        scatter_style = {'alpha': 0.7, 's': 60}

        # Keyword arguments work with old and new seaborn alike.
        plain_grid = sns.lmplot(x='x',
                                y='y',
                                data=dftsne,
                                fit_reg=False,
                                size=8,
                                scatter_kws=scatter_style)
        plain_grid.axes.flat[0].set_title(
            'Scatterplot of a 50D dataset reduced to 2D- Unsupervised')

        dftsne['class'] = df['class']

        class_grid = sns.lmplot(x='x',
                                y='y',
                                data=dftsne,
                                hue='class',
                                fit_reg=False,
                                size=8,
                                scatter_kws=scatter_style)
        class_grid.axes.flat[0].set_title(
            'Scatterplot of a 50D dataset reduced to 2D -Supervised')

        # Both scatter plots are shown together.
        plt.show()
    finally:
        # Release the SQLite handle even if loading, fitting or plotting fails.
        cnx.close()
Пример #3
0
              [21, 22, 23, 24, 25, 26, 27, 28, 29, 30]])

# SVD
# NOTE(review): `svd` and `A` are defined before this excerpt; `s` is treated
# below as the sequence of singular values.
U, s, VT = svd(A)

# Decide how many singular values to keep: the target is 85% of their total
# (anything in the 80-90% range is the usual choice).
sumv = 0.85 * np.sum(s)

# Running total of the singular values, to be compared against the target.
cumsumv = np.cumsum(s)


def find_nearest(array, value):
    """Return the element of ``array`` numerically closest to ``value``.

    Args:
        array: Array-like of numbers, of any shape.
        value: Scalar to compare every element against.

    Returns:
        The closest element. On ties, the first one in flattened
        (row-major) order wins.

    Raises:
        ValueError: If ``array`` is empty (raised by ``argmin``).
    """
    # argmin() yields an index into the flattened data, so index the
    # flattened view; indexing the original would be wrong for ndim > 1.
    flat = np.asarray(array).ravel()
    idx = np.abs(flat - value).argmin()
    return flat[idx]


# Cumulative-sum entry closest to the 85% target, and its position(s).
nearests = find_nearest(cumsumv, sumv)

itemindexs = np.where(cumsumv == nearests)
print(itemindexs)

## n_elements = valueof itemindexs
# NOTE(review): n_components below is still hard-coded to 3 rather than
# derived from `itemindexs` - confirm which is intended.

# Use a separate name for the estimator so the `svd` function called above
# is not shadowed and the script can be re-run in the same namespace.
truncated_svd = TruncatedSVD(n_components=3)
truncated_svd.fit(A)
result = truncated_svd.transform(A)

print(result)
Пример #4
0
# transform
# NOTE(review): U, Sigma, VT and A are defined before this excerpt; presumably
# they come from a full SVD of A, in which case both products below should
# print the same matrix - confirm against the preceding code.
# First form: U multiplied by Sigma.
T = U.dot(Sigma)
print(T)
# Second form: A multiplied by the transpose of VT.
T = A.dot(VT.T)
print(T)

from numpy import array
from sklearn.decomposition import TruncatedSVD
# define array: three rows of ten consecutive integers (1-10, 11-20, 21-30)
A = array([list(range(first, first + 10)) for first in (1, 11, 21)])
print(A)

# svd: fit a 2-component truncated SVD on A, then project A with it
svd = TruncatedSVD(n_components=2)
result = svd.fit(A).transform(A)
print(result)

# map the 2-column projection back into the original 10-column space
B = svd.inverse_transform(result)

# a second matrix of the same shape, projected with the model fitted on A
C = array([
    [31, 32, 53, 44, 45, 66, 77, 48, 29, 210],
    [11, 12, 13, 134, 15, 16, 17, 18, 19, 20],
    [21, 22, 23, 24, 25, 126, 27, 28, 29, 30],
])
print(C)

result = svd.transform(C)
print(result)

# reconstruction of C from its projection (computed but not printed)
B = svd.inverse_transform(result)
Пример #5
0
    from scipy.linalg import svd
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    # train = "../../test/03-train-input.txt"
    train = "../../data/titles-en-train.labeled"
    test = "../../data/titles-en-test.labeled"
    train_X, train_y, train_v = load_data(train)
    test_X, test_y, test_v = load_data(test)
    # cv = CountVectorizer()
    # cv.fit(list(map(lambda x: " ".join(x), train_X)))
    # train_X, test_X = get_cntvec(cv, train_X), get_cntvec(cv, train_X)
    v = TfidfVectorizer(max_df=0.8)
    v.fit(list(map(lambda x: " ".join(x), train_X)))
    train_X, test_X = get_tdidf(v, train_X), get_tdidf(v, test_X)
    svd = TruncatedSVD(n_components=200, random_state=3939)
    svd.fit(train_X)
    train_X, test_X = get_reduced(svd, train_X), get_reduced(svd, test_X)
    train_X, train_y = np.array(train_X).astype(
        np.float32), np.array(train_y).astype(np.int32)
    test_X, test_y = np.array(test_X).astype(
        np.float32), np.array(test_y).astype(np.int32)
    # print(train_X, train_y)
    model = MLP(len(train_X[0]), len(train_X[0]) // 2, 1)
    # optimizer = SGD(lr=0.01)
    optimizer = Adam(lr=0.01)
    batch_size = 32
    max_epoch = 50
    train_model(model, optimizer, train_X, train_y, batch_size, max_epoch)
    pred_y = model.predict(test_X)
    pred_y = np.tanh(pred_y)
    print(accuracy_score(test_y, activate(pred_y)))