Example No. 1
def get_embedding(X, y, type_embeding):
    n_neighbors = 30
    X_projected = None

    if type_embeding == "Random":
        rp = random_projection.SparseRandomProjection(n_components=2,
                                                      random_state=42)
        X_projected = rp.fit_transform(X)

    elif type_embeding == "PCA":
        X_projected = decomposition.TruncatedSVD(n_components=2).fit_transform(
            X)

    elif type_embeding == "LDA":
        X2 = X.copy()
        X2.flat[::X.shape[1] + 1] += 0.01  # Make X invertible
        X_projected = discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=2).fit_transform(X2, y)

    elif type_embeding == "Isomap":
        X_projected = manifold.Isomap(n_neighbors=n_neighbors,
                                      n_components=2).fit_transform(X)

    elif type_embeding == "LLE":
        clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                              n_components=2,
                                              method='standard')
        X_projected = clf.fit_transform(X)

    elif type_embeding == "mLLE":
        clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                              n_components=2,
                                              method='modified')
        X_projected = clf.fit_transform(X)

    elif type_embeding == "hLLE":
        clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                              n_components=2,
                                              method='hessian')
        X_projected = clf.fit_transform(X)

    elif type_embeding == "ltsa":
        clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                              n_components=2,
                                              method='ltsa')
        X_projected = clf.fit_transform(X)

    elif type_embeding == "MDS":
        clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
        X_projected = clf.fit_transform(X)

    elif type_embeding == "RF":
        hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
                                               max_depth=5)

        X_transformed = hasher.fit_transform(X)
        pca = decomposition.TruncatedSVD(n_components=2)
        X_projected = pca.fit_transform(X_transformed)

    elif type_embeding == "Spectral":
        embedder = manifold.SpectralEmbedding(n_components=2, random_state=0,
                                              eigen_solver="arpack")
        X_projected = embedder.fit_transform(X)

    elif type_embeding == "T-SNE":
        tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
        X_projected = tsne.fit_transform(X)
    else:
        print("""Valid options are:
         Random    => Random Projections
         PCA       => Principal Component Analysis
         LDA       => Linear Discriminant Analysis
         Isomap    => Isomap
         LLE       => Locally Linear Embedding
         mLLE      => Modified Locally Linear Embedding
         hLLE      => Hessian Locally Linear Embedding
         ltsa      => Locally Linear Embedding (ltsa)
         MDS       => Multidimensional Scaling
         RF        => Random Forest Embedding
         Spectral  => Spectral Embedding
         T-SNE     => T-SNE    """)

    return X_projected
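
# A minimal usage sketch for get_embedding, assuming the sklearn submodules
# referenced above (random_projection, decomposition, discriminant_analysis,
# manifold, ensemble) are imported and the digits dataset stands in for X, y:
from sklearn import datasets

digits = datasets.load_digits(n_class=6)
X_iso = get_embedding(digits.data, digits.target, "Isomap")  # shape (n_samples, 2)
X_svd = get_embedding(digits.data, digits.target, "PCA")  # TruncatedSVD projection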
Example No. 2
corpus.loc[:, "review"] = corpus.review.apply(clean_text)

# collect only the text in review column
corpus = corpus.review.values

# initialize TfidfVectorizer
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)

# fit the vectorizer to corpus
tfv.fit(corpus)

# transform the corpus using tfidf
corpus_transformed = tfv.transform(corpus)

# initialize SVD with 10 components
svd = decomposition.TruncatedSVD(n_components=10)

# fit SVD
corpus_svd = svd.fit(corpus_transformed)

# choose the first SVD component and create a dictionary
# of feature names and their scores for it
# you can change the sample_index variable to
# get the dictionary for any other component
sample_index = 0
feature_scores = dict(
    zip(tfv.get_feature_names(), corpus_svd.components_[sample_index]))

# once we have the dictionary, we can now
# sort it in decreasing order and get the
# top N topics
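# a possible continuation (N and the sorting below are illustrative, not part
# of the original snippet): keep the N terms with the highest component scores
N = 5
top_terms = sorted(feature_scores, key=feature_scores.get, reverse=True)[:N]
print(top_terms)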
Example No. 3
import numpy as np
from sklearn import cluster, decomposition

n = 300000
X = np.fromfile('train_feat', dtype=np.float32,
                count=n * 2048).reshape(-1, 2048)
X_test = np.fromfile('test_feat', dtype=np.float32,
                     count=500000 * 2048).reshape(-1, 2048)
print(X.shape, X_test.shape)
#XX = np.concatenate([X, X_test])
#print (XX.shape)
mean = X.mean(axis=0)
print(mean.shape)
X -= mean
X_test -= mean

pca = decomposition.TruncatedSVD(n_components=128)

pca.fit(X)
X = pca.transform(X).astype(np.float32)
X.tofile('train_feat_128')
X_test = pca.transform(X_test).astype(np.float32)
X_test.tofile('test_feat_128')
#print (X.shape)
#
#print (pca.explained_variance_ratio_.sum())
#print (pca.components_.dtype)
#
#print (pca.components_.shape)
#pca.mean_.tofile('mean')
#pca.components_.tofile('comp')
Example No. 4
    fit_train_transform_result = pca.fit_transform(x_train_tfidf.toarray())
    fit_test_transform_result = pca.transform(x_test_tfidf.toarray())
    fit_train_transform_result.shape
    fit_test_transform_result.shape
    run_model(fit_train_transform_result, y_train, fit_test_transform_result,
              y_test)

    pca = decomposition.PCA(n_components=i)
    fit_train_transform_result = pca.fit_transform(x_train_extended)
    fit_test_transform_result = pca.transform(x_test_extended)
    fit_train_transform_result.shape
    fit_test_transform_result.shape
    run_model(fit_train_transform_result, y_train, fit_test_transform_result,
              y_test)

# trunc SVD #
from sklearn import decomposition, metrics, model_selection
import numpy as np

ks = [10, 20, 30, 50, 75, 100, 150, 250, 300]
ttrain = [x_train_tfidf, x_train_extended]
ttest = [x_test_tfidf, x_test_extended]

for tt in range(2):
    for kk in ks:
        tSVD = decomposition.TruncatedSVD(n_components=kk, random_state=2017)
        xt_train = tSVD.fit_transform(ttrain[tt])
        xt_test = tSVD.transform(ttest[tt])

        run_model(xt_train, y_train, xt_test, y_test)
Example No. 5
print(f1_score(yvalid, predictions, average=None))
print("Accuracy")
print(accuracy_score(yvalid, predictions))
# Fitting a simple Naive Bayes on Counts
clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict(xvalid_ctv)
print("f1 Score")
print(f1_score(yvalid, predictions, average='macro'))
print("f1 Score Individual")
print(f1_score(yvalid, predictions, average=None))
print("Accuracy")
print(accuracy_score(yvalid, predictions))

# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

# Fitting a simple SVM
clf = SVC(C=1.0)
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict(xvalid_svd_scl)
print("f1 Score")
Example No. 6
                                               max_depth=5).fit_transform(data)
    print_results("tree kmeans", labels, compute_kmeans(tree_embedding_data))

    0 / 0
    srp_emb_data = random_projection.SparseRandomProjection(
        n_components=20, random_state=42).fit_transform(data)
    print_results("SparseRandomProjection kmeans", labels,
                  compute_kmeans(srp_emb_data))

    iso_emb_data = manifold.Isomap(30, n_components=2).fit_transform(data)
    print_results("iso kmeans", labels, compute_kmeans(iso_emb_data))

    # lle_emb_data = manifold.LocallyLinearEmbedding(10, n_components=2, method='ltsa').fit_transform(data)
    # print_results("lle kmeans", labels, compute_kmeans(lle_emb_data))

    svd = decomposition.TruncatedSVD(n_components=2).fit_transform(data)
    print_results("svd kmeans", labels, compute_kmeans(svd))

    tsne_emb_data = manifold.TSNE(n_components=3, init="pca",
                                  random_state=17).fit_transform(data)
    print_results("tsne kmeans", labels, compute_kmeans(tsne_emb_data))

    0 / 0

    # print_results("tree kmeans+rdc", labels, compute_kmeans_rdc(tree_embedding_data))

    mds_embedding_data = manifold.MDS(n_components=10, n_init=1,
                                      max_iter=100).fit_transform(data)
    print_results("mds kmeans", labels, compute_kmeans(mds_embedding_data))
    print_results("mds kmeans+rdc", labels,
                  compute_kmeans_rdc(mds_embedding_data))
Example No. 7
    def __init__(self, **kwargs):
        super().__init__()
        self.truncated_svd = decomposition.TruncatedSVD(**kwargs)
Example No. 8
def main():
    class name:
        """ algebra of LaTeX names of representations and transformations """
        def __init__(self, n, is_compound=False):
            self.n = n
            self.is_compound = is_compound

        def bracket(self):
            if not self.is_compound:
                return self
            return name("$(" + self.n.strip("$") + ")$",
                        is_compound=self.is_compound)

        def __mul__(self, other):
            return name(self.n.rstrip("$") + "\cdot" + other.n.lstrip("$"),
                        is_compound=True)

        def __or__(self, other):
            return name(self.bracket().n.rstrip("$") + " | " +
                        other.bracket().n.lstrip("$"),
                        is_compound=False)

        def __call__(self, *params):
            return name(self.n % params, is_compound=self.is_compound)

        def __hash__(self):
            return hash(self.n)

        def __eq__(self, other):
            return self.n == other.n

        def __ne__(self, other):
            return not self.__eq__(other)

        def __str__(self):
            return self.n

        def __repr__(self):
            return "name({})".format(self.n)

    class r:
        bow = name("$\mathrm{Bow}$")
        bow_norm = name("$\mathrm{BowNorm}$")
        sentiment = name("$\mathrm{Sentiment}$")
        ratios = name("$\mathrm{Ratios}$")
        svd = name("$\mathrm{SVD}(%s)$")
        lda = name("$\mathrm{LDA}(%s)$")
        diff = name("$\mathrm{Diff}$")
        ewm = name("$\mathrm{Ewm}(%s)$")
        user_stats = name("$\mathrm{UserStats}$")
        day = name("$\mathrm{Day}$")
        returns = name("$\mathrm{Returns}_t$")
        speed = name("$\mathrm{Speed}$")

    class c:
        lr = name("$\mathrm{LR}$")
        xgb = name("$\mathrm{XGB}$")

    class pair:
        def __init__(self, clf, rep):
            self.clf = clf
            self.rep = rep
            self.str = "$\\langle {}, {} \\rangle$".format(
                self.clf.n.strip("$"), self.rep.n.strip("$"))

        def __str__(self):
            return self.str

        def __repr__(self):
            return str(self)

        def __float__(self):
            raise ValueError

        def __hash__(self):
            return hash((self.clf, self.rep))

        def __eq__(self, other):
            return (self.clf, self.rep) == (other.clf, other.rep)

        def __ne__(self, other):
            return not self.__eq__(other)

    df_price = models.load_price()
    df_ug, widx = models.load_unigrams()

    reps = {}

    reps[r.bow] = df_ug
    reps[r.bow_norm] = np.log1p(df_ug).div(np.log1p(df_ug).apply(
        np.linalg.norm, axis=1),
                                           axis=0)
    reps[r.bow_norm * r.diff] = reps[r.bow_norm].diff().dropna()
    reps[r.bow_norm * r.svd(32)] = models.sklearn_transform_in_sample(
        decomp.TruncatedSVD(n_components=32), reps[r.bow_norm])
    reps[r.bow_norm * r.diff * r.svd(32)] = models.sklearn_transform_in_sample(
        decomp.TruncatedSVD(n_components=32), reps[r.bow_norm * r.diff])
    reps[r.bow_norm * r.svd(32) * r.ewm(55)] = reps[r.bow_norm *
                                                    r.svd(32)].ewm(55).mean()
    reps[r.bow_norm * r.diff * r.svd(32) *
         r.ewm(120)] = reps[r.bow_norm * r.diff * r.svd(32)].ewm(120).mean()

    reps[r.ratios] = models.Ratios.load().get()
    reps[r.ratios * r.diff] = reps[r.ratios].diff().dropna()
    reps[r.ratios * r.svd(32)] = models.sklearn_transform_in_sample(
        decomp.TruncatedSVD(n_components=32), reps[r.ratios])
    reps[r.ratios * r.diff * r.svd(32)] = models.sklearn_transform_in_sample(
        decomp.TruncatedSVD(n_components=32), reps[r.ratios * r.diff])
    reps[r.ratios * r.svd(32) * r.ewm(30)] = reps[r.ratios *
                                                  r.svd(32)].ewm(30).mean()

    reps[r.sentiment] = models.load_sentiment().fillna(0)
    reps[r.sentiment * r.diff] = reps[r.sentiment].diff()
    reps[r.sentiment * r.ewm(90)] = reps[r.sentiment].ewm(90).mean()
    reps[r.sentiment | (r.sentiment * r.ewm(90))] = util.join(
        reps[r.sentiment], reps[r.sentiment * r.ewm(90)])

    reps[r.lda(50)] = df_lda  #models.load_lda()
    reps[r.lda(50) * r.ewm(90)] = reps[r.lda(50)].ewm(90).mean()
    reps[(r.lda(50) * r.ewm(90)) | r.lda(50)] = util.join(
        reps[r.lda(50)], reps[r.lda(50) * r.ewm(90)])

    df_user_stats = np.log1p(models.load_user_stats())
    reps[r.user_stats] = df_user_stats - df_user_stats.shift(56)

    df_time = pd.DataFrame(dict(time=np.arange(df_price.shape[0])),
                           index=df_price.index)

    reps[(r.lda(50) * r.ewm(90)) | (r.ratios * r.svd(32)) |
         (r.sentiment * r.ewm(90)) | r.day] = util.join(
             reps[r.lda(50) * r.ewm(90)], reps[r.ratios * r.svd(32)],
             reps[r.sentiment * r.ewm(90)], df_time)

    reps[(r.lda(50) * r.ewm(90)) | (r.ratios * r.svd(32)) |
         (r.sentiment * r.ewm(90)) | r.user_stats | r.day] = util.join(
             reps[r.lda(50) * r.ewm(90)], reps[r.ratios * r.svd(32)],
             reps[r.sentiment * r.ewm(90)], reps[r.user_stats], df_time)

    new_rep = {}
    new_rep[(r.lda(50) * r.ewm(90)) | (r.ratios * r.svd(32)) |
            (r.sentiment * r.ewm(90)) | r.user_stats | r.returns
            | r.day] = util.join(reps[r.lda(50) * r.ewm(90)],
                                 reps[r.ratios * r.svd(32)],
                                 reps[r.sentiment * r.ewm(90)],
                                 reps[r.user_stats],
                                 df_price[["log_return", "up_down"]], df_time)

    # df_ratios_speed = reps[r.ratios * r.svd(32)]\
    #                   .diff()\
    #                   .dropna()\
    #                   .apply(np.linalg.norm, axis=1)\
    #                   .to_frame("ratio_speed")
    # df_svd_speed = reps[r.bow_norm * r.svd(32)]\
    #                .diff()\
    #                .dropna()\
    #                .apply(np.linalg.norm, axis=1)\
    #                .to_frame("svd_speed")

    in_sample = slice(data_config.date_begin, data_config.date_is_end)
    in_sample_recent = slice(data_config.date_turning_point,
                             data_config.date_is_end)
    out_of_sample = slice(data_config.date_oos_begin,
                          data_config.date_oos_end - dt.timedelta(days=1))

    df_price_is, df_price_oos = df_price.ix[in_sample], df_price.ix[
        out_of_sample]

    clfs = {
        c.lr:
        lambda: lm.LogisticRegression(),
        c.xgb:
        lambda: xgboost.XGBClassifier(reg_lambda=2, max_depth=3, subsample=.5),
    }

    results_cv = []
    for rep_name, rep in sorted(new_rep.items(), key=lambda x: x[0]):
        for clf_name, clf_f in clfs.items():
            if rep_name == r.bow_norm or rep_name == r.bow_norm * r.diff or rep_name == r.bow and clf_name == c.xgb:
                continue
            print(rep_name.n, clf_name.n)
            acc, roc_auc, matthews = zip(*training.cv_test(
                clf_f(), rep.ix[in_sample], df_price_is, k=100))
            df_results = pd.DataFrame(
                dict(acc=acc, roc_auc=roc_auc, matthews=matthews))
            df_results["rep"] = rep_name
            df_results["clf"] = clf_name
            results_cv.append(df_results)

    results_cv_all = pd.concat(results_cv)

    results_cv_all["rep_clf_name"] = [
        pair(a, b) for a, b in zip(results_cv_all.clf, results_cv_all.rep)
    ]

    rank_by = "acc"
    top_3_cv = results_cv_all\
               .query("rep != 'user_stats'")\
               .groupby("rep_clf_name")\
               .quantile(.1)\
               .sort_values(rank_by, ascending=False)\
               .head(3)

    bottom_3_cv = results_cv_all\
               .query("rep != 'user_stats'")\
               .groupby("rep_clf_name")\
               .quantile(.1)\
               .sort_values(rank_by)\
               .head(3)

    plots.plot_best_worst_cv(results_cv_all, top_3_cv, bottom_3_cv, rank_by)

    df_soft_returns = pd.DataFrame(index=df_price_oos.index)
    df_hard_returns = pd.DataFrame(index=df_price_oos.index)

    results_oos = []

    for pair_clf_rep in top_3_cv.index.values:
        rep, clf_f = reps[pair_clf_rep.rep], clfs[pair_clf_rep.clf]
        pair_name = str(pair_clf_rep)
        [acc, roc_auc,
         matthews], [realized_returns_hard, realized_returns_soft
                     ] = training.oos_test(clf_f(), rep.ix[in_sample],
                                           rep.ix[out_of_sample], df_price_is,
                                           df_price_oos)

        results_oos.append([pair_name, acc, roc_auc, matthews])

        df_soft_returns[pair_name] = realized_returns_soft
        df_hard_returns[pair_name] = realized_returns_hard

    results_oos_all = pd.DataFrame(
        results_oos, columns=["rep", "acc", "roc_auc", "matthews"])

    oos_sharpe = (df_hard_returns.diff().mean() /
                  df_hard_returns.diff().std()).to_frame("sharpe")
    oos_log_return = df_hard_returns.ix[-1].to_frame("log_return")

    df_random_classifier = random_returns.random_returns_stats(df_price_oos,
                                                               n=1000)
    df_buy_hold = pd.DataFrame(
        dict(log_return=[df_price_oos.log_return.sum()],
             sharpe=[
                 df_price_oos.log_return.mean() /
                 df_price_oos.log_return.std()
             ]))
    df_oos = results_oos_all.join(oos_sharpe.join(oos_log_return), on="rep")

    plots.plot_returns(df_price_oos, df_hard_returns)
    random_returns.plot_random_returns(
        df_price_oos, random_returns.random_returns_stats(df_price_oos))
Example No. 9
Height, Width = Base_img.shape[0], Base_img.shape[1]

f_original = open(Picture)
f_original.seek(0, os.SEEK_END)
Base_weight = f_original.tell()
f_original.close()

Range = int(
    Max_conversion_rate * Height /
    100) if int(Max_conversion_rate * Height /
                100) < Width else Width - 1 if Height == Width else Width

RGB_Encoded_array = []
RGB_Compressed_array = []

svd = decomposition.TruncatedSVD(n_components=Range if Range != 0 else 1)
for RGB_each_array in range(3):
    RGB = Base_img[:, :, RGB_each_array]
    RGB_simple_array = svd.fit_transform(RGB)
    RGB_Compressed_array.append(RGB_simple_array)
    RGB_Encoded_array.append((svd.inverse_transform(RGB_simple_array)))

if Normalization == 1:
    Image_encoded = normalization(np.dstack(RGB_Encoded_array))
    Image_coded = normalization(np.dstack(RGB_Compressed_array))
    RGB_Encoded_array = normalization(RGB_Encoded_array)
else:
    Image_encoded = np.dstack(RGB_Encoded_array)
    Image_coded = np.dstack(RGB_Compressed_array)
    RGB_Encoded_array = RGB_Encoded_array
Example No. 10
def plot_other_manifold(X, y, n_neighbors, n_estimators=200,
                        max_depth=5, random_state=0):
    # ----------------------------------------------------------------------
    # Modified Locally linear embedding of the digits dataset
    print("Computing modified LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                          method='modified')
    t0 = time()
    X_mlle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_mlle, y,
                   "Modified Locally Linear Embedding of the digits (time %.2fs)" %
                   (time() - t0))

    # -------------------------------------------------------------
    # HLLE embedding of the digits dataset
    print("Computing Hessian LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                          method='hessian')
    t0 = time()
    X_hlle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_hlle, y,
                   "Hessian Locally Linear Embedding of the digits (time %.2fs)" %
                   (time() - t0))

    # --------------------------------------------------------------------
    # LTSA embedding of the digits dataset
    print("Computing LTSA embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                          method='ltsa')
    t0 = time()
    X_ltsa = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_ltsa, y,
                   "Local Tangent Space Alignment of the digits (time %.2fs)" %
                   (time() - t0))

    # ----------------------------------------------------------------------
    # Random Trees embedding of the digits dataset
    print("Computing Totally Random Trees embedding")
    hasher = ensemble.RandomTreesEmbedding(n_estimators=n_estimators,
                                           random_state=random_state,
                                           max_depth=max_depth)
    t0 = time()
    X_transformed = hasher.fit_transform(X)
    pca = decomposition.TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)

    plot_embedding(X_reduced, y,
                   "Random forest embedding of the digits (time %.2fs)" %
                   (time() - t0))

    # ----------------------------------------------------------------------
    # Spectral embedding of the digits dataset
    print("Computing Spectral embedding")
    embedder = manifold.SpectralEmbedding(n_components=2,
                                          random_state=random_state,
                                          eigen_solver="arpack")
    t0 = time()
    X_se = embedder.fit_transform(X)

    plot_embedding(X_se, y,
                   "Spectral embedding of the digits (time %.2fs)" %
                   (time() - t0))
Example No. 11
#%% Tests regarding flameplot
import flameplot as flameplot
import numpy as np
from sklearn import (manifold, decomposition)

# %% Load data
X, y = flameplot.import_example()

# %% PCA
X_pca_50 = decomposition.TruncatedSVD(n_components=50).fit_transform(X)
X_pca_2 = decomposition.TruncatedSVD(n_components=2).fit_transform(X)
# tSNE
X_tsne = manifold.TSNE(n_components=2, init='pca').fit_transform(X)
# Random
X_rand = np.c_[np.random.permutation(X_pca_2[:, 0]),
               np.random.permutation(X_pca_2[:, 1])]

# %% Scatter
flameplot.scatter(X_pca_2[:, 0], X_pca_2[:, 1], label=y, title='PCA')
flameplot.scatter(X_tsne[:, 0], X_tsne[:, 1], label=y, title='tSNE')
flameplot.scatter(X_rand[:, 0], X_rand[:, 1], label=y, title='Random')

# %% Compare PCA(50) vs. tSNE
scores = flameplot.compare(X_pca_50, X_tsne, n_steps=25)
fig = flameplot.plot(scores, xlabel='PCA (50d)', ylabel='tSNE (2d)')
# Compare PCA(2) vs. tSNE
scores = flameplot.compare(X_pca_2, X_tsne, n_steps=25)
fig = flameplot.plot(scores, xlabel='PCA (2d)', ylabel='tSNE (2d)')
# Compare random vs. tSNE
scores = flameplot.compare(X_rand, X_tsne, n_steps=25)
fig = flameplot.plot(scores, xlabel='Random (2d)', ylabel='tSNE (2d)')
Example No. 12
def generate_transformers(x,
                          dataset,
                          global_dir,
                          min_variance=10,
                          additional_scale_tsvd=1):
    """
  This function returns a dictionary with callables for a given dataset.
  """

    transform_functions = {
        'vae': (lambda x: transform_vae(x, VAE_net)),
        'pca': (lambda x: transform_pca(x, pca, var_pca)),
        'tsvd': (lambda x: transform_tsvd(x, tsvd)),
        'kpca': (lambda x: transform_kpca(x, kpca)),
        'spca': (lambda x: transform_spca(x, spca)),
        'iso': (lambda x: transform_iso(x, iso)),
        'lle': (lambda x: transform_lle(x, lle)),
    }
    """
  Note that below, we could have dynamically generated most transformer
  functions. However, doing so would make the code harder to follow, and we
  do not need to optimize for efficiency here, while we do need to preserve
  readability.
  """

    ################ Regular PCA ################

    pca = decomposition.PCA(n_components=2)
    var_pca = np.var(pca.fit_transform(
        x))  # We do this in one call, since we don't need latent_X for now

    # print(np.sum(pca.explained_variance_ratio_)) # Could be interesting to explain results with

    def transform_pca(x, pca, var_pca):
        return np.matmul(x, np.transpose(
            pca.components_)) / math.sqrt(var_pca) * math.sqrt(min_variance)

    ################ Truncated SVD ################
    tsvd = decomposition.TruncatedSVD(n_components=2,
                                      n_iter=7,
                                      random_state=42)
    var_tsvd = np.var(tsvd.fit_transform(x))

    def transform_tsvd(x, tsvd):
        return np.matmul(x, np.transpose(tsvd.components_)) / math.sqrt(
            var_tsvd) * math.sqrt(min_variance) * additional_scale_tsvd

    ################ Kernel PCA ################
    kpca = decomposition.KernelPCA(n_components=2,
                                   kernel="sigmoid",
                                   fit_inverse_transform=True,
                                   gamma=None,
                                   random_state=42)
    var_kpca = np.var(kpca.fit_transform(x))

    if 0. in kpca.lambdas_:  # KPCA with Sigmoid kernel does not work for this set
        del transform_functions['kpca']

    def transform_kpca(x, kpca):
        x = np.array(x)
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        return kpca.transform(x) / math.sqrt(var_kpca) * math.sqrt(
            min_variance)

    ################ Sparse PCA ################
    spca = decomposition.SparsePCA(n_components=2,
                                   alpha=0.0001,
                                   random_state=42,
                                   n_jobs=-1)
    var_spca = np.var(spca.fit_transform(x))

    def transform_spca(x, spca):
        return np.matmul(x, np.transpose(
            spca.components_)) / math.sqrt(var_spca) * math.sqrt(min_variance)

    ################ ISO ################
    iso = manifold.Isomap(n_neighbors=8, n_components=2, eigen_solver='dense')
    var_iso = np.var(iso.fit_transform(x))

    def transform_iso(x, iso):
        x = np.array(x)
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        return iso.transform(x) / math.sqrt(var_iso) * math.sqrt(min_variance)

    ################ LLE ################
    lle = manifold.LocallyLinearEmbedding(n_neighbors=8,
                                          n_components=2,
                                          eigen_solver='dense')
    var_lle = np.var(lle.fit_transform(x))

    def transform_lle(x, lle):
        x = np.array(x)
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        return lle.transform(x) / math.sqrt(var_lle) * math.sqrt(min_variance)

    ################ SCVIS VAE ################
    VAE_save_file = global_dir + "/results/vae_models/" + dataset + ".pt"

    if not os.path.isfile(VAE_save_file):
        # Auto-encoder needs to be trained on the model first
        print('Training new VAE model on %s dataset' % dataset)
        trainVAE(
            x, global_dir, dataset
        )  # normalizing using np.max(np.abs(x)) not necessary as it equals 1

    # Once trained, it loads the existing model, also for reproducibility
    VAE_model = torch.load(VAE_save_file)['model_state_dict']

    print('Loaded VAE model for %s dataset' % dataset)

    VAE_net = VAE(input_dim=x.shape[1], latent_dim=2)
    VAE_net.load_state_dict(VAE_model)
    VAE_net.eval()

    def transform_vae(x, VAE_net):
        x = np.array(x)
        if len(x.shape) == 1:
            x = x.reshape(1, -1)

        with torch.no_grad():
            x_batch = torch.from_numpy(x).float()
            encoder_mu, encoder_log_var = VAE_net.encoder(x_batch, p=1.0)
            batch_z = VAE_net.sampling(encoder_mu,
                                       encoder_log_var,
                                       batch_size=len(x),
                                       eval=True).numpy()

        return np.array(batch_z, dtype=float)

    return transform_functions
Example No. 13
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # Having `:` before the search list asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO maybe add a regular expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(), feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
Example No. 14
def test_singular_values():
    # Check that the TruncatedSVD output has the correct singular values

    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = rng.randn(n_samples, n_features)
    dX = da.from_array(X, chunks=(n_samples // 2, n_features))

    apca = dd.TruncatedSVD(n_components=2, algorithm="tsqr",
                           random_state=rng).fit(dX)
    rpca = sd.TruncatedSVD(n_components=2,
                           algorithm="arpack",
                           random_state=rng).fit(X)
    assert_array_almost_equal(apca.singular_values_, rpca.singular_values_, 12)

    # Compare to the Frobenius norm
    X_apca = apca.transform(X)
    X_rpca = rpca.transform(X)
    assert_array_almost_equal(np.sum(apca.singular_values_**2.0),
                              np.linalg.norm(X_apca, "fro")**2.0, 12)
    assert_array_almost_equal(np.sum(rpca.singular_values_**2.0),
                              np.linalg.norm(X_rpca, "fro")**2.0, 12)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(apca.singular_values_,
                              np.sqrt(np.sum(X_apca**2.0, axis=0)), 12)
    assert_array_almost_equal(rpca.singular_values_,
                              np.sqrt(np.sum(X_rpca**2.0, axis=0)), 12)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = rng.randn(n_samples, n_features)
    dX = da.from_array(X, chunks=(50, n_features))

    apca = dd.TruncatedSVD(n_components=3,
                           algorithm="randomized",
                           random_state=0)
    rpca = sd.TruncatedSVD(n_components=3,
                           algorithm="randomized",
                           random_state=0)
    X_apca = apca.fit_transform(dX).compute()
    X_rpca = rpca.fit_transform(X)

    X_apca /= np.sqrt(np.sum(X_apca**2.0, axis=0))
    X_rpca /= np.sqrt(np.sum(X_rpca**2.0, axis=0))
    X_apca[:, 0] *= 3.142
    X_apca[:, 1] *= 2.718
    X_rpca[:, 0] *= 3.142
    X_rpca[:, 1] *= 2.718

    X_hat_apca = np.dot(X_apca, apca.components_)
    X_hat_rpca = np.dot(X_rpca, rpca.components_)
    apca.fit(da.from_array(X_hat_apca, chunks=(50, n_features)))
    rpca.fit(X_hat_rpca)
    assert_array_almost_equal(apca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(rpca.singular_values_, [3.142, 2.718, 1.0], 14)
Example No. 15
def find_components(input_data, n_components=2, method='pca', **kwargs):
    '''
    Extract components from an array of data
    
    input_data: np.array
        The input data matrix
    
    n_components : int
        The number of components to extract
    
    method : str
        The dimensionality reduction technique to use
        
    kwargs : optional arguments to pass to the construction of the estimator
        
    Note: this function is basically a wrapper for a bunch of the 
    standard estimators found in sklearn. Please refer to the sklearn documentation
    for the keyword arguments to pass to the various estimators.
    http://scikit-learn.org/stable/modules/decomposition.html
    
    Examples
    --------
    
    >>>components = find_components(data,method='k-means',tol=1e-3, batch_size=100, max_iter=50)
    >>>plot(components.T)
    
    >>>components = find_components(copy(all_traces.T),method='incremental pca',batch_size=100)
    >>>plot(components.T)
    
    DEV:
     Automatically compute the batch sizes for incremental and minibatch PCA
    "The computational overhead of each SVD is O(batch_size * n_features ** 2), 
    but only 2 * batch_size samples remain in memory at a time. There will be n_samples / batch_size SVD 
    computations to get the principal components, versus 1 large SVD of complexity O(n_samples * n_features ** 2) 
    for PCA."
    
    Issues
    ------

    LDA returns components that are not orthogonal; need to apply Gram-Schmidt
    
    Kernel PCA is currently broken, also LDA is erratic
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html#sklearn.decomposition.KernelPCA
    

    '''

    if not has_sklearn:
        warnings.warn(
            'scikit-learn not found. This function will not work correctly.')
        return None

    n_samples, n_features = input_data.shape
    rng = RandomState(0)

    if n_samples < n_features:
        warnings.warn(
            'More features than samples; assuming input data matrix was transposed.'
        )
        input_data = input_data.T
        n_samples, n_features = n_features, n_samples

    if method in [
            'pca', 'ica', 'k-means', 'incremental pca', 'kernel pca',
            'random pca'
    ]:
        data = input_data - input_data.mean(axis=0)
    else:
        data = input_data

    if method == 'pca':
        estimator = decomposition.PCA(n_components=n_components, **kwargs)
    elif method == 'k-means':
        estimator = MiniBatchKMeans(n_clusters=n_components,
                                    random_state=rng,
                                    **kwargs)
    elif method == 'ica':
        estimator = decomposition.FastICA(n_components=n_components,
                                          whiten=True,
                                          **kwargs)
    elif method == 'incremental pca':
        estimator = decomposition.IncrementalPCA(n_components=n_components,
                                                 whiten=True)
    elif method == 'svd':
        estimator = decomposition.TruncatedSVD(n_components=n_components,
                                               random_state=rng,
                                               **kwargs)
    elif method == 'kernel pca':
        estimator = decomposition.KernelPCA(n_components=n_components,
                                            **kwargs)
    elif method == 'lda':
        estimator = decomposition.LatentDirichletAllocation(
            n_topics=n_components, random_state=rng, **kwargs)
        data = data + abs(min(ravel(data)))
    elif method == 'random pca':
        estimator = decomposition.RandomizedPCA(n_components=n_components,
                                                whiten=True,
                                                **kwargs)
    else:
        warnings.warn('Unknown \'method\' argument given; falling back to PCA')
        estimator = decomposition.PCA(n_components=n_components)

    estimator.fit(data)

    if hasattr(estimator, 'cluster_centers_'):
        components = estimator.cluster_centers_
    else:
        components = estimator.components_

    return components
Example No. 16
print('Pipeline...')
fp = pipeline.Pipeline([(
    'union',
    pipeline.FeatureUnion(
        n_jobs=-1,
        transformer_list=[
            ('standard', cust_regression_vals()),
            ('pi1',
             pipeline.Pipeline([
                 ('Gene', cust_txt_col('Gene')),
                 ('count_Gene',
                  feature_extraction.text.CountVectorizer(analyzer=u'char',
                                                          ngram_range=(1, 8))),
                 ('tsvd1',
                  decomposition.TruncatedSVD(n_components=20,
                                             n_iter=30,
                                             random_state=12))
             ])),
            ('pi2',
             pipeline.Pipeline([
                 ('Variation', cust_txt_col('Variation')),
                 ('count_Variation',
                  feature_extraction.text.CountVectorizer(analyzer=u'char',
                                                          ngram_range=(1, 8))),
                 ('tsvd2',
                  decomposition.TruncatedSVD(n_components=20,
                                             n_iter=30,
                                             random_state=12))
             ])),
            # commented for Kaggle Limits
            ('pi3',
Example No. 17
    def as_local(self):
        target = sk_dec.TruncatedSVD(self.n_components, algorithm=self.algorithm,
                                     n_iter=self.n_iter,
                                     random_state=self.random_state, tol=self.tol)
        copy_attrs_as_local(self, target, 'components_', 'explained_variance_',
                            'explained_variance_ratio_', 'singular_values_')
        return target
Example No. 18
    def relationPrediction(self, inputDir):
        print("Model may take at least 20 mins for predictions..")
        # Read the user-like Data..
        relationDataFrame = pd.read_csv("/data/training/relation/relation.csv")

        # Get Count of No.of likes to each page..
        counts = relationDataFrame['like_id'].value_counts()

        # Filter out the least liked and Most liked pages..
        filteredFrame = relationDataFrame[relationDataFrame['like_id'].isin(
            counts[counts > 25].index)]
        filteredFrame = filteredFrame[filteredFrame['like_id'].isin(
            counts[counts < 1000].index)]

        # Read the test data..
        relationTestDataFrame = pd.read_csv(inputDir +
                                            "/relation/relation.csv")

        # Get unique like id's from test data..
        uniqueLikeIds = relationTestDataFrame.like_id.unique()

        filteredFrame = filteredFrame[filteredFrame.like_id.isin(
            uniqueLikeIds)]

        tempTestDataFrame = relationTestDataFrame[
            relationTestDataFrame.like_id.isin(filteredFrame.like_id.unique())]

        uselessTestUsers = relationTestDataFrame[~relationTestDataFrame.userid.
                                                 isin(tempTestDataFrame.userid.
                                                      unique())]
        print(len(uselessTestUsers.userid.unique()))
        uselessTestUsers = uselessTestUsers[~uselessTestUsers.duplicated(
            subset='userid')]
        tempTestDataFrame = tempTestDataFrame.append(uselessTestUsers)

        relationTestDataFrame = tempTestDataFrame

        # append test data to train data..
        filteredFrame = filteredFrame.append(relationTestDataFrame)
        print(filteredFrame.shape)

        #  Get a user-like Cross Matrix..
        crossedTable = pd.crosstab(filteredFrame['userid'],
                                   filteredFrame['like_id'])
        crossedFrameWithIndex = pd.DataFrame(crossedTable.to_records())

        # Apply PCA to the data..
        X = crossedFrameWithIndex.ix[:, 1:]
        pca = decomposition.TruncatedSVD(n_components=100)
        X = pca.fit_transform(X)

        pcaedDataFrame = pd.DataFrame(X,
                                      index=crossedFrameWithIndex.ix[:,
                                                                     'userid'])
        pcaedDataFrame.reset_index(level=0, inplace=True)

        # Separate test data from training data..
        trainDataFrame = pcaedDataFrame[
            ~pcaedDataFrame.userid.isin(relationTestDataFrame.userid.unique())]
        testDataFrame = pcaedDataFrame[pcaedDataFrame.userid.isin(
            relationTestDataFrame.userid.unique())]

        # Merge the test userid's and training userid's with their Profile..
        trainProfileDataFrame = pd.read_csv(
            "/data/training/profile/profile.csv")
        testProfileDataFrame = pd.read_csv(inputDir + "/profile/profile.csv")

        trainDataFrame = pd.merge(trainDataFrame,
                                  trainProfileDataFrame,
                                  on='userid')
        testDataFrame = pd.merge(testDataFrame,
                                 testProfileDataFrame,
                                 on='userid')

        gender = RelationPredictor.predictGender(self,
                                                 trainingData=trainDataFrame,
                                                 testingData=testDataFrame,
                                                 doKFold=False)
        age = RelationPredictor.predictAge(self,
                                           trainingData=trainDataFrame,
                                           testingData=testDataFrame,
                                           doKFold=False)
        ope, con, ext, agr, neu = RelationPredictor.predictPersonalityScores(
            self,
            trainingData=trainDataFrame,
            testingData=testDataFrame,
            doKFold=False)

        finDataFrame = pd.DataFrame({
            'userid': testDataFrame['userid'],
            'gender': gender,
            'age': age,
            'ope': ope,
            'con': con,
            'ext': ext,
            'agr': agr,
            'neu': neu
        })
        # print("Gender Acc: " + str(accuracy_score(testDataFrame.ix[:,'gender'], finDataFrame['gender'])))
        # testingAge = pd.cut(testDataFrame.ix[:,'age'], [0.0, 24.0, 34.0, 49.0, 1000.0], labels=[0, 1, 2, 3], retbins=False, include_lowest=True)
        # print("Age Acc: " + str(accuracy_score(testingAge, finDataFrame['age'])))
        # print("Ope RMSE: " + str(sqrt(mean_squared_error(testDataFrame.ix[:,'ope'], finDataFrame['ope']))))
        # print("Con RMSE: " + str(sqrt(mean_squared_error(testDataFrame.ix[:,'con'], finDataFrame['con']))))
        # print("Ext RMSE: " + str(sqrt(mean_squared_error(testDataFrame.ix[:,'ext'], finDataFrame['ext']))))
        # print("Agr RMSE: " + str(sqrt(mean_squared_error(testDataFrame.ix[:,'agr'], finDataFrame['agr']))))
        # print("Neu RMSE: " + str(sqrt(mean_squared_error(testDataFrame.ix[:,'neu'], finDataFrame['neu']))))
        return finDataFrame
Example No. 19
    def perform(self):
        lsa = skd.TruncatedSVD(n_components=self.n_components,
                               random_state=self.random_state)
        transform = lsa.fit_transform(self.data)
        return transform
Example No. 20
                      ngram_range=(1, 3),
                      use_idf=1,
                      smooth_idf=1,
                      sublinear_tf=1,
                      stop_words='english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) +
        list(xtest))  # Learn vocabulary and idf from the training and test sets.
# sparse matrix: (sentence index, vocabulary index) -> tf-idf score
xtrain_tfv = tfv.transform(xtrain)  # create sparse matrix with tf-idf weights
xvalid_tfv = tfv.transform(xtest)

# Apply SVD. 120-200 components are usually good enough for an SVM model; 200 are used here.
print("SVM + TFIDF")
svd = decomposition.TruncatedSVD(
    n_components=200)  # can we test with more features?! up to 200...
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

# Fitting a simple SVM
clf = SVC(C=1.0, probability=True)  # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)
predictions_classes = clf.predict(xvalid_svd_scl)
Example No. 21
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn versions
from sklearn import decomposition

matrix_ppmi = joblib.load('matrix_ppmi')
svd = decomposition.TruncatedSVD(300)
matrix_300 = svd.fit_transform(matrix_ppmi)
joblib.dump(matrix_300, 'matrix_300')
Example No. 22
def run_category():  # build the classification model
    t = time.time()
    # read the raw data
    data = pd.read_excel(os.path.join(root_dir, r'全部数据/附件2.xlsx'))

    data['留言'] = (2 * data['留言主题'] + data['留言详情']).apply(
        lambda i: process(i))  # create a new column and apply process() to each value

    data['text_split_list'] = data['留言'].apply(
        lambda i: jieba.lcut(i, cut_all=True))  # segment the text with jieba in full mode

    data['text_split'] = [' '.join(i) for i in data['text_split_list']]

    data.head()  # inspect the preprocessed data

    lbl_enc = preprocessing.LabelEncoder()  # convert the text labels into integers
    y = lbl_enc.fit_transform(data.一级标签.values)

    # split the DataFrame 9:1 into training and validation sets
    data_train, data_valid, ytrain, yvalid = train_test_split(data,
                                                              y,
                                                              stratify=y,
                                                              random_state=42,
                                                              test_size=0.1,
                                                              shuffle=True)

    xtrain = data_train.text_split.values  # store the training set's text_split values in xtrain
    xvalid = data_valid.text_split.values  # store the validation set's text_split values in xvalid

    # read the stop-word file and collect the stop words into a list
    stwlist = [
        line.strip() for line in open(os.path.join(root_dir, r'stopword.txt'),
                                      'r',
                                      encoding='gbk').readlines()
    ]

    # convert the text into a TF-IDF weighted term matrix
    tfv = TfidfVectorizer(min_df=3,
                          max_df=0.5,
                          max_features=None,
                          ngram_range=(1, 2),
                          use_idf=True,
                          smooth_idf=True,
                          stop_words=stwlist)

    # fit TF-IDF on the training and validation sets
    tfv.fit(list(xtrain) + list(xvalid))
    xtrain_tfv = tfv.transform(xtrain)
    xvalid_tfv = tfv.transform(xvalid)

    # reduce dimensionality with SVD; n_components is set to 150 (120-200 is usually a good range for an SVM)
    svd = decomposition.TruncatedSVD(n_components=150, random_state=42)
    svd.fit(xtrain_tfv)
    xtrain_svd = svd.transform(xtrain_tfv)
    xvalid_svd = svd.transform(xvalid_tfv)

    # scale the data obtained from SVD
    scl = preprocessing.StandardScaler()
    scl.fit(xtrain_svd)
    xtrain_svd_scl = scl.transform(xtrain_svd)
    xvalid_svd_scl = scl.transform(xvalid_svd)

    # # use grid search to find the best SVM hyperparameters
    # from sklearn.model_selection import GridSearchCV
    # # list the parameters to tune and their candidate values
    # param_grid = {"gamma": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    #               "C": [0.001, 0.01, 0.1, 1, 10, 100]}
    # print("Parameters:{}".format(param_grid))
    #
    # clf = SVC(random_state=11)  # instantiate an SVC
    #
    # grid_search = GridSearchCV(clf, param_grid, cv=10, scoring='f1_micro')  # instantiate a GridSearchCV
    # grid_search.fit(xtrain_svd_scl, ytrain)  # fit to find the best parameters and refit a new SVC estimator with them
    # print("Test set score:{:.2f}".format(grid_search.score(xvalid_svd_scl, yvalid)))
    # print("Best parameters:{}".format(grid_search.best_params_))
    # print("Best score on train set:{:.2f}".format(grid_search.best_score_))

    # train the SVM model
    clf = SVC(C=10, gamma=0.001, random_state=151)
    clf.fit(xtrain_svd_scl, ytrain)  # fit on the training data
    predictions = clf.predict(xvalid_svd_scl)  # predict on the validation data
    print("model f1_score: %0.3f " %
          f1_score(yvalid, predictions, average='micro'))
    model_path = os.path.join(root_dir, "结果数据/留言分类模型.m")
    joblib.dump(clf, model_path)

    # # load the saved model
    # clf = joblib.load("my_model.m")

    # save the classification results for the validation set
    result = data_valid.copy()
    result.drop(['留言', 'text_split_list', 'text_split'], axis=1, inplace=True)
    result['模型结果'] = lbl_enc.inverse_transform(predictions)

    result_path = os.path.join(root_dir, r'结果数据/分类结果.xlsx')
    result.to_excel(result_path, index=False)
    print(result.head(5))
    print("训练结束,用时:%f" % (time.time() - t))
    print("第一题分类模型构建完毕, 模型保存在%s, 分类结果保存在%s" % (model_path, result_path))
Example No. 23
    def factors_em(self, max_iter=50, tol=math.sqrt(0.000001)):
        """
        Estimates factors with an EM algorithm to handle missing values
        Inputs:
        max_iter: Maximum number of iterations
        tol: Tolerance for convergence between iterations of predicted series values
        Algorithm:
        1) initial_nas: Boolean mask of locations of NaNs
        2) working_data: Create standardized data matrix with NaNs replaced with means
        3) F: Preliminary factor estimates
        4) data_hat_last: Predicted standardized values of last SVD model. data_hat and data_hat_last will not exactly be mean 0 variance 1
        5) Iterate data_hat until convergence
        6) Fill in NaNs from original data
        Saves
        1) self.svdmodel: sklearn pipeline with standardization step and svd model
        2) self.series_filled: self.series with any NaNs filled in with predicted values from self.svdmodel
        """
        # Define our estimation pipelines
        pipe = skpipe.Pipeline([
            ('Standardize',
             self.factor_standardizer_method(self.standard_method)),
            ('Factors', skd.TruncatedSVD(self.Nfactor, algorithm='arpack'))
        ])
        inital_scalar = self.factor_standardizer_method(self.standard_method)

        # Make numpy arrays for calculations
        actual_data = self.series.to_numpy(copy=True)
        intial_nas = self.series.isna().to_numpy(copy=True)
        working_data = inital_scalar.fit_transform(
            self.series.fillna(value=self.series.mean(),
                               axis='index').to_numpy(copy=True))

        # Estimate initial model
        F = pipe.fit_transform(working_data)
        data_hat_last = pipe.inverse_transform(F)

        # Iterate until the model converges
        iter = 0
        distance = tol + 1
        while (iter < max_iter) and (distance > tol):
            F = pipe.fit_transform(working_data)
            data_hat = pipe.inverse_transform(F)
            distance = np.linalg.norm(data_hat - data_hat_last,
                                      2) / np.linalg.norm(data_hat_last, 2)
            data_hat_last = data_hat.copy()
            working_data[intial_nas] = data_hat[intial_nas]
            iter += 1

        # Print results
        if iter == max_iter:
            print(
                f"EM alogrithm failed to converge afet Maximum iterations of {max_iter}. Distance = {distance}, tolerance was {tol}"
            )
        else:
            print(f"EM algorithm converged after {iter} iterations")

        # Save Results
        actual_data[intial_nas] = inital_scalar.inverse_transform(
            working_data)[intial_nas]
        self.svdmodel = pipe
        self.series_filled = pd.DataFrame(actual_data,
                                          index=self.series.index,
                                          columns=self.series.columns)
        self.factors = pd.DataFrame(
            F,
            index=self.series_filled.index,
            columns=[f"F{i}" for i in range(1, F.shape[1] + 1)])
Exemplo n.º 24
0
    def __init__(self, **kwargs):
        self.truncated_svd = decomp.TruncatedSVD(**kwargs)
Exemplo n.º 25
0
        wordBucket[key] = plc
        plc += 1
    print('the place is: ' + str(plc))

    outVec = scipy.sparse.lil_matrix((len(globalDat), len(wordBucket.keys())))
    print(str(outVec.shape))
    for descN in range(len(text)):
        for importance, word in text[descN]:
            if wordBucket.get(word) is not None:
                outVec[descN, wordBucket[word]] = importance
    #wordVecer = txt.CountVectorizer()
    #outVec = wordVecer.fit_transform(text)
    #cut down on insane amount of text features
    #decomp = dec.PCA(n_components=40000)
    print('starting truncation')
    decomp = dec.TruncatedSVD(n_components=int(outVec.shape[1] / 3))
    outVec = decomp.fit_transform(outVec)
    outVec = scipy.sparse.csr_matrix(outVec, dtype=float)
    #trunc = dec.TruncatedSVD(n_components=outVec.shape[1]/10)
    #trunc.fit(outVec)
    #outVec = trunc.transform(outVec)
    print('description vector shape: ' + str(outVec.shape) + " " +
          str(type(outVec)))
    #collect the target

    #put all information in usable format
    shp = (len(formatted), len(formatted[0]) + 1 + outVec.shape[1])
    withTarget = scipy.sparse.lil_matrix(shp, dtype=float)

    print("WithTarget: " + str(withTarget.shape))
    withTarget = withTarget.transpose()
Exemplo n.º 26
0
def pca():
  print("PCA projection is selected")
  embedder = decomposition.TruncatedSVD(n_components=n_components)
  return embedder
Exemplo n.º 27
0
def run(fold):

    #load the full training data with folds
    df = pd.read_csv("train_folds.csv")

    #all columns are features except id, target and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]

    #fill all NaN values with NONE
    #note that I am converting all columns to "strings"
    #it doesn't matter because all are categories

    for col in features:
        df.loc[:, col] = df[col].fillna("NONE").astype(str)

    #get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)

    #get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    #initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()

    #fit ohe on training + validation features

    full_data = pd.concat([df_train[features], df_valid[features]], axis=0)

    ohe.fit(full_data[features])

    #transform training data
    x_train = ohe.transform(df_train[features])

    #transform validation data

    x_valid = ohe.transform(df_valid[features])

    #initialize Truncated SVD
    #we are reducing the data to 120 components

    svd = decomposition.TruncatedSVD(n_components=120)

    #fit svd on full sparse training data
    full_sparse = sparse.vstack((x_train, x_valid))
    svd.fit(full_sparse)

    #transform sparse training data
    x_train = svd.transform(x_train)

    #transform sparse validation data
    x_valid = svd.transform(x_valid)

    model = ensemble.RandomForestClassifier(n_jobs=-1)

    #fit model on training data (svd features)
    model.fit(x_train, df_train.target.values)

    #predict on validation data
    #we need the probability values as we are calculating AUC
    #we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]

    #get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    #print auc
    print(f"Fold = {fold}, AUC = {auc}")
Exemplo n.º 28
0
from sklearn.neighbors import LocalOutlierFactor
import networkx as nx

algorithms = [
    decomposition.TruncatedSVD, manifold.MDS, manifold.Isomap,
    manifold.LocallyLinearEmbedding, manifold.TSNE
]

fname = sys.argv[1]
algorithm = int(sys.argv[2])
n_comps = int(sys.argv[3])

x = np.loadtxt(fname)

if algorithm == 0:
    model = decomposition.TruncatedSVD(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 1:
    model = manifold.MDS(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 2:
    model = manifold.Isomap(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 3:
    model = manifold.TSNE(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 4:
    n_points, input_size = x.shape
    som_size = int(np.sqrt(n_points) / 2)
    model = MiniSom(som_size,
                    som_size,
Exemplo n.º 29
0
    if title is not None:
        plt.title(title)


# ========================================================================================
# Random projection
print("Computing random projection")
rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
X_projected = rp.fit_transform(X)
plot_embedding_2d(X_projected, "Random Projection")

# ========================================================================================
# PCA
print("Computing PCA projection")
t0 = time.time()
X_pca = decomposition.TruncatedSVD(n_components=3).fit_transform(X)
plot_embedding_2d(X_pca[:, 0:2], "PCA 2D")
plot_embedding_3d(X_pca, "PCA 3D (time %.2fs)" % (time.time() - t0))

# ========================================================================================
# LDA
print("Computing LDA projection")
X2 = X.copy()
X2.flat[::X.shape[1] + 1] += 0.01  # Make X invertible
t0 = time.time()
lda = LinearDiscriminantAnalysis(n_components=3)
X_lda = lda.fit_transform(X2, y)
plot_embedding_2d(X_lda[:, 0:2], "LDA 2D")
plot_embedding_3d(X_lda, "LDA 3D (time %.2fs)" % (time.time() - t0))

# ========================================================================================
Exemplo n.º 30
0
def plot_2d_per_time():
    fig_style = dict(cmap="viridis", marker='o')
    df, _ = models.load_unigrams()
    df_ratios = models.load_ratios()

    ug_normalized = (np.log1p(df).T / np.log1p(df).sum(axis=1)).T
    ug_smoothed = ug_normalized.rolling(
        data_config.unigram_rm_smoothing).mean()

    d_svd_unsmooth = pre.scale(
        decomp.TruncatedSVD(n_components=2).fit_transform(
            pre.StandardScaler().fit_transform(
                ug_normalized.loc[data_config.date_begin:].to_numpy())))

    d_svd_smooth = pre.scale(
        decomp.TruncatedSVD(n_components=2).fit_transform(
            pre.StandardScaler().fit_transform(
                ug_smoothed.loc[data_config.date_begin:].to_numpy())))

    ratios_smoothed = df_ratios.ewm(com=21).mean()
    d_ratios_ica = pre.scale(
        decomp.FastICA(n_components=2).fit_transform(
            pre.scale(
                ratios_smoothed.loc[data_config.date_begin:].to_numpy())))
    d_ratios_ica_restricted = pre.scale(
        decomp.FastICA(n_components=2).fit_transform(pre.StandardScaler(
        ).fit_transform(
            ratios_smoothed.loc[data_config.date_turning_point:].to_numpy())))

    d_ratios_ica_rotated = d_ratios_ica.dot(
        scipy.linalg.orthogonal_procrustes(d_ratios_ica, d_svd_smooth)[0])
    d_ratios_ica_restricted_rotated = d_ratios_ica_restricted.dot(
        scipy.linalg.orthogonal_procrustes(
            d_ratios_ica_restricted,
            d_svd_smooth[-d_ratios_ica_restricted.shape[0]:])[0])

    fig, ax = plt.subplots(ncols=3)
    idx = [d.toordinal() for d in df.loc[data_config.date_begin:].index.date]
    scatter_svd_unsmooth = ax[0].scatter(*d_svd_unsmooth.T,
                                         c=idx,
                                         s=15,
                                         **fig_style)
    ax[1].scatter(*d_svd_smooth.T, c=idx, s=15, **fig_style)
    scatter_ica_restricted = ax[2].scatter(
        *d_ratios_ica_restricted_rotated.T,
        c=idx[-d_ratios_ica_restricted.shape[0]:],
        s=15,
        vmin=min(idx),
        vmax=max(idx),
        **fig_style)

    for a in ax:
        a.set_frame_on(False)
        a.get_xaxis().set_visible(False)
        a.get_yaxis().set_visible(False)

    cb = fig.colorbar(scatter_svd_unsmooth,
                      orientation='vertical',
                      ticks=dates.YearLocator(),
                      format=dates.DateFormatter('%Y'))
    cb.outline.set_visible(False)
    fig.savefig("thesis/plots/unigram_decomp.png")