def check_classifiers_classes(name, Classifier):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non-negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]

    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names

        classes = np.unique(y_)
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_fast_parameters(classifier)
        # fit
        classifier.fit(X, y_)

        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))
Example #3
    def numpy_normalize(self):
        '''By using numpy's implementation of std, memory consumption can be reduced by half.'''
        # StandardScaler keeps its fitted statistics in mean_ and scale_;
        # assigning to .mean/.std has no effect, and fit_transform would
        # recompute them anyway. Set the fitted attributes, then transform.
        std = self.X.std(axis=0)
        mean = self.X.mean(axis=0)
        scaler = StandardScaler(copy=False)
        scaler.mean_ = mean
        scaler.scale_ = std
        self.X_normalized = scaler.transform(self.X)
        print("Data normalized with numpy")
Example #4
def calc_pca(data, n_comps=2, standardize=False):
    if standardize:
        data = StandardScaler().fit_transform(data)

    # Center the data (subtract the column means, not the other way around)
    data = data - data.mean(axis=0)
    cov_mat = np.cov(data, rowvar=False)
    evals, evecs = np.linalg.eig(cov_mat)

    # Sort eigenpairs by decreasing eigenvalue and keep the first n_comps eigenvectors
    idx = np.argsort(evals)[::-1]
    evecs = evecs[:, idx]
    evals = evals[idx]
    evecs = evecs[:, :n_comps]

    return np.dot(data, evecs), evals, evecs
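# A quick sanity-check sketch (not part of the original snippet): on random data the
# leading eigenvalues returned by calc_pca should match sklearn's PCA.explained_variance_,
# and the projections should agree up to a per-component sign flip.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
data = rng.normal(size=(200, 5))

scores, evals, evecs = calc_pca(data, n_comps=2)
pca = PCA(n_components=2).fit(data)

print(np.allclose(evals[:2], pca.explained_variance_))           # same eigenvalues (both use ddof=1)
print(np.allclose(np.abs(scores), np.abs(pca.transform(data))))  # same projections up to sign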
Example #5
def qq_plot(df):
    # not working for some reason; no time to find out
    price = df.price[(df.price <= 2000) & (df.price > 500)]
    price_log = np.log(price)
    price_mm = MinMaxScaler().fit_transform(
        price.values.reshape(-1, 1).astype(np.float64)).flatten()
    price_z = StandardScaler().fit_transform(
        price.values.reshape(-1, 1).astype(np.float64)).flatten()

    sm.qqplot(price_log, loc=price_log.mean(),
              scale=price_log.std()).savefig('qq_price_log.png')
    sm.qqplot(price_mm, loc=price_mm.mean(),
              scale=price_mm.std()).savefig('qq_price_mm.png')
    sm.qqplot(price_z, loc=price_z.mean(),
              scale=price_z.std()).savefig('qq_price_z.png')

    return
Example #6
def standardize(array, name):
    """Receives a pandas DataFrame or Series and returns a numpy array with zero mean and unit variance."""
    # Transform to a numpy column vector (DataFrame.as_matrix was removed; use to_numpy)
    nparray = array.to_numpy().reshape(array.shape[0], 1).astype('float32')
    print('------------')
    print(name)
    print('Different values before:', np.unique(nparray).shape[0])

    # Standardize the data
    nparray = StandardScaler().fit_transform(nparray)

    # Print some information
    print('Mean:', nparray.mean())
    print('Max:', nparray.max())
    print('Min:', nparray.min())
    print('Std:', nparray.std())
    print('Different values after:', np.unique(nparray).shape[0])

    return nparray
Example #7
def standardize_dataframe(df):
    """
    In order to perform a principal component analysis on the data, you first need to standardize it. The goal here
    is to have column means at 0 and standard deviation at 1.

    Input: DataFrame with columns Atr1..Atr54 and a Class column
    Returns: standardized feature matrix (numpy array)
    """

    features = []
    for i in range(1, 55):
        features.append(f"Atr{i}")

    x_ = df.loc[:, features].values
    y = df.loc[:, ['Class']].values
    x = StandardScaler().fit_transform(x_)

    # check that column means are 0, standard deviation of 1
    print(x.mean(axis=0))
    print(x.std(axis=0, ddof=1))

    return x
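# Hypothetical follow-up, assuming a DataFrame `df` with the Atr1..Atr54 columns the
# function expects: feed the standardized matrix straight into a PCA.
from sklearn.decomposition import PCA

x_std = standardize_dataframe(df)
pca = PCA(n_components=2)
principal_components = pca.fit_transform(x_std)
print(pca.explained_variance_ratio_)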
Example #8
# In[]

# Covariance matrix, correlations, linear-dependence heatmap, and condition number
cov_df = df_num_norm.cov()
var_global = sum(np.diag(cov_df))
det = np.linalg.det(cov_df)
corr_df = df_num_norm.corr()
sns.heatmap(corr_df, center=0, cmap='Blues_r')
cond_cov = np.linalg.cond(cov_df)

# In[]

# Outlier identification and removal of the worst 10%
#a=[]
a_rob = []
media_num_norm = np.array(df_num_norm.mean())
mediana_num_norm = np.array(df_num_norm.median())
inv_cov = np.linalg.inv(np.array(cov_df))
for i in range(len(df_num_norm.index)):
    #b = distance.mahalanobis(np.array(df_num_norm.iloc[i,:]),media_num_norm,inv_cov)
    b_rob = distance.mahalanobis(np.array(df_num_norm.iloc[i, :]),
                                 mediana_num_norm, inv_cov)
    #a.append(b)
    a_rob.append(b_rob)

#df_num_norm['mahal_normal'] = a
df_num_norm['mahal_rob'] = a_rob

#df_v2['mahal_normal'] = a
df_v2['mahal_rob'] = a_rob
Example #9
price_mm = MinMaxScaler().fit_transform(price.values.reshape(-1, 1).astype(np.float64)).flatten()
price_z = StandardScaler().fit_transform(price.values.reshape(-1, 1).astype(np.float64)).flatten()

# Q-Q plot of the initial feature

# In[ ]:


sm.qqplot(price, loc=price.mean(), scale=price.std())

# Q-Q plot after StandardScaler. Shape doesn’t change

# In[ ]:


sm.qqplot(price_z, loc=price_z.mean(), scale=price_z.std())

# Q-Q plot after MinMaxScaler. Shape doesn’t change

# In[ ]:


sm.qqplot(price_mm, loc=price_mm.mean(), scale=price_mm.std())

# Q-Q plot after taking the logarithm. Things are getting better!

# In[ ]:


sm.qqplot(price_log, loc=price_log.mean(), scale=price_log.std())
X_test = test.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# In[240]:

X_train.head()

# In[241]:

X_test.head()

# In[253]:

from sklearn.linear_model import LogisticRegressionCV
m = LogisticRegressionCV()
m.fit(X_train, y_train)
print([s.mean() for s in m.scores_[1]])
y_test = m.predict(X_test)

submission = X_test.copy()
submission['Survived'] = y_test
submission.head()

submission.to_csv('submission.csv', columns=['Survived'])

# In[254]:

from sklearn.svm import SVC
m = SVC()
m.fit(X_train, y_train)
print(m.score(X_train, y_train))
y_test = m.predict(X_test)
Example #11
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

iris = load_iris()
X = iris.data  # X shape: (150, 4)

# Standardize X
X_norm = StandardScaler().fit_transform(X)
X_norm.mean(axis=0)  # each feature now has zero mean

# Compute eigenvalues and eigenvectors
# np.cov builds the covariance matrix directly; each row of the input is a feature, each column a sample
ew, ev = np.linalg.eig(np.cov(X_norm.T))
print(ew)
print(ev)

# Sort eigenvalues and eigenvectors in descending order
ew_oreder = np.argsort(ew)[::-1]
print("ew_oreder", ew_oreder)
ew_sort = ew[ew_oreder]
print("ew_sort", ew_sort)
ev_sort = ev[:, ew_oreder]  # each column of ev is an eigenvector
print("ev_sort", ev_sort)
print(ev_sort.shape)  # (4,4)

# Reduce to 2 dimensions: the first two sorted eigenvectors form the projection basis
K = 2
V = ev_sort[:, :2]  # 4*2
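# The snippet stops after choosing the basis; the matching step (as in the fuller
# iris example further down, Example #16) is to project the standardized data onto V:
X_new = X_norm.dot(V)  # shape (150, 2)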
    def fit(self, X, y=None):
        """Estimate the CSP decomposition on epochs.

        Parameters
        ----------
        X : DataFrame, shape (n_epochs, n_columns)
            DataFrame with mode magnitudes for each channel and corresponding window,
            plus trial and label columns - the data on which to estimate the CSP.

        Returns
        -------
        self : instance of CSP
            Returns the modified instance.
        """
        y = X['label'].reset_index(drop=True)
        trials = X['trial'].reset_index(drop=True)
        X = X.values[:, :-4]
        X = StandardScaler().fit_transform(X)
        if not isinstance(X, np.ndarray):
            raise ValueError("X should be of type ndarray (got %s)." % type(X))
        self._check_Xy(X, y)
        n_channels = X.shape[1]

        self._classes = np.unique(y)
        n_classes = len(self._classes)
        if n_classes < 2:
            raise ValueError("n_classes must be >= 2.")

        covs = np.zeros((n_classes, n_channels, n_channels))
        sample_weights = list()
        for class_idx, this_class in enumerate(self._classes):
            if self.cov_est == "concat":  # concatenate epochs
                class_ = X[y == this_class].T
                cov = _regularized_covariance(
                    class_,
                    reg=self.reg,
                    method_params=self.cov_method_params,
                    rank=self.rank)
                weight = sum(y == this_class)
            elif self.cov_est == "epoch":
                class_ = X[y == this_class]
                cov = np.zeros((n_channels, n_channels))
                for this_X in class_:
                    cov += _regularized_covariance(
                        this_X,
                        reg=self.reg,
                        method_params=self.cov_method_params,
                        rank=self.rank)
                cov /= len(class_)
                weight = len(class_)

            covs[class_idx] = cov
            if self.norm_trace:
                # Append covariance matrix and weight. Prior to version 0.15,
                # trace normalization was applied, but it was breaking results for
                # some use cases by changing the apparent ranking of patterns.
                # Trace normalization of the covariance matrix was removed
                # without significant effect on patterns or performance.
                # If the user is interested in this feature, we suggest trace
                # normalization of the epochs prior to the CSP.
                covs[class_idx] /= np.trace(cov)

            sample_weights.append(weight)

        if n_classes == 2:
            eigen_values, eigen_vectors = linalg.eigh(covs[0], covs.sum(0))
            # sort eigenvectors
            ix = np.argsort(np.abs(eigen_values - 0.5))[::-1]
        else:
            # The multiclass case is adapted from
            # http://github.com/alexandrebarachant/pyRiemann
            eigen_vectors, D = _ajd_pham(covs)

            # Here we apply a Euclidean mean. See pyRiemann for other metrics
            mean_cov = np.average(covs, axis=0, weights=sample_weights)
            eigen_vectors = eigen_vectors.T

            # normalize
            for ii in range(eigen_vectors.shape[1]):
                tmp = np.dot(np.dot(eigen_vectors[:, ii].T, mean_cov),
                             eigen_vectors[:, ii])
                eigen_vectors[:, ii] /= np.sqrt(tmp)

            # class probability
            class_probas = [np.mean(y == _class) for _class in self._classes]

            # mutual information
            mutual_info = []
            for jj in range(eigen_vectors.shape[1]):
                aa, bb = 0, 0
                for (cov, prob) in zip(covs, class_probas):
                    tmp = np.dot(np.dot(eigen_vectors[:, jj].T, cov),
                                 eigen_vectors[:, jj])
                    aa += prob * np.log(np.sqrt(tmp))
                    bb += prob * (tmp**2 - 1)
                mi = -(aa + (3.0 / 16) * (bb**2))
                mutual_info.append(mi)
            ix = np.argsort(mutual_info)[::-1]

        # sort eigenvectors
        eigen_vectors = eigen_vectors[:, ix]

        self.filters_ = eigen_vectors.T
        self.patterns_ = linalg.pinv(eigen_vectors)  # pinv2 was removed from SciPy; pinv gives the same pseudo-inverse

        pick_filters = self.filters_[:self.n_components]
        X = np.dot(pick_filters, X.T)

        # compute features (mean band power)
        X = pd.DataFrame(X.T)
        X, y = mean_trial(X, trials, y)

        # To standardize features
        self.mean_ = X.mean(axis=0)
        self.std_ = X.std(axis=0)

        return self
Example #13
def classify(filename='4_14_type4_apollo3d.txt', useFrac=1.0, trainFraction=0.5, equalClassSize=True, 
             thres=0.5, useFeatures=[0], useAll=True, batch=False, useCache=True, 
             featureSelect=False, kickType=[11], draw=False, scale=False, C=1.0, B=1.0, returnProb=False): 
  
  features, labels = parse(filename=filename, useCache=useCache, ezKickSuccess=False, 
                           kickType=kickType, ignoreSelfFailure=False, useDirectFeatures=True, 
                           nfeatures=8)
  num2Use = int(useFrac*len(features))
  features = features[:num2Use]; labels = labels[:num2Use]
  if scale:
    features = StandardScaler().fit_transform(features)
  print "features mean:", features.mean(axis=0)
  print "features std:", features.std(axis=0)
  if not useAll:
#     labels = np.random.random(features.shape[0]) < 0.5
    newFeatures = features[:, useFeatures]
#     print newFeatures[:100,:]
#     newFeatures = np.random.random((features.shape[0], 9))
  else:
    newFeatures = features
  
  if equalClassSize:
    newFeatures, labels = balanceClasses(newFeatures, labels)
  
  print "we have " + str(newFeatures.shape[0]) + " samples."
  print "we have " + str(np.sum(labels == 1)) + " positive labels"
  print "ratio: " + str(float(np.sum(labels == -1))/np.sum(labels == 1))
  print "using approximately " + str(trainFraction*100) + "% as training examples"
  
  r = np.random.random(newFeatures.shape[0]) < trainFraction; r2 = np.invert(r)
  trainingSet = newFeatures[r, :]; trainLabels = labels[r]
  testingSet = newFeatures[r2, :]; testLabels = labels[r2]
      
  if not equalClassSize:
    testingSet, testLabels = balanceClasses(testingSet, testLabels)
    clf = LogisticRegression(C=C, class_weight='auto', intercept_scaling=B, penalty='l2')
#     clf = svm.SVC(C=C, kernel='rbf', class_weight='auto', probability=returnProb)
  else:
    clf = LogisticRegression(C=C, intercept_scaling=B, penalty='l2')
#     clf = svm.SVC(C=C, kernel='rbf', class_weight='auto', probability=returnProb)
#     clf = RandomForestClassifier()
#     clf = KNeighborsClassifier(n_neighbors=15)
#   print np.arange(20)[clf2.get_support()]
#     clf = AdaBoostClassifier()
#   clf = GradientBoostingClassifier(init=LogisticRegression)
#     clf = GaussianNB()
#   clf = DecisionTreeClassifier()
  
  if featureSelect:
    rfecv = RFE(estimator=clf, step=1,  n_features_to_select=8)
#     rfecv = RFECV(estimator=clf, step=1, cv=10)
    rfecv.fit(newFeatures, labels)
    print("Optimal number of features : %d" % rfecv.n_features_)
    print rfecv.ranking_
    print np.arange(20)[rfecv.support_]
    return
  
  clf.fit(trainingSet, trainLabels)
  
  def myPredict(clf, x, thres=0.5):
    probArray = clf.predict_proba(x)[:,1]
    predictLabels = 1*(probArray > thres)
    predictLabels = 2*predictLabels - 1
    return predictLabels, probArray
  
#   d = np.reshape(np.linspace(0, 10, num=1000), (-1, 1))
# #   print d.shape
#   results = clf.predict(d)
#   for i in xrange(1000):
#     if results[i] == 1:
#       print "dist:", i*0.01
#       break
  
  if returnProb:
    predictLabels, probArray = myPredict(clf, testingSet, thres=thres)
  else:
    predictLabels = clf.predict(testingSet)
#     print "accuracy rate from classifier: " + str(clf.score(testingSet, testLabels))
    
  suffix = "" if useAll else str(features)
  
  if draw and returnProb:
    area = drawPrecisionRecallCurve(filename[:-4] + suffix, testLabels, probArray)
    roc_auc = drawROCCurve(filename[:-4] + suffix, testLabels, probArray)
  
  false_neg = false_pos = true_neg = true_pos = 0
  for i in xrange(len(predictLabels)):
    if predictLabels[i] == testLabels[i] == -1:
      true_neg += 1
    elif predictLabels[i] == testLabels[i] == 1:
      true_pos += 1
    elif predictLabels[i] == -1 and testLabels[i] == 1:
      false_neg += 1
    else:
      false_pos += 1
  good = true_neg + true_pos
  print "accuracy rate: ", good/float(len(predictLabels)), good
  print "true negative rate: ", true_neg/float(len(predictLabels)), true_neg
  print "true positive rate: ", true_pos/float(len(predictLabels)), true_pos
  print "false negative rate: ", false_neg/float(len(predictLabels)), false_neg
  print "false positive rate: ", false_pos/float(len(predictLabels)), false_pos
  precision = true_pos/float(true_pos + false_pos)
  recall = true_pos/float(true_pos + false_neg)
  print "precision: ", precision
  print "recall: ", recall
  print "f1 score: ", 2*(precision*recall)/(precision + recall)
  return good/float(len(predictLabels))
Example #14
df = pd.read_csv("C:\\Users\\akroc\\Desktop\\Spotify PCA\\traingdata.csv")
print(df.head())
print(df.describe())
df.drop(['time_signature', 'mode', 'key', 'duration_ms'], axis=1, inplace=True)
print(df.head())
# Standardization is important in PCA since it is a variance maximizing exercise.
# It projects your original data onto directions which maximize the variance.
from sklearn.preprocessing import StandardScaler
x = df.values  #returns a numpy array
x_scaled = StandardScaler().fit_transform(x)

df_scaled = pd.DataFrame(x_scaled, columns=df.columns)
print(df_scaled.head())
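# A small aside illustrating the claim above (synthetic data, not the Spotify set):
# without standardization, the feature with the largest raw variance dominates PC1.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
demo = np.column_stack([rng.normal(0, 1, 500),      # unit-variance feature
                        rng.normal(0, 100, 500)])   # high-variance feature
pc1_raw = PCA(n_components=1).fit(demo).components_[0]
pc1_std = PCA(n_components=1).fit(StandardScaler().fit_transform(demo)).components_[0]
print(np.round(np.abs(pc1_raw), 3))   # ~[0., 1.]      -> dominated by the high-variance column
print(np.round(np.abs(pc1_std), 3))   # ~[0.707, 0.707] -> both features contribute equally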
# Calculate a PCA manually
# calculate the mean vector
mean_vector = x_scaled.mean(axis=0)
print(mean_vector)
# calculate the covariance matrix
cov_mat = np.cov((x_scaled).T)
print(cov_mat.shape)
print(cov_mat)
# calculate the eigenvectors and eigenvalues of our covariance matrix for the dataset
eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat)
# Print the eigen vectors and corresponding eigenvalues
# in order of descending eigenvalues
for i in range(len(eig_val_cov)):
    eigvec_cov = eig_vec_cov[:, i]
    print('Eigenvector {}: \n{}'.format(i + 1, eigvec_cov))
    print('Eigenvalue {} from covariance matrix: {}'.format(
        i + 1, eig_val_cov[i]))
    print(50 * '-')
BASE = '../'
PATH = BASE + 'data/'

filehandler = open(PATH + "Audio_X.pkl", "rb")
X_ = pkl.load(filehandler)
filehandler.close()

filehandler = open(PATH + "Audio_Y.pkl", "rb")
Y_ = pkl.load(filehandler)
filehandler.close()

print('Loaded Training Set')

X_ = StandardScaler().fit_transform(X_)

print(X_.mean(axis=0))
print(X_.std(axis=0))

Y = torch.from_numpy(Y_.flatten())
X = torch.from_numpy(X_).float()

audio_model = EncodingNN()

#Defining criterion
num_epochs = 1200
learning_rate = 1e-4

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(audio_model.parameters(), lr=learning_rate)

# Train the model
Example #16
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

np.random.seed(24)
# Load the data
iris = load_iris()
X = iris.data
X_norm = StandardScaler().fit_transform(X)
X_norm.mean(axis=0)

# Compute eigenvalues and eigenvectors
ew, ev = np.linalg.eig(np.cov(X_norm.T))
# Sort eigenvalues and eigenvectors in descending order
ew_oreder = np.argsort(ew)[::-1]
ew_sort = ew[ew_oreder]
ev_sort = ev[:, ew_oreder]  # each column of ev is an eigenvector
ev_sort.shape  # (4,4)

# Reduce to 2 dimensions: the first two sorted eigenvectors form the projection basis
K = 2
V = ev_sort[:, :2]  # 4*2

# Finally, project to obtain the reduced data
X_new = X_norm.dot(V)  # shape (150,2)

colors = ['red', 'black', 'orange']

plt.figure()
Example #17
            positions = pd.DataFrame()
            deviations = pd.DataFrame()
            slopes = pd.DataFrame()
            for i in range(len(pair_list)):
                instrument = pair_list[i]
                shape = shape_list[i]
                if shape == 1:
                    values = ratios.loc[start:end, instrument]
                else:
                    values = (ratios.loc[start: end, instrument] * -1)\
                           + (2 * ratios.loc[start: end,  instrument].values[-1])
                pos = get_channel_mean_pos_std(values.values, win)
                positions[instrument] = pos['pos'].values.ravel()
                deviations[instrument] = pos['std'].values.ravel()
                slopes[instrument] = pos['slope'].values.ravel()
            slopes_mean[win[0]] = slopes.mean(axis=1)
            position_mean[win[0]] = positions.mean(axis=1)

        plt_index = np.arange(start, end + 1)
        position_mean.index = plt_index
        slopes_mean.index = plt_index

        position_mean.plot(color=color_list)
        plt.plot(plt_index, np.ones(plt_index.shape[0]) * 2, color='grey')
        plt.plot(plt_index, np.ones(plt_index.shape[0]) * -2, color='grey')
        plt.plot(plt_index, np.ones(plt_index.shape[0]) * 0, color='grey')
        plt.tight_layout()
        plt.title('Mean of Currency Set Channel Positions on Multiple Windows')
        slopes_mean.plot(color=color_list)
        plt.plot(plt_index, np.ones(plt_index.shape[0]) * 0, color='grey')
        plt.tight_layout()
Example #18
    header=0,
    names=None,
    index_col=0)

# Select the fields to cluster on
loan = np.array(loan_data[[
    'INA5', 'INA15', 'INP5', 'INP15', 'OUTA5', 'OUTA15', 'OUTP5', 'OUTP15'
]])

before = KMeans(n_clusters=4).fit(loan)  # before dimensionality reduction
print('KMeans clustering result before dimensionality reduction')
print(before.labels_)

# Standardize in preparation for dimensionality reduction
X_norm = StandardScaler().fit_transform(loan)
X_norm.mean(axis=0)  # each feature now has zero mean

# Dimensionality-reduction method 1: ------------------------------
# Compute eigenvalues and eigenvectors
ew, ev = np.linalg.eig(np.cov(X_norm.T))  # np.cov builds the covariance matrix directly; each row of the input is a feature, each column a sample

# Sort eigenvalues and eigenvectors in descending order
ew_oreder = np.argsort(ew)[::-1]
ew_sort = ew[ew_oreder]
ev_sort = ev[:, ew_oreder]  # each column of ev is an eigenvector
ev_sort.shape  # (4,4)

# Reduce to 2 dimensions: the first two sorted eigenvectors form the projection basis
K = 2
V = ev_sort[:, :2]  # 4*2
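# The snippet cuts off before V is used; a natural completion (mirroring the other
# manual-PCA examples on this page) projects the standardized data onto V and
# re-runs KMeans on the reduced features for comparison with the result above:
loan_reduced = X_norm.dot(V)                    # shape (n_samples, 2)
after = KMeans(n_clusters=4).fit(loan_reduced)  # clustering after dimensionality reduction
print('KMeans clustering result after dimensionality reduction')
print(after.labels_)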
Example #19
df = pd.read_csv(URL)
df
df.columns
df.index = df.loc[:,'Customer Id']
df.drop('Customer Id',axis=1,inplace=True)
df.info()
df.drop('Address',axis=1,inplace=True)
df.info()

from sklearn.preprocessing import StandardScaler
X = df.values
X = np.nan_to_num(X)
X = StandardScaler().fit_transform(X)
X.std(axis=0)
X.mean(axis=0)

k_means = KMeans(init='k-means++',n_clusters=3,n_init=12)
k_means.fit(X)
labels = k_means.labels_
df['Cluster'] = labels

df.groupby('Cluster').mean().T

plt.scatter(X[:,0],X[:,3], c = labels.astype(float), alpha = .5)

# Hierarchical clustering

%reset -f

import numpy as np
Example #20
# take the price feature from the Renthop dataset and manually filter out the most extreme values for clarity
price = df.price[(df.price <= 20000) & (df.price > 500)]
price_log = np.log(price)

price_mm = MinMaxScaler().fit_transform(
    price.values.reshape(-1, 1).astype(np.float64)).flatten()
# a bit of gymnastics so that sklearn does not spam warnings

price_z = StandardScaler().fit_transform(
    price.values.reshape(-1, 1).astype(np.float64)).flatten()
sm.qqplot(price_log, loc=price_log.mean(),
          scale=price_log.std()).savefig('qq_price_log.png')
sm.qqplot(price_mm, loc=price_mm.mean(),
          scale=price_mm.std()).savefig('qq_price_mm.png')
sm.qqplot(price_z, loc=price_z.mean(),
          scale=price_z.std()).savefig('qq_price_z.png')

# In[46]:

from demo import get_data

x_data, y_data = get_data()
x_data.head(5)

# In[47]:

x_data = x_data.values

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
import numpy as np


pca = PCA(n_components=2)  # the code below uses PCA attributes (mean_, components_), so use PCA rather than FeatureAgglomeration
pca.fit(X)

U, S, VT = np.linalg.svd(X - X.mean(0))

X_train_pca = pca.transform(X)

X_train_pca2 = (X - pca.mean_).dot(pca.components_.T)

X_projected = pca.inverse_transform(X_train_pca)
X_projected2 = X_train_pca.dot(pca.components_) + pca.mean_

loss = ((X - X_projected) ** 2).mean()

print(loss)
Example #22
from random import random

from sklearn import cluster
from sklearn.preprocessing import StandardScaler
import numpy as np

X = np.array([[1.1], [0.9], [2.1]])
X = np.array([[random() * 6] for _ in range(1000)])

# remove mean, set variance=1
XS = StandardScaler().fit_transform(
    X)  # computes mean & std and standardizes the data to mean 0, std 1
print(XS.mean())  # ~0
print(XS.std())  # ~1

# print(XS)
# x = X[:, 0]
# print(x)  # numpy flat array, ~ [1,1,2]
Example #23
ratings  ##### 99 and 99.0 have been replaced with 0 in 10000 rows and 150 columns
# Visualize the ratings for joke # 148
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 12))
plt.hist(ratings[148], bins=5)
plt.xlabel('Rating')
plt.ylabel('Number of ratings')
plt.suptitle('Joke- Ratings/Num of ratings')
#Lets normalize all these ratings using StandardScaler  and save them in ratings_diff variable
from sklearn.preprocessing import StandardScaler
ratings_diff = StandardScaler().fit_transform(ratings)
ratings_diff
# Using the popularity based recommendation system find the jokes that will be highly recommended
#Find the mean for which column in ratings_diff i.e for each joke
#Here each row represents a joke and the columns are different entities who have rated this joke
mean_ratings = ratings_diff.mean(axis=0)
mean_ratings
#Consider all the mean ratings and find the jokes with the highest mean value and display the top 10 joke IDs
#First create a dataframe
mean_ratings = pd.DataFrame(mean_ratings)
mean_ratings.iloc[:, 0]
mean_ratings.iloc[:, 0].argsort()[:-11:-1]  # indices of the 10 highest-rated jokes
mean_ratings.plot()
x = ratings.iloc[1:4, :-100]
##jokeCorr = joke.corrwith(joke[50])
from sklearn.metrics.pairwise import cosine_similarity
df1 = ratings.iloc[:100]
df2 = ratings.iloc[100:200]
x = df1.iloc[1]
cs1 = cosine_similarity(x.values.reshape(1, -1), df1)
cs2 = cosine_similarity(x.values.reshape(1, -1), df2)
Example #24
    # Plot 1 - every currency, all indicators
    # -------------------------------------------------------------------------

    # Call Subplots
    fig, ax = plt.subplots(5, 1, figsize=(10, 10), sharex=True)

    # Plot currencies
    df_scaled.plot(ax=ax[0])
    # Plot Slopes waves
    slopes_waves.plot(ax=ax[1])
    # Plot mean position Waves
    mean_position_waves.plot(ax=ax[2])
    # Plot sum (slopes  * mean)
    location_measure.plot(figsize=(10, 3), ax=ax[3])
    location_measure.mean(axis=1).plot(ax=ax[3], color='black')
    # Plot Channel Mean Position (c3)
    channel_mean_scaled.plot(ax=ax[4])

    # get std lines on  channel position
    x = np.arange(end - interval, end + 1)
    ax[1].plot(x, np.zeros(df.shape[0]), color='black')
    ax[2].plot(x, np.ones(df.shape[0]) * indicator_std, color='black')
    ax[2].plot(x, np.ones(df.shape[0]) * -indicator_std, color='black')
    # Legends
    ax[0].legend()
    ax[1].legend()
    ax[2].legend()
    ax[3].legend()
    ax[4].legend()
    # Name Rows