예제 #1
0
    def pca_predict(self, sample_name):
        """Project a sample file onto the pickled PCA model and save the result.

        Reads ``self.local_path + sample_name`` as a header-less CSV, encodes the
        ``sex``/``state``/``education`` columns with the mappings pickled in
        ``user.dict``, standardises the features, applies the PCA model pickled
        in ``pca.model`` and writes the projection to
        ``self.feature_path + sample_name``.

        Args:
            sample_name: File name of the sample CSV, relative to
                ``self.local_path``; the same name is used for the output file
                under ``self.feature_path``.

        Raises:
            KeyError: If a categorical value is missing from the stored mapping.

        Side effects:
            Sets ``self.df_user_server``, ``self.pca``, ``self.pca_result`` and
            ``self.pca_summary``, and prints the projected features.
        """
        feature_columns = ["sex", "age", "state", "education",
                           "transitDuration", "fulfillDuration",
                           "green", "blue", "black", "yellow", "red", "white", "amount"]
        # Column order of the raw file differs from the feature order above.
        csv_columns = ["orderid", "age", "sex", "state", "education",
                       "transitDuration", "fulfillDuration",
                       "black", "blue", "green", "yellow", "red", "white", "amount"]

        samples = pd.read_csv(self.local_path + sample_name, names=csv_columns)
        # Selecting the feature columns both discards "orderid" and fixes the
        # column order; copy so the encoded columns can be assigned safely.
        self.df_user_server = samples[feature_columns].copy()

        # SECURITY: pickle can execute arbitrary code while loading; only load
        # files produced by this pipeline. Pickle files are opened in binary mode.
        with open(self.output_path + "user.dict", 'rb') as f:
            encoders = pickle.load(f)
        for column in ("sex", "state", "education"):
            mapping = encoders[column]
            self.df_user_server[column] = self.df_user_server[column].apply(
                lambda value, mapping=mapping: mapping[value])

        with open(self.output_path + "pca.model", 'rb') as f:
            self.pca = pickle.load(f)

        # NOTE(review): the scaler is re-fitted on this sample file instead of
        # reusing the statistics of the data the PCA was fitted on - confirm
        # that this is intended.
        scaler = StandardScaler()
        self.pca_result = scaler.fit_transform(self.df_user_server.values)
        self.pca_result = self.pca.transform(self.pca_result)
        print(self.pca_result)
        self.pca_summary = vs.pca_results(self.df_user_server, self.pca, self.plot_path)

        np.savetxt(self.feature_path + sample_name, self.pca_result,
                   delimiter=",", header="pca1,pca2", comments='')
예제 #2
0
    def pca(self):
        """Fit a 2-component PCA on the standardised user features and persist it.

        Reads the feature columns of ``self.df_user_server``, standardises them,
        fits the PCA and writes three files: the pickled model
        (``pca.model``), the pickled summary (``pca_summary.csv``) under
        ``self.output_path``, and the projected features (``pca.csv``) under
        ``self.feature_path``.

        Side effects:
            Sets ``self.pca_result`` and ``self.pca_summary``. Also assigns the
            fitted model to ``self.pca``, which shadows this method on the
            instance after the first call.
        """
        index = ["sex", "age", "state", "education",
                 "transitDuration", "fulfillDuration",
                 "green", "blue", "black", "yellow", "red", "white", "amount"]
        features = self.df_user_server[index]

        scaler = StandardScaler()
        self.pca_result = scaler.fit_transform(features.values)

        self.pca = PCA(n_components=2)
        self.pca_result = self.pca.fit_transform(self.pca_result)

        self.pca_summary = vs.pca_results(features, self.pca, self.plot_path)

        # Pickle files are written in binary mode; the context managers close
        # the handles even if pickling fails.
        with open(self.output_path + "pca.model", 'wb') as f:
            pickle.dump(self.pca, f)
        # NOTE(review): despite the .csv extension this file holds a pickle of
        # the summary object, not CSV text - confirm what its readers expect.
        with open(self.output_path + "pca_summary.csv", 'wb') as f:
            pickle.dump(self.pca_summary, f)

        np.savetxt(self.feature_path + "pca.csv", self.pca_result,
                   delimiter=",", header="pca1,pca2", comments='')
예제 #3
0
from sklearn.decomposition import PCA

# Drop the rows flagged as outliers (if any were specified), then renumber the
# remaining rows from zero.
good_data = log_data.drop(log_data.index[outliers])
good_data = good_data.reset_index(drop=True)

# Six components: same number of dimensions as features. fit() returns the
# estimator itself, so construction and fitting can be chained.
pca = PCA(n_components=6, copy=True).fit(good_data)

# Project the sample rows with the six-component fit.
pca_samples = pca.transform(log_samples)

# Generate the PCA results plot; the returned frame's index labels the dimensions.
pca_results = vs.pca_results(good_data, pca)

# Show the projected samples, rounded to four decimals, one column per dimension.
display(
    pd.DataFrame(
        np.round(pca_samples, 4),
        columns=pca_results.index.values,
    )
)

# Refit with only two dimensions.
pca = PCA(n_components=2, copy=True).fit(good_data)

# Project both the cleaned data and the samples with the two-component fit.
reduced_data = pca.transform(good_data)
pca_samples = pca.transform(log_samples)


# Import sklearn.decomposition.PCA and assign the results of fitting PCA in six
# dimensions with good_data to pca.
# Apply a PCA transformation of the sample log-data log_samples using
# pca.transform, and assign the results to pca_samples.
from sklearn.decomposition import PCA

# Apply PCA by fitting the good data with the same number of dimensions as features
pca = PCA(n_components=6)
pca = pca.fit(good_data)

# Transform the sample log-data using the PCA fit above.
# (The samples are log_samples; transforming good_data here contradicted the
# instructions above.)
pca_samples = pca.transform(log_samples)

# Generate PCA results plot
pca_results = vs.pca_results(good_data, pca)

# Apply PCA by fitting the good data with only two dimensions
pca = PCA(n_components=2)
pca = pca.fit(good_data)

# Transform the good data using the PCA fit above
reduced_data = pca.transform(good_data)

# Transform the sample log-data using the PCA fit above
pca_samples = pca.transform(log_samples)

# Create a DataFrame for the reduced data
def _fit_transform_column(scaler, column):
    """Fit *scaler* on a single column and return the scaled values as a 1-D array."""
    # scikit-learn estimators expect 2-D input of shape (n_samples, n_features).
    return np.ravel(scaler.fit_transform(column.values.reshape(-1, 1)))


def _plot_transform_comparison(data, col_label, lam):
    """Plot a column raw, Box-Cox transformed and log transformed, side by side.

    Each panel overlays a fitted normal distribution and shows the skewness in
    its legend; the three skewness values are printed as well.
    """
    # Shift so the minimum is zero, which keeps np.log(data + lam) defined.
    if np.min(data) < 0:
        data = data - np.min(data)

    boxcox_data = boxcox1p(data, lam)
    logged_data = np.log(data + lam)
    panels = [
        (data, ' Distribution'),
        (boxcox_data, ' Box-Cox Transformed'),
        (logged_data, ' Log Transformed'),
    ]

    fig, axes = plt.subplots(ncols=3, figsize=(15, 6))
    for ax, (values, title_suffix) in zip(axes, panels):
        sns.distplot(values, fit=norm, ax=ax)
        mu, sigma = norm.fit(values)
        ax.legend([
            r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(
                mu, sigma),
            'Skewness: {:.2f}'.format(skew(values)),
        ],
                  loc='best')
        ax.set_title(col_label + title_suffix)
    axes[0].set_ylabel('Frequency')
    print(skew(data), skew(boxcox_data), skew(logged_data))
    plt.show()


def extract_data(ticker, pca=None):
    """Load a ticker's Yahoo price history and derive features, targets and PCA.

    Reads ``Data/<ticker>-yahoo.csv`` (columns used: Date, Open, High, Low,
    Close, Adj Close, Volume), engineers intraday and moving-average features,
    adds look-ahead target columns, drops incomplete rows, transforms skewed
    columns and finally projects the features with PCA.

    Args:
        ticker: Symbol used to build the CSV path.
        pca: Optional PCA object. When ``None`` a new 7-component PCA is fitted
            on the feature columns; otherwise the given object is only used to
            transform them (it must already be fitted).

    Returns:
        Tuple ``(pca, PCA_data_yahoo, data_yahoo)``: the PCA object that was
        used, a frame with the ``Dimension N`` columns plus the six target
        columns, and the engineered/transformed frame indexed by date.
    """
    #all these don't have anything to do with whether a PCA exist or not
    data_yahoo = pd.read_csv('Data/' + ticker + '-yahoo.csv', index_col='Date')
    data_yahoo.index = pd.to_datetime(data_yahoo.index)

    #here I'm trying to paint a shape of what's happening during the trading hours, these are independent features
    #NOTE: statement order matters - Range/High/Low must be derived before
    #'Open' itself is overwritten with its relative value.
    #range describes the min/max distance per open price
    data_yahoo['Range'] = (data_yahoo['High'] -
                           data_yahoo['Low']) / data_yahoo['Open']
    #high is a percentage of open
    data_yahoo['High'] = data_yahoo['High'] / data_yahoo['Open'] - 1
    #low is a percentage of open
    data_yahoo['Low'] = data_yahoo['Low'] / data_yahoo['Open'] - 1
    #open is a percentage of previous day's close
    data_yahoo['Open'] = data_yahoo['Open'] / data_yahoo['Close'].shift(1) - 1
    #previous 5 days moving average (adj close)
    data_yahoo['MA5 Adj Close'] = data_yahoo['Adj Close'].rolling(
        window=5).mean().shift(1)
    #previous 5 days moving average (volume)
    data_yahoo['MA5 Volume'] = data_yahoo['Volume'].rolling(
        window=5).mean().shift(1)
    #% change vs. previous 5 days (adj close)
    data_yahoo['MA5 Adj Close pct_change'] = data_yahoo[
        'Adj Close'] / data_yahoo['MA5 Adj Close'] - 1
    #% change vs. previous 5 days (volume)
    data_yahoo['MA5 Volume pct_change'] = data_yahoo['Volume'] / data_yahoo[
        'MA5 Volume'] - 1

    #this is what we are trying to predict (targets)
    #1. 1 day future price
    data_yahoo['Adj Close 1day'] = data_yahoo['Adj Close'].shift(-1)
    #2. 5 days future price
    data_yahoo['Adj Close 5day'] = data_yahoo['Adj Close'].shift(-5)
    #3. 1 day future price percentage change
    data_yahoo['Adj Close 1day pct_change'] = data_yahoo[
        'Adj Close 1day'] / data_yahoo['Adj Close'] - 1
    #4. 5 day future price percentage change
    data_yahoo['Adj Close 5day pct_change'] = data_yahoo[
        'Adj Close 5day'] / data_yahoo['Adj Close'] - 1
    #5. 1 day future price direction
    data_yahoo['Adj Close 1day pct_change cls'] = data_yahoo[
        'Adj Close 1day pct_change'].apply(lambda x: 1 if x >= 0 else 0)
    #6. 5 day future price direction
    data_yahoo['Adj Close 5day pct_change cls'] = data_yahoo[
        'Adj Close 5day pct_change'].apply(lambda x: 1 if x >= 0 else 0)

    #the shifts and rolling windows leave NaN at both ends of the history
    data_yahoo.dropna(axis=0, how='any', inplace=True)

    #offset used by the Box-Cox and log transforms below
    lam = 0.0001

    #---- exploratory plots: flip a toggle to True to reproduce them ----
    #The number triples below are the recorded experiment notes, apparently
    #the printed skewness (raw, Box-Cox, log) with one row per column.

    #let's look at the target variable distribution
    if False:  #scaling isn't all that great for these two target variables
        #no scaler:
        #  1.2165656107790856 -0.06554419693948103 -0.2485500333952623
        #  1.2147780477183334 -0.06797363864363892 -0.25105816533149256
        #MinMax:
        #  1.2165656107790865 0.9905547643484544 -0.6092542377635981
        #  1.2147780477183334 0.9885749885051007 -0.6115631693413965
        #Standard:
        #  1.216565610779086 0.7273346450678947 -0.6463496872857882
        #  1.214778047718333 0.7258861448434313 -0.6485618913384967
        #
        #Adj Close 1day - no scaler boxcox
        #Adj Close 5day - no scaler boxcox
        for col_label in ['Adj Close 1day', 'Adj Close 5day']:
            _plot_transform_comparison(data_yahoo[col_label], col_label, lam)

    if False:  #scalers doesn't really work here either
        #no scaler:
        #  -1.6510040307386993 -3.041993709001984 -55.25486882951101
        #  -0.9408177644672319 -1.8326191132390537 -29.740251304355382
        #MinMax:
        #  -1.6510040307386906 -3.7210597124936196 -56.219015977319174
        #  -0.9408177644672386 -1.928686775469499 -30.170597099885942
        #Standard:
        #  -1.6510040307386935 -23.430168647942985 -61.022779357622056
        #  -0.9408177644672379 -7.476432811167501 -39.192139404540846
        #
        #Adj Close 1day pct_change - no scaler no transform
        #Adj Close 5day pct_change - no scaler no transform
        for col_label in [
                'Adj Close 1day pct_change', 'Adj Close 5day pct_change'
        ]:
            _plot_transform_comparison(data_yahoo[col_label], col_label, lam)

    if False:  #transformations doesn't work
        for col_label in [
                'Adj Close 1day pct_change cls',
                'Adj Close 5day pct_change cls'
        ]:
            _plot_transform_comparison(data_yahoo[col_label], col_label, lam)

    #let's look at the distribution between each independent variables
    if False:
        #no scaler:
        #  -5.81225631547921 -9.146269872594456 -62.02985068281074
        #  2.712481343056322 2.5700673022819003 -0.9250758631542217
        #  -2.5660281111479226 -2.748341308395658 -33.43627464332718
        #  2.0711648325652057 1.9393218387203244 0.11413402414840106
        #  1.2168876669938415 -0.06494740962703247 -0.24793607926031352
        #  2.964204574183921 -0.053455194367689286 -0.05363840820122992
        #  1.2162667302493857 -0.06319870260334028 -0.24591172238801545
        #  1.612323763816607 -0.13175043131958308 -0.13190749719487718
        #  -1.5597704161777437 -2.5595298869477796 -37.90656860268758
        #  6.2660356551310485 1.3701314015201278 -1.5385880097432818
        #MinMax:
        #  -5.8122563154792015 -11.082155965519698 -62.60899490983156
        #  2.7124813430563224 2.1099552360244025 -1.5007847275278556
        #  -2.566028111147925 -3.2923115321333882 -39.97787685974701
        #  2.071164832565206 1.63981390276237 -0.5591602425043781
        #  1.216887666993841 0.9909781433323644 -0.6086980942779617
        #  2.96420457418392 2.1627318198895265 -0.44700320966425255
        #  1.2162667302493853 0.9896212018472764 -0.6191447177436266
        #  1.6123237638166064 1.2209628689973062 -0.8754504958756806
        #  -1.5597704161777388 -2.9341885148669853 -39.23072118617314
        #  6.26603565513105 4.0674910496926024 -0.6555394360531169
        #Standard:
        #  -5.812256315479212 -43.76335777947685 -65.00115946436995
        #  2.7124813430563215 0.9592105401947402 -2.3878118031831503
        #  -2.566028111147922 -7.668800386424772 -47.99647262001495
        #  2.0711648325652066 0.6949862598819779 -0.7304334255376943
        #  1.2168876669938415 0.7276509004530627 -0.6458190217616919
        #  2.9642045741839196 0.7457660928003991 -0.6036534963638037
        #  1.2162667302493853 0.7279810838194929 -0.6604003387555384
        #  1.6123237638166061 0.47908958845988775 -0.9756621700306726
        #  -1.5597704161777395 -11.379256076858951 -47.849396206857875
        #  6.2660356551310485 0.9336930967122643 -1.872136860642601
        #
        #Open - no scaler no transform
        #High - StandardScaler boxcox transform
        #Low - no scaler no transform
        #Range - no scaler log transform
        #Adj Close - no scaler boxcox transform
        #Volume - no scaler boxcox transform
        #MA5 Adj Close - no scaler boxcox transform
        #MA5 Volume - no scaler boxcox transform
        #MA5 Adj Close pct_change - no scaler no transform
        #MA5 Volume pct_change - MinMaxScaler log transform
        for col_label in [
                'Open', 'High', 'Low', 'Range', 'Adj Close', 'Volume',
                'MA5 Adj Close', 'MA5 Volume', 'MA5 Adj Close pct_change',
                'MA5 Volume pct_change'
        ]:
            scaled = _fit_transform_column(StandardScaler(),
                                           data_yahoo[col_label])
            _plot_transform_comparison(scaled, col_label, lam)

    #so what do we need to transform?
    #these transofrmations don't need PCA either
    #no scaler no transform (left untouched):
    #    'Adj Close 1day pct_change', 'Adj Close 5day pct_change',
    #    'Adj Close 1day pct_change cls', 'Adj Close 5day pct_change cls',
    #    'Open', 'Low', 'MA5 Adj Close pct_change'
    #no scaler, boxcox transform
    boxcox_columns = [
        'Adj Close 1day', 'Adj Close 5day', 'Adj Close', 'MA5 Adj Close',
        'MA5 Volume', 'Volume'
    ]
    for col_name in boxcox_columns:
        data_yahoo[col_name] = boxcox1p(data_yahoo[col_name], lam)

    #no scaler, log transform
    data_yahoo['Range'] = np.log(data_yahoo['Range'] + lam)
    #StandardScaler, boxcox transform
    data_yahoo['High'] = boxcox1p(
        _fit_transform_column(StandardScaler(), data_yahoo['High']), lam)
    #MinMaxScaler, log transform
    data_yahoo['MA5 Volume pct_change'] = np.log(
        _fit_transform_column(MinMaxScaler(),
                              data_yahoo['MA5 Volume pct_change']) + lam)

    #let's look at heatmaps
    if False:  #correlation X vs. ylog
        print(data_yahoo.head(20))
        corrmat = data_yahoo.corr()
        plt.subplots(figsize=(12, 9))
        g = sns.heatmap(corrmat,
                        vmax=0.9,
                        square=True,
                        annot=True,
                        annot_kws={'size': 8})
        g.set_yticklabels(g.get_yticklabels(), rotation=0, fontsize=8)
        g.set_xticklabels(g.get_xticklabels(), rotation=90, fontsize=8)
        plt.title('Correlation Matrix/Heatmap Numerical Features vs. Targets')
        plt.tight_layout()
        plt.show()

    #let's also try PCA
    train = data_yahoo[[
        'Open', 'High', 'Low', 'Range', 'Adj Close', 'Volume', 'MA5 Adj Close',
        'MA5 Volume', 'MA5 Adj Close pct_change', 'MA5 Volume pct_change'
    ]]

    if pca is None:
        pca = PCA(n_components=7)
        trainPCA = pd.DataFrame(pca.fit_transform(train))
    else:
        trainPCA = pd.DataFrame(pca.transform(train))

    PCA_data_yahoo = trainPCA.copy()
    #name the columns after however many components the PCA actually produced,
    #so a caller-supplied PCA with a different size works too
    PCA_data_yahoo.columns = [
        'Dimension ' + str(i) for i in range(1, PCA_data_yahoo.shape[1] + 1)
    ]
    #the PCA frame is indexed 0..n-1, so attach the targets by position
    targets_by_position = data_yahoo.reset_index()
    for target in [
            'Adj Close 1day', 'Adj Close 5day', 'Adj Close 1day pct_change',
            'Adj Close 5day pct_change', 'Adj Close 1day pct_change cls',
            'Adj Close 5day pct_change cls'
    ]:
        PCA_data_yahoo[target] = targets_by_position[target]

    if False:  #show PCA results, cumulative power, and heatmap
        pca_results = vs.pca_results(train, pca)
        plt.show()
        ys = pca.explained_variance_ratio_
        xs = np.arange(1, len(ys) + 1)
        cumulative = np.cumsum(ys)
        plt.plot(xs, cumulative, '-o')
        for x, y in zip(xs, cumulative):
            plt.annotate('{:.2f}%'.format(y * 100),
                         xy=(x, y),
                         xytext=(30, -20),
                         textcoords='offset points',
                         ha='right',
                         va='bottom',
                         bbox=dict(boxstyle='round,pad=0.5',
                                   fc='yellow',
                                   alpha=0.5),
                         arrowprops=dict(arrowstyle='->',
                                         connectionstyle='arc3,rad=0'))

        plt.ylabel('Cumulative Explained Variance')
        plt.xlabel('Dimensions')
        plt.title('PCA - Total Explained Variance by # fo Dimensions')
        plt.tight_layout()
        plt.show()

        #NOTE(review): this used an undefined name `temp`; the PCA frame with
        #targets looks like the intended input - confirm.
        g = sns.heatmap(PCA_data_yahoo.corr(),
                        annot=True,
                        annot_kws={'size': 8})
        g.set_yticklabels(g.get_yticklabels(), rotation=0, fontsize=8)
        g.set_xticklabels(g.get_xticklabels(), rotation=90, fontsize=8)
        plt.title('PCA Correlation Matrix/Heatmap')
        plt.tight_layout()
        plt.show()

    #export pca, pca dataset, and original dataset
    return pca, PCA_data_yahoo, data_yahoo
예제 #6
0
        tpvafs[recurrentvafs3] = l3
    else:
        tpvafs[recurrentvafs3] = [0, 0, vaf3]

# One row per key of tpvafs, with its three values in columns '1', '2', '3'.
# DataFrame.from_items was removed in pandas 1.0; from_dict(orient='index')
# builds the same frame and keeps the mapping's iteration order.
df = pd.DataFrame.from_dict(tpvafs,
                            orient='index',
                            columns=['1', '2', '3'])
display(df.describe())
# Summarise again after casting every column to float.
df = df.astype(float)
display(df.describe())
pd.plotting.scatter_matrix(df, alpha=0.2, figsize=(14, 8), diagonal='kde')

pca = PCA(n_components=3)  # random_state only available from 0.18.0 onwards
pca.fit(df)

pca_results = vs.pca_results(df, pca)
#vs.biplot(df, reduced_data, pca)
#pca = PCA(n_components=2)
#pca.fit(df)

reduced_data = pca.transform(df)
#reduced_data = pd.DataFrame(reduced_data, columns = ['Dimension 1', 'Dimension 2'])
# Candidate cluster counts tried so far: 2-12, 14, 20
components = [9]
All_Scores = []
for n_clusters in components:
    # Fit a mixture model with this many clusters on the PCA-projected data
    clusterer = GMM(n_components=n_clusters,
                    init_params='random').fit(reduced_data)

    # Predict the cluster for each data point
    preds = clusterer.predict(reduced_data)
예제 #7
0
# Everything from index october_1 onwards forms the test split
testing_features = features[october_1:]
testing_labels = labels[october_1:]

# Generate reduced features using PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
reduced_training_features = pca.fit_transform(training_features)
# Project the test split with the components learned from the training split.
# Refitting on the test data would put the two sets in different coordinate
# systems.
reduced_testing_features = pca.transform(testing_features)

# Visualize first 10 principal components of features
import visuals as vs
featuresDF = pd.DataFrame(data=features)
pca = PCA(n_components=10)
pca.fit(featuresDF)
pca_samples = pca.transform(featuresDF)
pca_results = vs.pca_results(featuresDF, pca)
 
# Benchmark Model: always predict the mean of the training labels
from sklearn.metrics import mean_absolute_error
preds = [np.mean(training_labels)] * len(testing_labels)
# Single-argument print(...) works on both Python 2 and Python 3
print('Benchmark Results:')
print('MAE: {}'.format(mean_absolute_error(testing_labels, preds)))

# Linear Regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(training_features, training_labels)
preds = lin_reg.predict(testing_features)
score = lin_reg.score(testing_features, testing_labels)
print('Linear Regression Results:')
print('R2 score: {}'.format(score))
# Remove the data point with only 1 feature which is outlier
# (drop() returns a new frame, so log_data itself is left untouched)
good_data = log_data.drop(log_data.index[outliers])

#####################################################################################################
# Apply PCA by fitting the good data with the same number of dimensions as features.
# good_data must stay in the original feature space: it is needed again for the
# results plot and for the two-dimensional fit below.
from sklearn.decomposition import PCA
pca = PCA(n_components=6)
pca.fit(good_data)

# Transform log_samples using the PCA fit above
pca_samples = pca.transform(log_samples)

# Generate PCA results plot from the data the PCA was fitted on
pca_results = vs.pca_results(good_data, pca)

#####################################################################################################

# Display sample log-data after having a PCA transformation applied
display(
    pd.DataFrame(np.round(pca_samples, 4), columns=pca_results.index.values))

#####################################################################################################

# Apply PCA by fitting the good data with only two dimensions
pca = PCA(n_components=2)

# Fit on, and transform, the good data in the original feature space
reduced_data = pca.fit_transform(good_data)