Пример #1
0
 def main(self):
     """
     Run the complete stock price prediction pipeline.

     Steps, in order: fetch the stock data, preprocess it, build the listed
     regression models, forecast future prices with them and plot the
     forecasts. The API key handed to ``GetData`` is taken from the first
     command line argument (``sys.argv[1]``).

     :return: None
     """
     logger.info(
         "------------------Started Stock Price Prediction-----------------"
     )
     # Helper objects for each stage of the pipeline.
     data_source = GetData(api_key=sys.argv[1])
     # Size of the forecast horizon, expressed as a percentage of the number
     # of historical data points that are already available.
     future_prediction_pcnt = 1
     preprocessor = PreprocessData(
         future_prediction_pcnt=future_prediction_pcnt)
     model_builder = BuildModels()
     forecaster = Predictions()

     # Stage 1: load the stock data (no forced refresh).
     raw_df = data_source.get_stock_data(update_data=False)

     # Stage 2: split the data per ticker and derive the model inputs.
     preprocessing_result = preprocessor.preprocess_data(
         raw_df, data_source.stock_ticker_list)
     preprocessed_data_dict, original_df_dict = preprocessing_result

     # Stage 3: build the models (previously saved ones may be reused).
     algorithm_names = [
         "Linear Regression",
         "Decision Tree Regressor",
         "Random Forest Regressor",
     ]
     models_dict, _model_scores_dict = model_builder.build_models(
         algorithm_names, preprocessed_data_dict, force_build=False)

     # Stage 4: forecast future prices and plot them.
     forecast_df_dict = forecaster.make_predictions(
         models_dict, preprocessed_data_dict, original_df_dict)
     self.plot_forecast(forecast_df_dict, original_df_dict,
                        future_prediction_pcnt)
    def make_prediction(self, model_name, model_for_each_ticker_dict,
                        preprocessed_data_dict, original_df_dict):
        """
        Forecast prices for every ticker using that ticker's model.

        :param model_name: str, name of the model; only used in the log output.
        :param model_for_each_ticker_dict: dict, ticker keys (written with "_"
            where the ticker symbol has "/") mapped to objects exposing
            ``predict``.
        :param preprocessed_data_dict: dict, ticker symbols mapped to a sequence
            whose item at index 1 holds the features to forecast from.
        :param original_df_dict: dict, ticker symbols mapped to the original
            dataframes.
        :return forecast_df_dict: dict, ticker symbols mapped to a deep copy of
            the original dataframe with NaN rows dropped and a
            "<ticker> - Forecast" column added.
        """
        banner = "----------------Predicting future prices using the {} model----------------"
        logger.info(banner.format(model_name))

        forecasts = {}
        for stored_key, fitted_model in model_for_each_ticker_dict.items():
            # Turn the stored key back into the "/"-separated ticker symbol
            # used as key by the other dictionaries.
            symbol = stored_key.replace("_", "/")
            logger.info("Predicting future prices for {}".format(symbol))

            # Work on a private copy so the caller's dataframe stays untouched.
            frame = original_df_dict[symbol].copy(deep=True)
            frame.dropna(inplace=True)

            features = preprocessed_data_dict[symbol][1]
            logger.debug("len(X_forecast) = {}".format(len(features)))

            forecast_column = "{} - Forecast".format(symbol)
            frame[forecast_column] = fitted_model.predict(features)
            forecasts[symbol] = frame
        return forecasts
Пример #3
0
	def get_built_models(self):
		"""
		Return the models that have already been built.

		:return built_models_dict: dictionary containing model names as keys and built model objects as values.
		:raises SystemExit: with exit status 1 when ``self.built_models_dict`` is empty (falsy).
		"""
		# Local import so this method does not depend on a module-level import of sys.
		import sys

		if not self.built_models_dict:
			# Logged as an error because the process terminates right after this message.
			logger.error("No models found. Run build_models first and then call this method.")
			# sys.exit() rather than the exit() helper: the latter is injected by the
			# site module for interactive use and is not guaranteed to be defined.
			sys.exit(1)
		return self.built_models_dict
Пример #4
0
	def load_from_pickle_file(self, model_name, ticker_symbol, obj_name):
		"""
		Load a previously saved object from its pickle file.

		The file is read from
		``<self.saved_models_dir>/<model_name>_<ticker_symbol>_<obj_name>.pickle``.

		:param model_name: str, name of the model.
		:param ticker_symbol: str, ticker symbol.
		:param obj_name: str, name of the built model object.
		:return loaded_obj: object, the unpickled object.
		"""
		logger.info("Loading {} model for {} from pickle file".format(model_name, ticker_symbol))
		pickle_path = "{}/{}_{}_{}.pickle".format(
			self.saved_models_dir, model_name, ticker_symbol, obj_name)
		# NOTE: unpickling can execute arbitrary code; only files written by this
		# project should ever be placed in the saved models directory.
		# The context manager closes the file handle even if unpickling fails.
		with open(pickle_path, "rb") as pickle_in:
			loaded_obj = pickle.load(pickle_in)
		return loaded_obj
Пример #5
0
	def save_to_pickle_file(self, model_name, ticker_symbol, obj_to_be_saved, obj_name):
		"""
		Save an object to a pickle file.

		The file is written to
		``<self.saved_models_dir>/<model_name>_<ticker_symbol>_<obj_name>.pickle``.

		:param model_name: str, name of the model.
		:param ticker_symbol: str, ticker symbol.
		:param obj_to_be_saved: object, model object.
		:param obj_name: str, name of the built model object.
		:return None:
		"""
		logger.info("Saving {} model for {} to pickle file".format(model_name, ticker_symbol))
		pickle_path = "{}/{}_{}_{}.pickle".format(
			self.saved_models_dir, model_name, ticker_symbol, obj_name)
		# The context manager closes the file handle even if pickling raises.
		with open(pickle_path, "wb") as pickle_out:
			pickle.dump(obj_to_be_saved, pickle_out)
Пример #6
0
 def plot_forecast(self,
                   forecast_df_dict,
                   original_df_dict,
                   future_prediction_pcnt=1):
     """
     Plot forecast and actual values for every model/ticker pair and save
     each plot as a png file.

     One file is written per model and ticker to
     ``<self.stock_price_plots_dir>/<model_name>_<ticker>.png`` with "/" in
     the ticker symbol replaced by "_".

     :param forecast_df_dict: dict, model names mapped to dictionaries that
         map ticker symbols to dataframes holding a "<ticker> - Forecast"
         column.
     :param original_df_dict: dict, ticker symbols mapped to the original
         dataframes.
     :param future_prediction_pcnt: float, size of the forecast horizon as a
         percentage of the number of rows in the forecast dataframe.
     :return: None
     """
     for model_name, per_ticker_frames in forecast_df_dict.items():
         logger.info(
             "----------------Plotting stock prices for {} model----------------"
             .format(model_name))
         for ticker_symbol, forecast_frame in per_ticker_frames.items():
             # The part of the ticker before "/" selects the actual-value column.
             domain = ticker_symbol.split("/")[0]
             actual_frame = original_df_dict[ticker_symbol].dropna()
             actual_frame = actual_frame.reset_index()
             forecast_frame = forecast_frame.reset_index()
             actual_col_by_domain = {
                 "WIKI": "{} - Adj. Close".format(ticker_symbol),
                 "BCB": "{} - Value".format(ticker_symbol),
                 "NASDAQOMX": "{} - Index Value".format(ticker_symbol)
             }
             logger.info(
                 "----------------Plotting stock prices for {}".format(
                     ticker_symbol))
             # Number of rows covered by the forecast horizon.
             rows_ahead = int(
                 math.ceil(
                     future_prediction_pcnt * 0.01 * len(forecast_frame)))
             shifted_dates = actual_frame["Date"].shift(-rows_ahead)
             actual_frame["Date"] = shifted_dates

             # Forecast in blue, actual values in green.
             forecast_series = forecast_frame["{} - Forecast".format(
                 ticker_symbol)]
             forecast_series.plot(color='b')
             actual_series = actual_frame[actual_col_by_domain[domain]]
             actual_series.plot(color='g')

             plt.legend(loc="best")
             plt.xlabel("Date")
             plt.ylabel("Price")
             plot_title = "Forecast for {} model for {}".format(
                 model_name, ticker_symbol)
             plt.title(plot_title)

             file_safe_ticker = ticker_symbol.replace("/", "_")
             plot_path = "{}/{}_{}.png".format(
                 self.stock_price_plots_dir, model_name, file_safe_ticker)
             plt.savefig(plot_path)
             # Reset pyplot state so the next ticker starts from a clean figure.
             plt.clf()
             plt.close()
Пример #7
0
	def build_model(self, model_name, preprocessed_data_dict, force_build):
		"""
		Build (or load) one model per ticker for the given algorithm and evaluate it.

		For every ticker the model is either fitted from scratch and pickled, or
		loaded from an existing pickle. It is then scored, a learning curve plot
		is written to ``<cwd>/learning_curve_plots/`` and cross validation scores
		are logged.

		:param model_name: str, name of the model to be built.
		:param preprocessed_data_dict: dict, ticker symbols mapped to ``[X, X_forecast, y]``.
		:param force_build: bool, if True the model is rebuilt even when a saved model from an earlier run is
		available.
		:return model_dict: dict, ticker keys ("/" replaced by "_") mapped to the model objects.
		:return model_scores_dict: dict, the same ticker keys mapped to the score returned by ``model.score``.
		"""
		logger.info("----------------Building model using {}----------------".format(model_name))
		working_dir = os.getcwd()
		fitted_models = {}
		scores = {}
		for raw_symbol, dataset in preprocessed_data_dict.items():
			# The forecast features are not needed for building the model.
			X, _X_forecast, y = dataset
			# Time-ordered cv iterator shared by the split, the learning curve and cross validation.
			splitter = TimeSeriesSplit(n_splits=5)
			symbol_key = raw_symbol.replace("/", "_")
			# NOTE(review): this check reads self.saved_models_path while the pickle helpers in this
			# project read self.saved_models_dir - confirm that both point to the same directory.
			reuse_saved = not force_build and os.path.exists(
				"{}/{}_{}_model.pickle".format(self.saved_models_path, model_name, symbol_key))
			if reuse_saved:
				pipeline = self.load_from_pickle_file(model_name, symbol_key, "model")
				X_train, X_test, y_train, y_test = self.get_train_and_test_data(X, y, splitter)
			else:
				# Tune the hyperparameters, then fit the scaled pipeline and persist it.
				tuned_estimator = self.optimize_hyperparameters(model_name, splitter)
				pipeline = make_pipeline(StandardScaler(), tuned_estimator)
				X_train, X_test, y_train, y_test = self.get_train_and_test_data(X, y, splitter)
				pipeline.fit(X_train, y_train)
				self.save_to_pickle_file(model_name, symbol_key, pipeline, "model")

			# NOTE(review): logged below as "Training score" but computed on X_test / y_test.
			holdout_score = pipeline.score(X_test, y_test)

			# Learning curve plot
			curve_title = "{}_{}_Learning Curves".format(model_name, symbol_key)
			curve_path = "{}/learning_curve_plots/{}_{}.png".format(working_dir, model_name, symbol_key)
			self.plot_learning_curve(pipeline, curve_title, X, y, curve_path, cv=splitter)

			# Cross validation
			cv_results = cross_validate(pipeline, X=X, y=y, cv=splitter)
			fold_scores = cv_results["test_score"]
			logger.info("Training score for {} = {}".format(symbol_key, holdout_score))
			logger.debug("Cross validation scores for {} = {}".format(symbol_key, fold_scores))
			logger.info("Cross validation score for {} = {} +/- {}".format(
				symbol_key, fold_scores.mean(), fold_scores.std() * 2))
			logger.debug("Cross validation scoring time = {}s".format(cv_results["score_time"].sum()))

			fitted_models[symbol_key] = pipeline
			scores[symbol_key] = holdout_score
		return fitted_models, scores
Пример #8
0
 def get_stock_data(self, update_data=False):
     """
     Return the stock data for the tickers in ``self._stock_ticker_list``.

     The data is downloaded from quandl and written to the csv file at
     ``self.stock_data_path`` when ``update_data`` is true or when that file
     does not exist (checked relative to the current working directory).
     The returned dataframe is always read back from the csv file.

     :param update_data: bool, forces a fresh download when True.
     :return df: Dataframe read from the csv file.
     """
     logger.info(
         "----------------Getting stock data from Quandl----------------")
     logger.info("Stock ticker list = {}".format(self._stock_ticker_list))

     # Download when asked to, or when there is no csv file to read from yet.
     must_download = update_data or not os.path.exists(
         "{}/{}".format(os.getcwd(), self.stock_data_path))
     if must_download:
         downloaded = quandl.get(self._stock_ticker_list)
         logger.info(
             "Writing stock data to {}".format(self.stock_data_path))
         downloaded.to_csv("{}".format(self.stock_data_path))

     # Always go through the csv file so both paths return the same layout.
     logger.info("Reading stock data from {}".format(self.stock_data_path))
     stock_frame = pd.read_csv("{}".format(self.stock_data_path))
     logger.debug("df.shape = {}".format(stock_frame.shape))
     return stock_frame
Пример #9
0
    def preprocess_data(self, df, ticker_symbol_list):
        """
        Preprocess stock data into per-ticker training and forecasting arrays.

        For every ticker in ``self.original_df_dict`` the feature columns are
        selected (for the "WIKI" domain the high/low and open/close percentage
        features are added first), a ``label`` column is created from the
        domain's price column and shifted back by the forecast horizon, and the
        result is converted to numpy arrays.

        :param df: dataframe, original dataframe
        :param ticker_symbol_list: list, list of ticker symbols
        :return preprocessed_data_dict: dict, dictionary with ticker symbols as keys and ``[X, X_forecast, y]``
            lists of numpy arrays as values. ``X_forecast`` holds the features of all rows without NaN values;
            ``X`` and ``y`` hold the training features and the shifted labels.
        :return original_df_dict: dict, dictionary with ticker symbols as keys, original stock data dataframes as
            values
        :raises KeyError: if a ticker's domain is not one of "WIKI", "BCB" or "NASDAQOMX".
        """
        self.ticker_symbol_list = ticker_symbol_list
        logger.info("----------------Pre-processing data----------------")
        # Extract data frames for each ticker from the original data frame and put it in a dictionary.
        self.get_df_for_each_ticker(df)
        useful_features = ["Adj. Close", "HL_PCT", "PCT_change", "Adj. Volume"]
        for ticker_symbol, original_df in self.original_df_dict.items():
            ticker_domain = ticker_symbol.split("/")[0]
            feature_list = self.get_feature_list(ticker_domain)
            logger.debug("Feature list for {} = {}".format(
                ticker_symbol, feature_list))
            # Column names have the form "<ticker> - <feature>".
            preprocessed_feature_list = [
                "{} - {}".format(ticker_symbol, feature)
                for feature in feature_list
            ]
            preprocessed_df = original_df[preprocessed_feature_list].copy(
                deep=True)
            if ticker_domain == "WIKI":
                # Compute high to low and open to close stock price percentage values and add them to feature list
                preprocessed_df = self.get_high_to_low_pcnt_change(
                    preprocessed_df, ticker_symbol)
                preprocessed_df = self.get_open_to_close_pcnt_change(
                    preprocessed_df, ticker_symbol)
                preprocessed_feature_list = [
                    "{} - {}".format(ticker_symbol, feature)
                    for feature in useful_features
                ]
                # Explicit copy: the frame is modified in place below and must
                # not be a view/selection of another frame.
                preprocessed_df = preprocessed_df[
                    preprocessed_feature_list].copy()
            # Forecast column labels depending on the domain
            forecast_col_labels = {
                "WIKI": "{} - Adj. Close".format(ticker_symbol),
                "BCB": "{} - Value".format(ticker_symbol),
                "NASDAQOMX": "{} - Index Value".format(ticker_symbol)
            }
            preprocessed_df.dropna(inplace=True)
            preprocessed_df["label"] = preprocessed_df[
                forecast_col_labels[ticker_domain]]
            # axis is passed by keyword: the positional form of DataFrame.drop
            # is rejected by pandas >= 2.0.
            X_forecast = np.array(preprocessed_df.drop(["label"], axis=1))
            # Number of future data points to be predicted.
            forecast_out = int(
                math.ceil(self.future_prediction_pcnt * 0.01 *
                          len(preprocessed_df)))
            # Keep only the leading part of the data for training; copy so the
            # label assignment below does not write into a slice.
            train_row_count = int(
                (1 - self.future_prediction_pcnt * 0.01) *
                len(preprocessed_df))
            preprocessed_df = preprocessed_df.iloc[0:train_row_count, :].copy()
            # Each row's label becomes the price forecast_out rows later; the
            # trailing rows end up without a label and are dropped.
            preprocessed_df["label"] = preprocessed_df["label"].shift(
                -forecast_out)
            preprocessed_df.dropna(inplace=True)
            X = np.array(preprocessed_df.drop(["label"], axis=1))
            # NOTE(review): a further forecast_out rows are trimmed here after
            # the NaN rows were already dropped - confirm this is intended.
            X = X[:-forecast_out]
            y = np.array(preprocessed_df["label"])
            y = y[:-forecast_out]
            self.preprocessed_data_dict[ticker_symbol] = [X, X_forecast, y]
        return self.preprocessed_data_dict, self.original_df_dict
Пример #10
0
	def plot_learning_curve(self, estimator, title, X, y, save_file_path, ylim=None, cv=None,
													train_sizes=np.linspace(.1, 1.0, 5)):
		"""
		Plot training and cross-validation learning curves and save the figure.

		Parameters
		----------
		estimator : object
				Estimator handed to ``learning_curve``.

		title : string
				Title for the chart; also written to the log.

		X : array-like
				Feature data handed to ``learning_curve``.

		y : array-like
				Target data handed to ``learning_curve``.

		save_file_path : string
				Path of the image file the figure is saved to.

		ylim : tuple (ymin, ymax), optional
				Limits of the y axis; left at the plotting default when None.

		cv : optional
				Cross-validation setting passed through unchanged to
				``learning_curve``.

		train_sizes : array-like
				Training set sizes passed through unchanged to
				``learning_curve`` (default: np.linspace(0.1, 1.0, 5)).

		Returns
		-------
		None
		"""
		logger.info("Plotting {}".format(title))
		plt.figure()
		plt.title(title)
		if ylim is not None:
			plt.ylim(*ylim)
		plt.xlabel("Training examples")
		plt.ylabel("Score")

		sizes, fit_scores, cv_scores = learning_curve(
			estimator, X, y, cv=cv, train_sizes=train_sizes)

		# One entry per curve: (mean over splits, std over splits, colour, legend label).
		curves = [
			(np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), "r", "Training score"),
			(np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1), "g", "Cross-validation score"),
		]
		plt.grid()

		# Shaded band of one standard deviation around each mean curve.
		for mean, std, colour, _label in curves:
			plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
		# Mean curves on top of the bands.
		for mean, _std, colour, label in curves:
			plt.plot(sizes, mean, 'o-', color=colour, label=label)

		plt.legend(loc="best")
		plt.savefig("{}".format(save_file_path))
		plt.close()