Example #1
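# Imports assumed by this listing but not shown in the original snippet. The
# module providing TAEngine is an assumption (it is the project's own
# technical-indicator engine); the rest are standard third-party packages.
import os
import collections

import numpy as np
import pandas as pd
import yfinance as yf
from tqdm import tqdm

from feature_generator import TAEngine  # hypothetical import path for TAEngine
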
class DataEngine:
    def __init__(self, history_to_use, data_granularity_minutes, is_save_dict,
                 is_load_dict, dict_path, min_volume_filter, is_test,
                 future_bars_for_testing, volatility_filter):
        print("Data engine has been initialized...")
        self.DATA_GRANULARITY_MINUTES = data_granularity_minutes
        self.IS_SAVE_DICT = is_save_dict
        self.IS_LOAD_DICT = is_load_dict
        self.DICT_PATH = dict_path
        self.VOLUME_FILTER = min_volume_filter
        self.FUTURE_FOR_TESTING = future_bars_for_testing
        self.IS_TEST = is_test
        self.VOLATILITY_THRESHOLD = volatility_filter

        # Stocks list
        self.directory_path = str(os.path.dirname(os.path.abspath(__file__)))
        self.stocks_file_path = self.directory_path + "/stocks/stocks.txt"
        self.stocks_list = []

        # Load stock names in a list
        self.load_stocks_from_file()

        # Load Technical Indicator engine
        self.taEngine = TAEngine(history_to_use=history_to_use)

        # Dictionary to store data. Data is only stored and saved to disk if is_save_dict is 1.
        self.features_dictionary_for_all_symbols = {}

        # Data length
        self.stock_data_length = []

    def load_stocks_from_file(self):
        """
        Load stock symbols from the stocks file.
        """
        print("Loading all stocks from file...")
        with open(self.stocks_file_path, "r") as stocks_file:
            stocks_list = [str(item).strip("\n") for item in stocks_file.readlines()]

        # Load symbols
        stocks_list = list(sorted(set(stocks_list)))
        print("Total number of stocks: %d" % len(stocks_list))
        self.stocks_list = stocks_list

    def get_most_frequent_key(self, input_list):
        # Return the most common value in the list; used to spot symbols whose
        # bar count deviates from the majority (i.e. partial/incomplete data).
        counter = collections.Counter(input_list)
        frequent_key = counter.most_common(1)[0][0]
        return frequent_key

    def get_data(self, symbol):
        """
		Get stock data from yahoo finance.
		"""

        # Find period
        if self.DATA_GRANULARITY_MINUTES == 1:
            period = "7d"
        else:
            period = "30d"

        # Get stock price
        try:
            # Stock price
            stock_prices = yf.download(
                tickers=symbol,
                period=period,
                interval=str(self.DATA_GRANULARITY_MINUTES) + "m",
                auto_adjust=False,
                progress=False)
            stock_prices = stock_prices.reset_index()
            stock_prices = stock_prices[[
                'Datetime', 'Open', 'High', 'Low', 'Close', 'Volume'
            ]]
            data_length = len(stock_prices.values.tolist())
            self.stock_data_length.append(data_length)

            # After getting some data, ignore partial data from yfinance based on number of data samples
            if len(self.stock_data_length) > 5:
                most_frequent_key = self.get_most_frequent_key(
                    self.stock_data_length)
                if data_length != most_frequent_key:
                    return [], [], True

            if self.IS_TEST == 1:
                stock_prices_list = stock_prices.values.tolist()
                # For some reason, yfinance gives some 0 values in the first index
                stock_prices_list = stock_prices_list[1:]
                future_prices_list = stock_prices_list[-(self.FUTURE_FOR_TESTING + 1):]
                historical_prices = stock_prices_list[:-self.FUTURE_FOR_TESTING]
                historical_prices = pd.DataFrame(historical_prices)
                historical_prices.columns = [
                    'Datetime', 'Open', 'High', 'Low', 'Close', 'Volume'
                ]
            else:
                # No testing
                stock_prices_list = stock_prices.values.tolist()
                stock_prices_list = stock_prices_list[1:]
                historical_prices = pd.DataFrame(stock_prices_list)
                historical_prices.columns = [
                    'Datetime', 'Open', 'High', 'Low', 'Close', 'Volume'
                ]
                future_prices_list = []

            if len(stock_prices.values.tolist()) == 0:
                return [], [], True
        except Exception:
            return [], [], True

        return historical_prices, future_prices_list, False

    def calculate_volatility(self, stock_price_data):
        CLOSE_PRICE_INDEX = 4
        stock_price_data_list = stock_price_data.values.tolist()
        close_prices = [
            item[CLOSE_PRICE_INDEX] for item in stock_price_data_list
        ]
        close_prices = [item for item in close_prices if item != 0]
        volatility = np.std(close_prices)
        return volatility

    def collect_data_for_all_tickers(self):
        """
		Iterates over all symbols and collects their data
		"""

        print("Loading data for all stocks...")
        features = []
        symbol_names = []
        historical_price_info = []
        future_price_info = []

        # Stocks with very low volatility are ignored; adjust the volatility_filter argument (VOLATILITY_THRESHOLD) to change that.
        for i in tqdm(range(len(self.stocks_list))):
            symbol = self.stocks_list[i]
            try:
                stock_price_data, future_prices, not_found = self.get_data(
                    symbol)

                if not not_found:
                    volatility = self.calculate_volatility(stock_price_data)

                    # Filter low volatility stocks
                    if volatility < self.VOLATILITY_THRESHOLD:
                        continue

                    features_dictionary = self.taEngine.get_technical_indicators(
                        stock_price_data)
                    feature_list = self.taEngine.get_features(
                        features_dictionary)

                    # Add to dictionary
                    self.features_dictionary_for_all_symbols[symbol] = {
                        "features": features_dictionary,
                        "current_prices": stock_price_data,
                        "future_prices": future_prices
                    }

                    if np.isnan(feature_list).any():
                        continue

                    # Check for volume
                    average_volume_last_30_tickers = np.mean(
                        list(stock_price_data["Volume"])[-30:])
                    if average_volume_last_30_tickers < self.VOLUME_FILTER:
                        continue

                    # Add to lists
                    features.append(feature_list)
                    symbol_names.append(symbol)
                    historical_price_info.append(stock_price_data)
                    future_price_info.append(future_prices)

                    # Save dictionary after every 100 symbols
                    if (len(self.features_dictionary_for_all_symbols) % 100 == 0
                            and self.IS_SAVE_DICT == 1):
                        np.save(self.DICT_PATH,
                                self.features_dictionary_for_all_symbols)

            except Exception as e:
                print("Exception", e)
                continue

        # Sometimes, there are some errors in feature generation or price extraction, let us remove that stuff
        features, historical_price_info, future_price_info, symbol_names = self.remove_bad_data(
            features, historical_price_info, future_price_info, symbol_names)

        return features, historical_price_info, future_price_info, symbol_names

    def load_data_from_dictionary(self):
        # Load data from dictionary
        print("Loading data from dictionary")
        dictionary_data = np.load(self.DICT_PATH, allow_pickle=True).item()

        features = []
        symbol_names = []
        historical_price_info = []
        future_price_info = []
        for symbol in dictionary_data:
            feature_list = self.taEngine.get_features(
                dictionary_data[symbol]["features"])
            current_prices = dictionary_data[symbol]["current_prices"]
            future_prices = dictionary_data[symbol]["future_prices"]

            # Check if there is any null value
            if np.isnan(feature_list).any():
                continue

            features.append(feature_list)
            symbol_names.append(symbol)
            historical_price_info.append(current_prices)
            future_price_info.append(future_prices)

        # Sometimes, there are some errors in feature generation or price extraction, let us remove that stuff
        features, historical_price_info, future_price_info, symbol_names = self.remove_bad_data(
            features, historical_price_info, future_price_info, symbol_names)

        return features, historical_price_info, future_price_info, symbol_names

    def remove_bad_data(self, features, historical_price_info,
                        future_price_info, symbol_names):
        """
		Remove bad data i.e data that had some errors while scraping or feature generation
		"""
        length_dictionary = collections.Counter(
            [len(feature) for feature in features])
        length_dictionary = list(length_dictionary.keys())
        most_common_length = length_dictionary[0]

        filtered_features, filtered_historical_price, filtered_future_prices, filtered_symbols = [], [], [], []
        for i in range(0, len(features)):
            if len(features[i]) == most_common_length:
                filtered_features.append(features[i])
                filtered_symbols.append(symbol_names[i])
                filtered_historical_price.append(historical_price_info[i])
                filtered_future_prices.append(future_price_info[i])

        return filtered_features, filtered_historical_price, filtered_future_prices, filtered_symbols
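
A minimal usage sketch for the class above; the constructor argument values are illustrative assumptions, not settings taken from the original project:

engine = DataEngine(history_to_use=60, data_granularity_minutes=15,
                    is_save_dict=1, is_load_dict=0, dict_path="data_dict.npy",
                    min_volume_filter=10000, is_test=0,
                    future_bars_for_testing=25, volatility_filter=0.05)
features, historical_prices, future_prices, symbols = engine.collect_data_for_all_tickers()

When is_save_dict is 1, collected features are periodically written to dict_path with np.save; load_data_from_dictionary() reads them back instead of re-downloading prices.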
Example #2
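# Imports assumed by this listing but not shown in the original snippet. This
# variant additionally needs the python-binance client, pytz, and the datetime
# module; the TAEngine import path is an assumption.
import os
import collections
import datetime as dt
from datetime import datetime

import numpy as np
import pandas as pd
import pytz
import yfinance as yf
from tqdm import tqdm
from binance.client import Client

from feature_generator import TAEngine  # hypothetical import path for TAEngine
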
class DataEngine:
    def __init__(self, history_to_use, data_granularity_minutes, is_save_dict,
                 is_load_dict, dict_path, min_volume_filter, is_test,
                 future_bars_for_testing, volatility_filter, stocks_list,
                 data_source):
        print("Data engine has been initialized...")
        self.DATA_GRANULARITY_MINUTES = data_granularity_minutes
        self.IS_SAVE_DICT = is_save_dict
        self.IS_LOAD_DICT = is_load_dict
        self.DICT_PATH = dict_path
        self.VOLUME_FILTER = min_volume_filter
        self.FUTURE_FOR_TESTING = future_bars_for_testing
        self.IS_TEST = is_test
        self.VOLATILITY_THRESHOLD = volatility_filter
        self.DATA_SOURCE = data_source

        # Stocks list
        self.directory_path = str(os.path.dirname(os.path.abspath(__file__)))
        self.stocks_file_path = self.directory_path + f"/stocks/{stocks_list}"
        self.stocks_list = []

        # Load stock names in a list
        self.load_stocks_from_file()

        # Load Technical Indicator engine
        self.taEngine = TAEngine(history_to_use=history_to_use)

        # Dictionary to store data. Data is only stored and saved to disk if is_save_dict is 1.
        self.features_dictionary_for_all_symbols = {}

        # Data length
        self.stock_data_length = []

        # Create an instance of the Binance Client with no api key and no secret (api key and secret not required for the functionality needed for this script)
        self.binance_client = Client("", "")

    def load_stocks_from_file(self):
        """
        Load stock symbols from the stocks file, skipping blank lines.
        """
        print("Loading all stocks from file...")
        with open(self.stocks_file_path, "r") as stocks_file:
            stocks_list = [
                str(item).strip("\n") for item in stocks_file.readlines()
                if item != "\n"
            ]

        # Load symbols
        stocks_list = list(sorted(set(stocks_list)))
        print("Total number of stocks: %d" % len(stocks_list))
        self.stocks_list = stocks_list

    def get_most_frequent_key(self, input_list):
        # Return the most common value in the list; used to spot symbols whose
        # bar count deviates from the majority (i.e. partial/incomplete data).
        counter = collections.Counter(input_list)
        frequent_key = counter.most_common(1)[0][0]
        return frequent_key

    def get_data(self, symbol, drop_last_bar):
        """
		Get stock data.
		"""

        # Find period
        if self.DATA_GRANULARITY_MINUTES == 1:
            period = "7d"
        else:
            period = "30d"

        try:
            # Get crypto prices from Binance
            if self.DATA_SOURCE == 'binance':
                # The Binance client does not accept "60m" as an interval, so use "1h"
                if self.DATA_GRANULARITY_MINUTES == 60:
                    interval = '1h'
                else:
                    interval = str(self.DATA_GRANULARITY_MINUTES) + "m"
                stock_prices = self.binance_client.get_klines(
                    symbol=symbol, interval=interval)
                # ensure that stock prices contains some data, otherwise the pandas operations below could fail
                if len(stock_prices) == 0:
                    return [], [], True
                # convert list to pandas dataframe
                stock_prices = pd.DataFrame(stock_prices,
                                            columns=[
                                                'Datetime', 'Open', 'High',
                                                'Low', 'Close', 'Volume',
                                                'close_time', 'quote_av',
                                                'trades', 'tb_base_av',
                                                'tb_quote_av', 'ignore'
                                            ])
                stock_prices['Datetime'] = stock_prices['Datetime'].astype(
                    float)
                stock_prices['Open'] = stock_prices['Open'].astype(float)
                stock_prices['High'] = stock_prices['High'].astype(float)
                stock_prices['Low'] = stock_prices['Low'].astype(float)
                stock_prices['Close'] = stock_prices['Close'].astype(float)
                stock_prices['Volume'] = stock_prices['Volume'].astype(float)
            # get stock prices from yahoo finance
            else:
                stock_prices = yf.download(
                    tickers=symbol,
                    period=period,
                    interval=str(self.DATA_GRANULARITY_MINUTES) + "m",
                    auto_adjust=False,
                    progress=False)
            stock_prices = stock_prices.reset_index()
            stock_prices = stock_prices[[
                'Datetime', 'Open', 'High', 'Low', 'Close', 'Volume'
            ]]

            if drop_last_bar:
                stock_prices = stock_prices.loc[0:len(stock_prices.index) - 2]

            data_length = len(stock_prices.values.tolist())
            self.stock_data_length.append(data_length)

            # After getting some data, ignore partial data based on number of data samples
            if len(self.stock_data_length) > 5:
                most_frequent_key = self.get_most_frequent_key(
                    self.stock_data_length)
                if data_length != most_frequent_key:
                    return [], [], True

            if self.IS_TEST == 1:
                stock_prices_list = stock_prices.values.tolist()
                # For some reason, yfinance gives some 0 values in the first index
                stock_prices_list = stock_prices_list[1:]
                future_prices_list = stock_prices_list[-(self.FUTURE_FOR_TESTING + 1):]
                historical_prices = stock_prices_list[:-self.FUTURE_FOR_TESTING]
                historical_prices = pd.DataFrame(historical_prices)
                historical_prices.columns = [
                    'Datetime', 'Open', 'High', 'Low', 'Close', 'Volume'
                ]
            else:
                # No testing
                stock_prices_list = stock_prices.values.tolist()
                stock_prices_list = stock_prices_list[1:]
                historical_prices = pd.DataFrame(stock_prices_list)
                historical_prices.columns = [
                    'Datetime', 'Open', 'High', 'Low', 'Close', 'Volume'
                ]
                future_prices_list = []

            if len(stock_prices.values.tolist()) == 0:
                return [], [], True
        except Exception:
            return [], [], True

        return historical_prices, future_prices_list, False

    def calculate_volatility(self, stock_price_data):
        CLOSE_PRICE_INDEX = 4
        stock_price_data_list = stock_price_data.values.tolist()
        close_prices = [
            float(item[CLOSE_PRICE_INDEX]) for item in stock_price_data_list
        ]
        close_prices = [item for item in close_prices if item != 0]
        volatility = np.std(close_prices)
        return volatility

    def is_nse_open(self):
        """
        Return True if the current time falls within NSE regular trading hours
        (09:15-15:30 IST). Weekends and market holidays are not checked.
        """
        utc_timezone = pytz.timezone("UTC")
        ist_timezone = pytz.timezone("Asia/Kolkata")
        utc_time = utc_timezone.localize(datetime.utcnow())
        ist_time = utc_time.astimezone(ist_timezone)

        return dt.time(9, 15) <= ist_time.time() <= dt.time(15, 30)

    def collect_data_for_all_tickers(self):
        """
		Iterates over all symbols and collects their data
		"""

        print("Loading data for all stocks...")
        drop_last_bar = self.is_nse_open()
        if drop_last_bar:
            print(
                "The NSE is open: dropping most recent bar because it is likely shorter than {}m!"
                .format(self.DATA_GRANULARITY_MINUTES))

        features = []
        symbol_names = []
        historical_price_info = []
        future_price_info = []

        # Stocks with very low volatility are ignored; adjust the volatility_filter argument (VOLATILITY_THRESHOLD) to change that.
        for i in tqdm(range(len(self.stocks_list))):
            symbol = self.stocks_list[i]
            try:
                stock_price_data, future_prices, not_found = self.get_data(
                    symbol, drop_last_bar)

                if not not_found:
                    volatility = self.calculate_volatility(stock_price_data)

                    # Filter low volatility stocks
                    if volatility < self.VOLATILITY_THRESHOLD:
                        continue

                    features_dictionary = self.taEngine.get_technical_indicators(
                        stock_price_data)
                    feature_list = self.taEngine.get_features(
                        features_dictionary)

                    # Add to dictionary
                    self.features_dictionary_for_all_symbols[symbol] = {
                        "features": features_dictionary,
                        "current_prices": stock_price_data,
                        "future_prices": future_prices
                    }

                    # Save dictionary after every 100 symbols
                    if (len(self.features_dictionary_for_all_symbols) % 100 == 0
                            and self.IS_SAVE_DICT == 1):
                        np.save(self.DICT_PATH,
                                self.features_dictionary_for_all_symbols)

                    if np.isnan(feature_list).any():
                        continue

                    # Check for volume
                    average_volume_last_30_tickers = np.mean(
                        list(stock_price_data["Volume"])[-30:])
                    if average_volume_last_30_tickers < self.VOLUME_FILTER:
                        continue

                    # Add to lists
                    features.append(feature_list)
                    symbol_names.append(symbol)
                    historical_price_info.append(stock_price_data)
                    future_price_info.append(future_prices)

            except Exception as e:
                print("Exception", e)
                continue

        # Sometimes, there are some errors in feature generation or price extraction, let us remove that stuff
        features, historical_price_info, future_price_info, symbol_names = self.remove_bad_data(
            features, historical_price_info, future_price_info, symbol_names)

        return features, historical_price_info, future_price_info, symbol_names

    def load_data_from_dictionary(self):
        # Load data from dictionary
        print("Loading data from dictionary")
        dictionary_data = np.load(self.DICT_PATH, allow_pickle=True).item()

        features = []
        symbol_names = []
        historical_price_info = []
        future_price_info = []
        for symbol in dictionary_data:
            feature_list = self.taEngine.get_features(
                dictionary_data[symbol]["features"])
            current_prices = dictionary_data[symbol]["current_prices"]
            future_prices = dictionary_data[symbol]["future_prices"]

            # Check if there is any null value
            if np.isnan(feature_list).any():
                continue

            features.append(feature_list)
            symbol_names.append(symbol)
            historical_price_info.append(current_prices)
            future_price_info.append(future_prices)

        # Sometimes, there are some errors in feature generation or price extraction, let us remove that stuff
        features, historical_price_info, future_price_info, symbol_names = self.remove_bad_data(
            features, historical_price_info, future_price_info, symbol_names)

        return features, historical_price_info, future_price_info, symbol_names

    def remove_bad_data(self, features, historical_price_info,
                        future_price_info, symbol_names):
        """
		Remove bad data i.e data that had some errors while scraping or feature generation
		"""
        length_dictionary = collections.Counter(
            [len(feature) for feature in features])
        length_dictionary = list(length_dictionary.keys())
        most_common_length = length_dictionary[0]

        filtered_features, filtered_historical_price, filtered_future_prices, filtered_symbols = [], [], [], []
        for i in range(0, len(features)):
            if len(features[i]) == most_common_length:
                filtered_features.append(features[i])
                filtered_symbols.append(symbol_names[i])
                filtered_historical_price.append(historical_price_info[i])
                filtered_future_prices.append(future_price_info[i])

        return filtered_features, filtered_historical_price, filtered_future_prices, filtered_symbols
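
A usage sketch for this variant with the Binance data source; the argument values and the stocks file name are illustrative assumptions (with data_source="binance", the file is expected to list Binance pair symbols such as BTCUSDT):

engine = DataEngine(history_to_use=60, data_granularity_minutes=15,
                    is_save_dict=1, is_load_dict=0, dict_path="data_dict.npy",
                    min_volume_filter=10000, is_test=0,
                    future_bars_for_testing=25, volatility_filter=0.05,
                    stocks_list="stocks.txt", data_source="binance")
features, historical_prices, future_prices, symbols = engine.collect_data_for_all_tickers()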