def getPreprocessedData(dataName, dataFrequency, autoConfigFileRelativePath, KEY_preProcessedDataFilePath):
    """Load the pre-processed candle data for a symbol/frequency from CSV.

    Parameters
    ----------
    dataName : str
        Dataset/symbol name; top-level key in the auto config.
    dataFrequency : str
        Candle frequency key (e.g. '1minute') under dataName in the config.
    autoConfigFileRelativePath : str
        Project-root-relative path of the auto config file; used only for the
        progress print below.
    KEY_preProcessedDataFilePath : str
        Config key holding the CSV path (normally 'preProcessedDataFilePath').

    Returns
    -------
    pandas.DataFrame
        Contents of the pre-processed CSV. Truth source — callers should
        treat it as read-only reference data.
    """
    import pandas as pd
    from utilities.fileFolderManipulations import getJupyterRootDirectory
    from config.environment import getAppConfigData

    # Deployment directory of the current jupyter node on this machine.
    jupyterNodePath = getJupyterRootDirectory()
    print("jupyterNodePath >>> " + jupyterNodePath)

    configFilePath = jupyterNodePath + autoConfigFileRelativePath
    print("configFilePath >>> " + configFilePath)

    autoConfigData = getAppConfigData()
    preProcessedDataFilePath = autoConfigData[dataName][dataFrequency][KEY_preProcessedDataFilePath]

    # BUG FIX: the configured path is stored relative to the project root
    # (preProcessData strips jupyterNodePath before saving it), so it must be
    # re-prefixed here — consistent with getQuantityBasedFeatures and
    # doBasicOperation, which both read `jupyterNodePath + path`.
    inputRawProcessedDataDF = pd.read_csv(jupyterNodePath + preProcessedDataFilePath)
    return inputRawProcessedDataDF
def getQuantityBasedFeatures(dataName, dataFrequency):
    """Build two quantity-derived feature columns for the configured dataset.

    Reads the pre-processed CSV whose project-relative path is stored in the
    auto config under dataName/dataFrequency, then derives:
      * mean-centred traded quantity scaled by its maximum, and
      * mean-centred candle-to-candle quantity change scaled by its maximum.

    Returns
    -------
    pandas.DataFrame
        Two columns (both inherit the source name 'quantity'), one row per
        candle; the first row of the difference column is NaN (no precursor).
    """
    import pandas as pd
    import numpy as np
    from utilities.fileFolderManipulations import getJupyterRootDirectory
    from config.environment import getAppConfigData

    # The CSV location is recorded relative to the project root.
    rootPath = getJupyterRootDirectory()
    configData = getAppConfigData()
    csvRelativePath = configData[dataName][dataFrequency]['preProcessedDataFilePath']

    candlesDf = pd.read_csv(rootPath + csvRelativePath)
    quantitySeries = candlesDf['quantity']

    # Centre on the mean and scale by the maximum (note: divisor is the max,
    # not the standard deviation — this is not a z-score).
    normalizedQuantity = (quantitySeries - np.mean(quantitySeries)) / np.max(quantitySeries)

    # Same normalisation applied to the first difference of quantity.
    quantityDelta = quantitySeries - quantitySeries.shift(1)
    normalizedDelta = (quantityDelta - np.mean(quantityDelta)) / np.max(quantityDelta)

    return pd.concat([normalizedQuantity, normalizedDelta], axis=1)
def getAppConfigData():
    """Load the project's auto-generated JSON config (src/config/config.json).

    If the file is missing or does not parse as JSON, a config file containing
    '{}' is written and an empty dict is returned.

    Returns
    -------
    dict
        Parsed configuration, or {} when the file had to be (re)created.

    Raises
    ------
    Whatever getJupyterRootDirectory raises if the project root cannot be
    determined (the old bare `except:` masked that case and returned None).
    """
    import json
    from utilities.fileFolderManipulations import getJupyterRootDirectory

    # Resolve the path OUTSIDE the try so the handler can never reference an
    # unassigned configFilePath (the old code could NameError/TypeError here).
    projectRootDirectory = getJupyterRootDirectory()
    configFilePath = projectRootDirectory + "/src/config/config.json"
    print(' retrieving values configured in >>> ' + configFilePath)

    try:
        with open(configFilePath) as json_data_file:
            data = json.load(json_data_file)
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError: invalid JSON
        # (json.JSONDecodeError subclasses it). Narrowed from a bare except.
        print(' error retrieving values configured in >>> ' + configFilePath)
        print(' creating new configuration file >>> ' + configFilePath)
        data = {}
        # NOTE(review): 'a+' appends — if the file exists with corrupt JSON
        # this appends '{}' after the junk instead of replacing it. Kept from
        # the original flow; confirm whether an overwrite ('w') is intended.
        with open(configFilePath, 'a+') as f:
            f.write('{}')
    return data
def setAppConfigData(data):
    """Serialize *data* as JSON into src/config/config.json, replacing its contents.

    Parameters
    ----------
    data : dict
        JSON-serializable configuration to persist.

    Returns
    -------
    bool
        True when the file was written, False on any error. The original also
        returned False on error (its `finally: return` swallowed the re-raise),
        so this preserves the caller-visible contract while actually printing
        a usable traceback.
    """
    import json
    import traceback
    from utilities.fileFolderManipulations import getJupyterRootDirectory

    returnValue = False
    try:
        projectRootDirectory = getJupyterRootDirectory()
        configFilePath = projectRootDirectory + "/src/config/config.json"
        print(' updating config file >>> ' + configFilePath)

        data_string = json.dumps(data)

        # 'a+' creates the file when absent; seek(0) + truncate() turns the
        # append handle into a full overwrite so stale JSON never survives.
        with open(configFilePath, 'a+') as json_data_file:
            json_data_file.seek(0)
            json_data_file.truncate()
            json_data_file.write(data_string)
        print(' successfully updated config file >>> (try block) ' + configFilePath + ' with data >>>' + data_string)
        returnValue = True
    except Exception:
        # BUG FIX: the old handler formatted an undefined `traceback_template`
        # (guaranteed NameError), contained Python-2 bare `print` statements,
        # and ended with a dead `raise` that `finally: return` swallowed.
        # Its broken FileNotFoundError branch re-opened the same missing path
        # and failed again, landing on the same return-False outcome.
        print("Error executing method >>> ")
        print(traceback.format_exc())
    return returnValue
def preProcessData(dataName, dataFrequency, outputFileName="processedRawData.csv"):
    """Convert raw JSON candle files into a single pre-processed CSV.

    Reads every .json file under data/<dataName>/raw/<dataFrequency>, flattens
    the candle arrays into one DataFrame, expands the timestamp into date-part
    columns (fastai add_datepart), appends prior/following-holiday features,
    writes the result to .../processed/<dataFrequency>/preProcessedData/ and
    records the CSV's project-root-relative path in the auto config file.

    Parameters
    ----------
    dataName : str
        Dataset/symbol name (folder under data/).
    dataFrequency : str
        Candle frequency folder name (e.g. '1minute').
    outputFileName : str, optional
        Name of the CSV produced (default 'processedRawData.csv').

    Returns
    -------
    list
        [returnValue, outputFolderName, outputFileName, outputFilePath,
        inputRawDataDF]. returnValue is False when any step failed; the
        traceback is printed, not raised (matches the original effective
        contract, whose `finally: return` swallowed its `raise`).
    """
    import os, sys, traceback
    import pandas as pd
    import glob
    from utilities.fileFolderManipulations import getParentFolder
    from utilities.fileFolderManipulations import createFolder
    from utilities.fileFolderManipulations import getJupyterRootDirectory
    from config.environment import getAppConfigData
    from config.environment import setAppConfigData
    from fastai.tabular import add_datepart

    print(' data pre-processing >> imported dependencies')

    relativeDataFolderPath = 'data/' + dataName + '/raw/' + dataFrequency

    # Return values — returnValue flips to True only when every step ran.
    returnValue = False
    outputFilePath = None
    outputFolderName = None
    inputRawDataDF = None
    try:
        # Deployment directory of the current jupyter node on this machine.
        jupyterNodePath = getJupyterRootDirectory()

        # Show all columns when frames are printed while debugging.
        pd.set_option('display.max_columns', None)

        inputRawDataDF = pd.DataFrame()

        # Absolute raw-data folder, then a glob for its JSON files.
        dataFolderPath = jupyterNodePath + '/' + relativeDataFolderPath
        json_pattern = os.path.join(dataFolderPath, '*.json')
        file_list = glob.glob(json_pattern)

        print('looping through all the files to create input data')
        frames = []
        for file in file_list:
            print("reading input file >>> " + file + " ...")
            data = pd.read_json(file, lines=True)
            # Unwrap the candle payload from the JSON envelope.
            # NOTE(review): read_json never returns a str, so the isinstance
            # branch looks dead — kept from the original; confirm intent.
            if isinstance(data, str):
                data = data['data'][0]['candles']
            else:
                data = data.values[0][0]['candles']
            frames.append(pd.DataFrame(data))
            print("File read - SUCCESS")
        # One concat replaces the deprecated per-file DataFrame.append loop
        # (linear instead of quadratic, and works on modern pandas too).
        if frames:
            inputRawDataDF = pd.concat(frames, ignore_index=True)
        inputRawDataDF.columns = [
            'date-time', 'open', 'high', 'low', 'close', 'quantity', 'dont-know'
        ]

        # Keep the raw timestamp column alongside the exploded date parts
        # (add_datepart consumes/drops the 'date-time' column in place).
        buffer = inputRawDataDF['date-time']
        add_datepart(inputRawDataDF, 'date-time')
        inputRawDataDF = pd.concat([buffer, inputRawDataDF], axis=1)

        # prior_holidays feature, derived from the day-of-year stamps.
        priorHolidaysStamps = getPriorHoliDaysStamps(inputRawDataDF['date-timeDayofyear'])
        priorHolidaysStamps_df = pd.DataFrame({'prior_holidays': priorHolidaysStamps[:]})
        inputRawDataDF = pd.concat([inputRawDataDF, priorHolidaysStamps_df], axis=1)
        print('added prior_holidays feature in pre-processed data')

        # following_holidays feature — the forward-looking counterpart.
        followingHolidaysStamps = getFollowingHolidaysDaysStamp(inputRawDataDF['date-timeDayofyear'])
        followingHolidaysStamps_df = pd.DataFrame({'following_holidays': followingHolidaysStamps[:]})
        inputRawDataDF = pd.concat([inputRawDataDF, followingHolidaysStamps_df], axis=1)
        print('added following_holidays feature in pre-processed data')

        # Ensure .../processed/<freq>/preProcessedData exists, then write CSV.
        processFolderName = getParentFolder(dataFolderPath, 2) + '/processed/' + dataFrequency
        print('Attempting to create folder if it does not exist >>>' + processFolderName)
        createFolder(processFolderName)

        outputFolderName = processFolderName + '/preProcessedData'
        print('Attempting to create folder if it does not exist >>>' + outputFolderName)
        createFolder(outputFolderName)

        outputFilePath = outputFolderName + '/' + outputFileName
        print('Attempting to create/update file >>>' + outputFilePath)
        inputRawDataDF.to_csv(outputFilePath, sep=',', index=False)
        print('created raw easy to use csv data to be used for preparing training data in the location >>>' + outputFilePath)

        # Record the CSV path (project-root-relative) in the auto config.
        print(' creating/updating autoConfig file')
        configFilePath = jupyterNodePath + '/src/config/autoConfig/config.json'
        autoConfigData = getAppConfigData()
        if not autoConfigData.get(dataName):
            autoConfigData[dataName] = {}
        if not autoConfigData[dataName].get(dataFrequency):
            autoConfigData[dataName][dataFrequency] = {}
        autoConfigData[dataName][dataFrequency] = {
            'preProcessedDataFilePath': outputFilePath.replace(jupyterNodePath, '')
        }
        setAppConfigData(autoConfigData)
        print(' creating/updating autoConfig file >>>' + configFilePath)

        returnValue = True
    except Exception:
        # BUG FIX: the old handler formatted an undefined `traceback_template`
        # (NameError) and ended with a `raise` that `finally: return`
        # swallowed anyway. Keep the effective contract (print + return the
        # result list with returnValue=False) but print a real traceback.
        print("Error executing method >>> ")
        print(traceback.format_exc())
    return [
        returnValue, outputFolderName, outputFileName, outputFilePath,
        inputRawDataDF
    ]
def getRedGreenCandlesCatogizedBySizeDf(df, dataName, dataFrequency, boundaryValues=None):
    """Bucket each candle's body (close - open) into red/green size categories.

    Produces six feature columns aligned with df's index:
      * redCandlesBySize / greenCandlesBySize — codes 0..5, where 0 means
        "not a red (resp. green) candle" and 5 is the largest-body bucket;
      * redCandlesBySizeTimesMagnitude / greenCandlesBySizeTimesMagnitude —
        the codes scaled by the candle's mean OHLC magnitude (red negated so
        the two sides carry opposite signs);
      * redGreenCandlesTanh — sign of the body (-1 red, 0 doji, +1 green);
      * redGreenCandlesTanhTimesMagnitudeDf — that sign times the magnitude.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'open', 'close', 'high', 'low' columns.
    dataName, dataFrequency : str
        Auto-config keys; consulted only when boundaryValues is None.
    boundaryValues : list of (negative, positive) tuples, optional
        Bucket boundaries, outermost first, e.g.
        [(-3.44, 2.44), (-3.32, 1.37), (-1.11, 1.01), (-0.53, 0.76)].
        Each tuple's absolute values must be smaller than the previous one's.
        Only indices 0..3 are used by the classification (a 5th tuple is
        accepted but ignored). When None, boundaries are loaded from — or
        derived from df's quintiles and saved to — the auto config.

    Returns
    -------
    pandas.DataFrame
        The six feature columns.

    Raises
    ------
    Exception
        Any error is printed with its traceback and re-raised (the old code's
        `finally: return` crashed on None.rename instead).
    """
    import sys, traceback
    import pandas as pd
    import numpy as np

    try:
        if boundaryValues is None:
            print('boundary values is none')
            # Project imports are deferred into this branch so the function is
            # usable (and testable) without the project config whenever the
            # caller supplies boundaryValues directly.
            from utilities.fileFolderManipulations import getJupyterRootDirectory
            from config.environment import getAppConfigData
            from config.environment import setAppConfigData

            # Deployment directory of the current jupyter node.
            jupyterNodePath = getJupyterRootDirectory()
            print("jupyterNodePath >>> " + jupyterNodePath)

            autoConfigData = getAppConfigData()
            if not autoConfigData.get(dataName):
                autoConfigData[dataName] = {}
            if not autoConfigData[dataName].get(dataFrequency):
                autoConfigData[dataName][dataFrequency] = {}
            boundaryValues = autoConfigData[dataName][dataFrequency].get('redGreenCandleSizeBoundaries')

            # BUG FIX: the old guard was `type(boundaryValues) == 'str'`,
            # which compares a type object to the string 'str' and is always
            # False — a configured empty string was never detected.
            if boundaryValues is None or (isinstance(boundaryValues, str) and boundaryValues.strip() == ''):
                print('boundary values is not configured')
                closeOpenDiffDf = (df['close'] - df['open']).rename('close_open_diff')

                # Red bodies sorted most-negative first; green bodies sorted
                # most-positive first.
                sortedRedCandles = closeOpenDiffDf.sort_values(ascending=True).reset_index(drop=True)
                sortedRedCandles = sortedRedCandles.loc[sortedRedCandles < 0].reset_index(drop=True)

                sortedGreenCandles = closeOpenDiffDf.sort_values(ascending=False).reset_index(drop=True)
                sortedGreenCandles = sortedGreenCandles.loc[sortedGreenCandles > 0].reset_index(drop=True)

                # Quintile cut points (20%, 40%, 60%, 80%) of each side.
                interval = np.arange(.2, 1, .2)
                indexArr = [(int(sortedRedCandles.shape[0] * interval[itr]),
                             int(sortedGreenCandles.shape[0] * interval[itr]))
                            for itr in range(0, interval.size)]
                boundaryValues = [(sortedRedCandles[indexItr[0] - 1],
                                   sortedGreenCandles[indexItr[1] - 1])
                                  for indexItr in indexArr]

                autoConfigData[dataName][dataFrequency].update(
                    {'redGreenCandleSizeBoundaries': boundaryValues})
                setAppConfigData(autoConfigData)
            else:
                print(
                    'using configured boundary values - do not update configurations unless u r absolutely sure of it'
                )
        else:
            print('using boundary values provided as parameter')

        closeOpenDiff = df['close'] - df['open']

        # Mean magnitude of the four OHLC components for each candle.
        dataMagnitudeDf = np.divide(
            np.sqrt(np.sum(np.square(df[['open', 'close', 'high', 'low']]), axis=1)), 4)

        # BUG FIX: every bucket mask is evaluated against the ORIGINAL diff
        # before any code is written. The old code mutated the series between
        # between() calls, so an already-assigned code (1..5) could fall into
        # a later range and be re-bucketed whenever the boundary magnitudes
        # overlapped the code values (e.g. a body larger than boundary 0 ended
        # up coded 4 instead of 5). Explicit comparisons also avoid the
        # deprecated boolean `inclusive=` argument of Series.between.
        redCandlesBySizeDf = closeOpenDiff.rename('redCandlesBySize').copy()
        redCandlesBySizeDf[closeOpenDiff >= 0] = 0
        redCandlesBySizeDf[closeOpenDiff < boundaryValues[0][0]] = 5
        redCandlesBySizeDf[(closeOpenDiff >= boundaryValues[0][0]) & (closeOpenDiff <= boundaryValues[1][0])] = 4
        redCandlesBySizeDf[(closeOpenDiff > boundaryValues[1][0]) & (closeOpenDiff < boundaryValues[2][0])] = 3
        redCandlesBySizeDf[(closeOpenDiff >= boundaryValues[2][0]) & (closeOpenDiff <= boundaryValues[3][0])] = 2
        redCandlesBySizeDf[(closeOpenDiff > boundaryValues[3][0]) & (closeOpenDiff < 0)] = 1

        greenCandlesBySizeDf = closeOpenDiff.rename('greenCandlesBySize').copy()
        greenCandlesBySizeDf[closeOpenDiff <= 0] = 0
        greenCandlesBySizeDf[closeOpenDiff > boundaryValues[0][1]] = 5
        greenCandlesBySizeDf[(closeOpenDiff >= boundaryValues[1][1]) & (closeOpenDiff <= boundaryValues[0][1])] = 4
        greenCandlesBySizeDf[(closeOpenDiff > boundaryValues[2][1]) & (closeOpenDiff < boundaryValues[1][1])] = 3
        greenCandlesBySizeDf[(closeOpenDiff >= boundaryValues[3][1]) & (closeOpenDiff <= boundaryValues[2][1])] = 2
        greenCandlesBySizeDf[(closeOpenDiff > 0) & (closeOpenDiff < boundaryValues[3][1])] = 1

        # Sign of the body: -1 red, +1 green, 0 doji.
        redGreenCandlesTanhDf = closeOpenDiff.rename('redGreenCandlesTanh').copy()
        redGreenCandlesTanhDf[closeOpenDiff < 0] = -1
        redGreenCandlesTanhDf[closeOpenDiff > 0] = 1

        redGreenCandlesTanhTimesMagnitudeDf = np.multiply(redGreenCandlesTanhDf, dataMagnitudeDf)
        # Red codes are negated so red and green magnitudes carry opposite signs.
        redCandlesBySizeTimesMagnitudeDf = -np.multiply(redCandlesBySizeDf, dataMagnitudeDf)
        greenCandlesBySizeTimesMagnitudeDf = np.multiply(greenCandlesBySizeDf, dataMagnitudeDf)

        return pd.concat([
            redCandlesBySizeDf,
            greenCandlesBySizeDf,
            redCandlesBySizeTimesMagnitudeDf.rename('redCandlesBySizeTimesMagnitude'),
            greenCandlesBySizeTimesMagnitudeDf.rename('greenCandlesBySizeTimesMagnitude'),
            redGreenCandlesTanhDf,
            redGreenCandlesTanhTimesMagnitudeDf.rename('redGreenCandlesTanhTimesMagnitudeDf')
        ], axis=1)
    except Exception:
        # BUG FIX: the old handler formatted an undefined `traceback_template`
        # (NameError) and its `finally: return` then failed on None.rename.
        # Print the real traceback and let the original error propagate.
        print("Error executing method >>> ")
        print(traceback.format_exc())
        raise
def doBasicOperation(dataName, dataFrequency):
    """Load the pre-processed CSV for dataName/dataFrequency and build the
    fundamental feature frame from it.

    Parameters
    ----------
    dataName, dataFrequency : str
        Keys into the auto config under which the CSV path is stored.

    Returns
    -------
    pandas.DataFrame or None
        Result of createFundamentalFeatures on the pre-processed data, or
        None when any step failed (the traceback is printed — this mirrors
        the original effective contract, which also swallowed errors and
        returned None).
    """
    import traceback
    import pandas as pd
    from config.environment import getAppConfigData
    from utilities.fileFolderManipulations import getJupyterRootDirectory

    print("into method doBasicOperation")
    return_fundamentalFeaturesDf = None
    try:
        # Project root on this machine; configured CSV paths are relative to it.
        jupyterNodePath = getJupyterRootDirectory()

        autoConfigData = getAppConfigData()
        preProcessedDataFilePath = autoConfigData[dataName][dataFrequency]['preProcessedDataFilePath']

        # Truth source — reference data only, never modified.
        inputRawProcessedDataDF = pd.read_csv(jupyterNodePath + preProcessedDataFilePath)

        # NOTE(review): createFundamentalFeatures is neither imported nor
        # defined in this view — presumably a sibling in this module; confirm
        # it is in scope at call time.
        return_fundamentalFeaturesDf = createFundamentalFeatures(inputRawProcessedDataDF)
        print("before return statement of method doBasicOperation ")
    except Exception:
        # BUG FIX: the old handler formatted an undefined `traceback_template`
        # (NameError) and had a dead `raise` swallowed by `finally: return`.
        # The net behavior (print + return None) is preserved.
        print("Error executing method >>> ")
        print(traceback.format_exc())
    return return_fundamentalFeaturesDf
def createFinalTrainingFeatureList(dataName, dataFrequency, variation_degree=-1):
    """Assemble the final training feature CSV by concatenating the basic
    fundamental features with every filtered-feature CSV for the dataset.

    Parameters
    ----------
    dataName, dataFrequency : str
        Keys identifying the dataset in the auto config / data folder tree.
    variation_degree : int, optional
        Feature-generation variation degree; -1 (or any non-int) means "use
        the value configured under 'variationDegreeForFeatureGeneration'".

    Returns
    -------
    tuple(pandas.DataFrame, str)
        The combined feature frame and the project-relative path of the CSV
        it was written to (also recorded in the auto config under
        'finalTrainingFeaturesListFile').
    """
    import glob
    import os
    import pandas as pd
    from utilities.fileFolderManipulations import getJupyterRootDirectory
    from utilities.fileFolderManipulations import getParentFolder
    from config.environment import getAppConfigData
    from config.environment import setAppConfigData
    from dataPreparation.featurePreparation import doBasicOperation

    configData = getAppConfigData()
    projectRootFolderPath = getJupyterRootDirectory()

    if not isinstance(variation_degree, int) or variation_degree == -1:
        variation_degree = configData['variationDegreeForFeatureGeneration']

    _basicDf = doBasicOperation(dataName, dataFrequency)

    filteredFeaturesPath = "/data/" + dataName + "/processed/" + dataFrequency + "/features/filteredFeatures"
    outputFinalFeatureListFilePath = "/data/" + dataName + "/processed/" + dataFrequency + "/features/finalTrainingFeatureList.csv"
    print("filteredFeaturesFolderPath >>> " + filteredFeaturesPath)

    # BUG FIX: `os` was used here but never imported, guaranteeing a
    # NameError on every call; it is now imported at the top of the function.
    csv_pattern = os.path.join(projectRootFolderPath + '/' + filteredFeaturesPath, '*.csv')
    print("declared csv_pattern")

    # All filtered-feature CSVs contributing columns to the final list.
    file_list = glob.glob(csv_pattern)
    print("obtained file_list")

    trainingFeatureDF = _basicDf
    print('initialized trainingFeatureDF')

    print('looping through all the files to create input data')
    for file in file_list:
        print("reading input file >>> " + file + " ...")
        data = pd.read_csv(file)
        # Column-wise merge: each CSV's columns are aligned by row order with
        # the basic feature frame.
        trainingFeatureDF = pd.concat([trainingFeatureDF, data], axis=1)
        print("File read - SUCCESS")

    # Write the final training feature list file.
    print("creating finalTrainingFeatureList in location >>> " + outputFinalFeatureListFilePath)
    trainingFeatureDF.to_csv(projectRootFolderPath + '/' + outputFinalFeatureListFilePath)

    # Record the output path in the auto config for downstream stages.
    autoConfigData = getAppConfigData()
    autoConfigData[dataName][dataFrequency].update(
        {'finalTrainingFeaturesListFile': outputFinalFeatureListFilePath})
    setAppConfigData(autoConfigData)
    print("updated config file with data >>>> finalTrainingFeaturesListFile:" + outputFinalFeatureListFilePath)

    return trainingFeatureDF, outputFinalFeatureListFilePath