def performFeatureSelection(maxlag):
    """Print cross-validated accuracy for a sweep of delta windows.

    For every ``maxdelta`` in ``range(3, 12)`` the datasets are reloaded,
    ``functions.addFeatures`` is applied to each dataset for every ``n`` in
    ``range(2, maxdelta)``, the datasets are merged, missing values are
    filled (time interpolation first, column means second), the
    ``Return_Out`` column is shifted by ``-1`` and
    ``functions.applyTimeLag`` is applied with the lags
    ``range(2, maxlag)``.  The value returned by ``functions.performCV``
    for the ``'GTB'`` key is then printed next to the largest lag and delta.

    Args:
        maxlag: Exclusive upper bound of the lag range ``range(2, maxlag)``.
            Must be at least 3 so that the range is not empty.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``range(2, maxlag)`` is empty.

    Note:
        This file defines ``performFeatureSelection`` more than once; the
        last definition is the one that stays bound to the name.
    """
    lags = range(2, maxlag)
    if not lags:
        # Fail early with a readable message instead of the opaque
        # "max() arg is an empty sequence" that max(lags) would raise.
        raise ValueError(
            'maxlag must be at least 3 so that range(2, maxlag) is not '
            'empty, got %r' % (maxlag,))

    # Function-scope imports, as this function has always done.
    import datetime
    import functions

    def _emit(*fields):
        """Print ``fields`` joined by single spaces, like ``print a, b``."""
        # A single-argument print(...) behaves the same on Python 2 and 3.
        print(' '.join(str(field) for field in fields))

    target = 'CLASSIFICATION'
    _emit('Maximum time lag applied', max(lags))
    print('')

    for maxdelta in range(3, 12):
        # Reloaded on every iteration; presumably because addFeatures
        # modifies the datasets in place (its return value is ignored).
        datasets = functions.loadDatasets(
            '/home/francesco/Dropbox/DSR/StocksProject/longdatasets')
        delta = range(2, maxdelta)
        _emit('Delta days accounted: ', max(delta))

        for dataset in datasets:
            columns = dataset.columns
            # The two trailing columns are picked by position.
            adjclose = columns[-2]
            returns = columns[-1]
            for n in delta:
                functions.addFeatures(dataset, adjclose, returns, n)

        # NOTE(review): the meaning of the literal 6 is defined by
        # functions.mergeDataframes -- confirm against that helper.
        finance = functions.mergeDataframes(datasets, 6, target)
        _emit('Size of data frame: ', finance.shape)
        _emit('Number of NaN after merging: ',
              functions.count_missing(finance))

        finance = finance.interpolate(method='time')
        _emit('Number of NaN after time interpolation: ',
              functions.count_missing(finance))

        # Whatever interpolation left empty is filled with column means.
        finance = finance.fillna(finance.mean())
        _emit('Number of NaN after mean interpolation: ',
              functions.count_missing(finance))

        back = -1
        finance.Return_Out = finance.Return_Out.shift(back)
        finance = functions.applyTimeLag(finance, lags, delta, back, target)
        _emit('Number of NaN after temporal shifting: ',
              functions.count_missing(finance))
        _emit('Size of data frame after feature creation: ', finance.shape)

        if target == 'CLASSIFICATION':
            start_test = datetime.datetime(2014, 4, 1)
            X_train, y_train, X_test, y_test = (
                functions.prepareDataForClassification(finance, start_test))
            # NOTE(review): 10 is presumably the number of CV folds.
            acc = functions.performCV(X_train, y_train, 10, 'GTB', [])
            print('')
            print('Mean Accuracy for (%d, %d): %f'
                  % (max(lags), max(delta), acc))
            print('============================================================================')
def final():
    """Run the pipeline once for the 'GE' stock and print the 'RF' result.

    The 'GE' series fetched with ``functions.getStock`` (1990-01-01 to
    2014-08-31) is placed in front of the datasets loaded from disk.
    Features are added for every ``n`` in ``range(2, 5)``, the datasets are
    merged, missing values are filled (time interpolation first, column
    means second), ``Return_Out`` is shifted by ``-1`` and
    ``functions.applyTimeLag`` is applied with lags ``range(2, 3)``.  The
    data is split at 2014-04-01 and the value returned by
    ``functions.performClassification`` for the ``"RF"`` key is printed.

    Returns:
        None. All results are printed.

    Note:
        This file defines ``final`` more than once; the last definition is
        the one that stays bound to the name.
    """
    # Imported here, like performFeatureSelection does, so the function does
    # not depend on module-level imports being present.
    import datetime
    import functions

    def _emit(*fields):
        """Print ``fields`` joined by single spaces, like ``print a, b``."""
        # A single-argument print(...) behaves the same on Python 2 and 3.
        print(" ".join(str(field) for field in fields))

    target = "CLASSIFICATION"
    lags = range(2, 3)
    _emit("Maximum time lag applied", max(lags))

    start = datetime.datetime(1990, 1, 1)
    end = datetime.datetime(2014, 8, 31)
    out = functions.getStock("GE", start, end)

    datasets = functions.loadDatasets(
        "/home/francesco/Dropbox/DSR/StocksProject/longdatasets")
    # The fetched stock goes first in the list handed to mergeDataframes.
    datasets.insert(0, out)

    delta = range(2, 5)
    _emit("Max Delta days accounted: ", max(delta))

    for dataset in datasets:
        columns = dataset.columns
        # The two trailing columns are picked by position.
        adjclose = columns[-2]
        returns = columns[-1]
        for n in delta:
            functions.addFeatures(dataset, adjclose, returns, n)

    # NOTE(review): the meaning of the literal 6 is defined by
    # functions.mergeDataframes -- confirm against that helper.
    finance = functions.mergeDataframes(datasets, 6, target)
    _emit("Size of data frame: ", finance.shape)
    _emit("Number of NaN after merging: ", functions.count_missing(finance))
    # Share of missing cells over rows * columns, as a percentage.
    _emit(
        "% of NaN after merging: ",
        (
            functions.count_missing(finance)
            / float(finance.shape[0] * finance.shape[1])
        ) * 100,
        "%",
    )

    finance = finance.interpolate(method="time")
    _emit("Number of NaN after time interpolation: ",
          functions.count_missing(finance))

    # Whatever interpolation left empty is filled with column means.
    finance = finance.fillna(finance.mean())
    _emit("Number of NaN after mean interpolation: ",
          functions.count_missing(finance))

    back = -1
    finance.Return_Out = finance.Return_Out.shift(back)
    finance = functions.applyTimeLag(finance, lags, delta, back, target)
    _emit("Number of NaN after temporal shifting: ",
          functions.count_missing(finance))
    _emit("Size of data frame after feature creation: ", finance.shape)

    if target == "CLASSIFICATION":
        start_test = datetime.datetime(2014, 4, 1)
        X_train, y_train, X_test, y_test = (
            functions.prepareDataForClassification(finance, start_test))
        print("")
        # NOTE: a grid search through functions.performTimeSeriesSearchGrid
        # was drafted at this point but is disabled.
        print(functions.performClassification(
            X_train, y_train, X_test, y_test, "RF", []))
def performFeatureSelection(maxlag):
    """Print cross-validated accuracy for a sweep of delta windows.

    For every ``maxdelta`` in ``range(3, 12)`` the datasets are reloaded,
    ``functions.addFeatures`` is applied to each dataset for every ``n`` in
    ``range(2, maxdelta)``, the datasets are merged, missing values are
    filled (time interpolation first, column means second), the
    ``Return_Out`` column is shifted by ``-1`` and
    ``functions.applyTimeLag`` is applied with the lags
    ``range(2, maxlag)``.  The value returned by ``functions.performCV``
    for the ``'GTB'`` key is then printed next to the largest lag and delta.

    Args:
        maxlag: Exclusive upper bound of the lag range ``range(2, maxlag)``.
            Must be at least 3 so that the range is not empty.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``range(2, maxlag)`` is empty.

    Note:
        An earlier definition of ``performFeatureSelection`` exists in this
        file; this later one is the definition that stays bound to the name.
    """
    lags = range(2, maxlag)
    if not lags:
        # Fail early with a readable message instead of the opaque
        # "max() arg is an empty sequence" that max(lags) would raise.
        raise ValueError(
            'maxlag must be at least 3 so that range(2, maxlag) is not '
            'empty, got %r' % (maxlag,))

    # Function-scope imports, as this function has always done.
    import datetime
    import functions

    def _emit(*fields):
        """Print ``fields`` joined by single spaces, like ``print a, b``."""
        # A single-argument print(...) behaves the same on Python 2 and 3.
        print(' '.join(str(field) for field in fields))

    target = 'CLASSIFICATION'
    _emit('Maximum time lag applied', max(lags))
    print('')

    for maxdelta in range(3, 12):
        # Reloaded on every iteration; presumably because addFeatures
        # modifies the datasets in place (its return value is ignored).
        datasets = functions.loadDatasets(
            '/home/francesco/Dropbox/DSR/StocksProject/longdatasets')
        delta = range(2, maxdelta)
        _emit('Delta days accounted: ', max(delta))

        for dataset in datasets:
            columns = dataset.columns
            # The two trailing columns are picked by position.
            adjclose = columns[-2]
            returns = columns[-1]
            for n in delta:
                functions.addFeatures(dataset, adjclose, returns, n)

        # NOTE(review): the meaning of the literal 6 is defined by
        # functions.mergeDataframes -- confirm against that helper.
        finance = functions.mergeDataframes(datasets, 6, target)
        _emit('Size of data frame: ', finance.shape)
        _emit('Number of NaN after merging: ',
              functions.count_missing(finance))

        finance = finance.interpolate(method='time')
        _emit('Number of NaN after time interpolation: ',
              functions.count_missing(finance))

        # Whatever interpolation left empty is filled with column means.
        finance = finance.fillna(finance.mean())
        _emit('Number of NaN after mean interpolation: ',
              functions.count_missing(finance))

        back = -1
        finance.Return_Out = finance.Return_Out.shift(back)
        finance = functions.applyTimeLag(finance, lags, delta, back, target)
        _emit('Number of NaN after temporal shifting: ',
              functions.count_missing(finance))
        _emit('Size of data frame after feature creation: ', finance.shape)

        if target == 'CLASSIFICATION':
            start_test = datetime.datetime(2014, 4, 1)
            X_train, y_train, X_test, y_test = (
                functions.prepareDataForClassification(finance, start_test))
            # NOTE(review): 10 is presumably the number of CV folds.
            acc = functions.performCV(X_train, y_train, 10, 'GTB', [])
            print('')
            print('Mean Accuracy for (%d, %d): %f'
                  % (max(lags), max(delta), acc))
            print('============================================================================')
def final():
    """Run the pipeline once for the 'GE' stock and print the 'RF' result.

    The 'GE' series fetched with ``functions.getStock`` (1990-01-01 to
    2014-08-31) is placed in front of the datasets loaded from disk.
    Features are added for every ``n`` in ``range(2, 5)``, the datasets are
    merged, missing values are filled (time interpolation first, column
    means second), ``Return_Out`` is shifted by ``-1`` and
    ``functions.applyTimeLag`` is applied with lags ``range(2, 3)``.  The
    data is split at 2014-04-01 and the value returned by
    ``functions.performClassification`` for the ``'RF'`` key is printed.

    Returns:
        None. All results are printed.

    Note:
        An earlier definition of ``final`` exists in this file; this later
        one is the definition that stays bound to the name.
    """
    # Imported here, like performFeatureSelection does, so the function does
    # not depend on module-level imports being present.
    import datetime
    import functions

    def _emit(*fields):
        """Print ``fields`` joined by single spaces, like ``print a, b``."""
        # A single-argument print(...) behaves the same on Python 2 and 3.
        print(' '.join(str(field) for field in fields))

    target = 'CLASSIFICATION'
    lags = range(2, 3)
    _emit('Maximum time lag applied', max(lags))

    start = datetime.datetime(1990, 1, 1)
    end = datetime.datetime(2014, 8, 31)
    out = functions.getStock('GE', start, end)

    datasets = functions.loadDatasets(
        '/home/francesco/Dropbox/DSR/StocksProject/longdatasets')
    # The fetched stock goes first in the list handed to mergeDataframes.
    datasets.insert(0, out)

    delta = range(2, 5)
    _emit('Max Delta days accounted: ', max(delta))

    for dataset in datasets:
        columns = dataset.columns
        # The two trailing columns are picked by position.
        adjclose = columns[-2]
        returns = columns[-1]
        for n in delta:
            functions.addFeatures(dataset, adjclose, returns, n)

    # NOTE(review): the meaning of the literal 6 is defined by
    # functions.mergeDataframes -- confirm against that helper.
    finance = functions.mergeDataframes(datasets, 6, target)
    _emit('Size of data frame: ', finance.shape)
    _emit('Number of NaN after merging: ', functions.count_missing(finance))
    # Share of missing cells over rows * columns, as a percentage.
    _emit(
        '% of NaN after merging: ',
        (
            functions.count_missing(finance)
            / float(finance.shape[0] * finance.shape[1])
        ) * 100,
        '%',
    )

    finance = finance.interpolate(method='time')
    _emit('Number of NaN after time interpolation: ',
          functions.count_missing(finance))

    # Whatever interpolation left empty is filled with column means.
    finance = finance.fillna(finance.mean())
    _emit('Number of NaN after mean interpolation: ',
          functions.count_missing(finance))

    back = -1
    finance.Return_Out = finance.Return_Out.shift(back)
    finance = functions.applyTimeLag(finance, lags, delta, back, target)
    _emit('Number of NaN after temporal shifting: ',
          functions.count_missing(finance))
    _emit('Size of data frame after feature creation: ', finance.shape)

    if target == 'CLASSIFICATION':
        start_test = datetime.datetime(2014, 4, 1)
        X_train, y_train, X_test, y_test = (
            functions.prepareDataForClassification(finance, start_test))
        print('')
        # NOTE: a grid search through functions.performTimeSeriesSearchGrid
        # was drafted at this point but is disabled.
        print(functions.performClassification(
            X_train, y_train, X_test, y_test, 'RF', []))