def UpdateHDF5(symbol_directory, symbols_file):
    """Update the HDF5 quote archive for the symbols in `symbols_file`.

    Steps:
      1. Load the stored quotes and the last stored date.
      2. For symbols present in the list but missing from the archive,
         fetch quotes from 1991-01-01 through today.
      3. Fetch quotes for the whole list from the last stored date through
         tomorrow and merge them over the stored quotes (new values win).
      4. Clean every column (spikes, interior NaNs, leading gaps).
      5. Add the new-symbol columns and a synthetic 'CASH' column.
      6. Rewrite ``<cwd>/symbols/<listname>_.hdf5``.

    Args:
        symbol_directory: Directory that contains the symbols file.
        symbols_file: File name of the symbol list inside that directory.

    Returns:
        None. The updated quotes are written to disk.
    """
    import datetime

    from functions.TAfunctions import cleanspikes
    from functions.TAfunctions import interpolate
    from functions.TAfunctions import cleantobeginning

    filename = os.path.join(symbol_directory, symbols_file)
    _, _, _, quote, listname = loadQuotes_fromHDF(filename)

    # Last date already stored in the hdf5 archive.
    lastdate = getLastDateFromHDF5(symbol_directory, symbols_file)

    # Locate symbols added to the list that aren't in the HDF5 file yet.
    symbols_in_list = readSymbolList(filename, verbose=False)
    symbols_in_HDF5 = set(quote.columns.values)
    new_symbols = [s for s in symbols_in_list if s not in symbols_in_HDF5]

    if new_symbols:
        # arrayFromQuotesForList reads its symbols from a file, so the new
        # symbols are written to a temporary list first.  The `with` block
        # guarantees the file is flushed and closed before it is read back.
        tempfilename = os.path.join(symbol_directory,
                                    "newsymbols_tempfile.txt")
        with open(tempfilename, "w") as outfile:
            for isymbol in new_symbols:
                print("new symbol = ", isymbol)
                outfile.write(str(isymbol) + "\n")

        newquotesfirstdate = datetime.date(1991, 1, 1)
        newquoteslastdate = datetime.date.today()
        print("dates for new symbol found = ", newquotesfirstdate,
              newquoteslastdate)

        newadjClose, newsymbols, newdatearray = arrayFromQuotesForList(
            tempfilename, newquotesfirstdate, newquoteslastdate)
        print(" security values check: ",
              newadjClose[np.isnan(newadjClose)].shape)
        newdates = [str(d) for d in newdatearray]
        # The fetched array is swapped so that rows are dates and columns
        # are symbols.
        quotes_NewSymbols = pd.DataFrame(newadjClose.swapaxes(0, 1),
                                         index=newdates,
                                         columns=newsymbols)

    # Date range for the incremental update: last stored date .. tomorrow.
    if isinstance(lastdate, str):
        newquotesfirstdate = datetime.date(
            *[int(val) for val in lastdate.split('-')])
    else:
        newquotesfirstdate = lastdate
    newquoteslastdate = datetime.datetime.now() + datetime.timedelta(days=1)

    newadjClose, symbols, newdatearray = arrayFromQuotesForList(
        filename, newquotesfirstdate, newquoteslastdate)
    print(" security values check: ",
          newadjClose[np.isnan(newadjClose)].shape)
    newdates = [str(d) for d in newdatearray]
    quoteupdate = pd.DataFrame(newadjClose.swapaxes(0, 1),
                               index=newdates,
                               columns=symbols)
    # Freshly fetched values take precedence; stored quotes fill the rest.
    updatedquotes = quoteupdate.combine_first(quote)

    # Clean up quotes
    #  - infill interior NaN values using nearest good values to linearly
    #    interpolate
    #  - copy first valid quote to all earlier positions
    for isymbolupdate in list(updatedquotes.columns.values):
        xupdate = updatedquotes[isymbolupdate].values
        xupdate = cleanspikes(xupdate)
        xupdate = interpolate(xupdate)
        xupdate = cleantobeginning(xupdate)
        updatedquotes[isymbolupdate] = xupdate

    if new_symbols:
        # DataFrame.info() prints its own report and returns None, so it is
        # called on its own line instead of being passed to print().
        print("\n\n\n...quotes_NewSymbols = ")
        quotes_NewSymbols.info()
        print("\n\n\n...updatedquotes = ")
        updatedquotes.info()
        for isymbol in new_symbols:
            updatedquotes[isymbol] = quotes_NewSymbols[isymbol]
        print("\n\n\n...merged updatedquotes = ")
        updatedquotes.info()

    # Synthetic CASH series: starts from 100 and steps up by .01 every 10th
    # row.  At i == 0 the index i - 1 wraps to the last element (still 100.
    # at that point), so the series begins at 100.01.  The loop is kept
    # as-is so stored values stay bit-identical.
    CASHadjClose = np.ones((len(updatedquotes.index)), float) * 100.
    for i in range(CASHadjClose.shape[0]):
        if i % 10 == 0:
            CASHadjClose[i] = CASHadjClose[i - 1] + .01
        else:
            CASHadjClose[i] = CASHadjClose[i - 1]
    updatedquotes['CASH'] = CASHadjClose

    # Write quotes to disk.
    dirname = os.path.join(os.getcwd(), "symbols")
    hdf5filename = os.path.join(dirname, listname + "_.hdf5")
    print("hdf5 filename = ", hdf5filename)
    updatedquotes.to_hdf(hdf5filename,
                         listname,
                         mode='a',
                         format='table',
                         append=False,
                         complevel=5,
                         complib='blosc')
    return
def cleanup_quotes(symbols_file, newquotesfirstdate, newquoteslastdate):
    """Re-clean every quote series stored in the HDF5 archive, in place.

    Reads the archive that belongs to `symbols_file`, runs each symbol's
    column through cleanspikes / interpolate / cleantobeginning, and writes
    the result back to the same HDF5 file and key.

    Args:
        symbols_file: Path of the symbol list; its base name selects the
            archive key and file name.
        newquotesfirstdate: Only echoed to stdout by this function.
        newquoteslastdate: Only echoed to stdout by this function.

    Returns:
        None. The cleaned quotes are written to
        ``<cwd>/symbols/<listname>_.hdf5``.
    """
    from functions.TAfunctions import cleanspikes
    from functions.TAfunctions import interpolate
    from functions.TAfunctions import cleantobeginning

    print(" ... inside cleanup_quotes ...")
    print(" ... newquotesfirstdate = ", newquotesfirstdate)
    print(" ... newquoteslastdate = ", newquoteslastdate)

    # Derive the archive name from the symbols file name.
    directory_name, file_name = os.path.split(symbols_file)
    shortname, extension = os.path.splitext(file_name)

    print("file name for symbols = ", "_" + shortname + "_")
    print("file type for symbols = ", extension)

    # Known symbol lists map to fixed archive keys; anything else uses the
    # file's base name unchanged.
    known_listnames = {
        "symbols": "TAA-Symbols",
        "cmg_symbols": "CMG-Symbols",
        "Naz100_Symbols": "Naz100_Symbols",
        "biglist": "biglist-Symbols",
        "ETF_symbols": "ETF-Symbols",
        "ProvidentFundSymbols": "ProvidentFund-Symbols",
        "sp500_symbols": "SP500-Symbols",
    }
    listname = known_listnames.get(shortname, shortname)

    hdf5_directory = os.path.join(os.getcwd(), "symbols")
    hdf5filename = os.path.join(hdf5_directory, listname + "_.hdf5")

    print("")
    print("")
    print("symbol_directory = ", directory_name)
    print("symbols_file = ", symbols_file)
    print("shortname, extension = ", shortname, extension)
    print("hdf5filename = ", hdf5filename)

    dataframeFromHDF = pd.read_hdf(hdf5filename, listname)

    # Clean up stored quotes
    #  - infill interior NaN values using nearest good values to linearly
    #    interpolate
    #  - copy first valid quote to all earlier positions
    for isymbolupdate in list(dataframeFromHDF.columns.values):
        xupdate = dataframeFromHDF[isymbolupdate]
        print(" ... cleanup_quotes ... symbol = ", isymbolupdate)
        xupdate = cleanspikes(xupdate)
        xupdate = interpolate(xupdate)
        xupdate = cleantobeginning(xupdate)
        dataframeFromHDF[isymbolupdate] = xupdate.copy()

    dataframeFromHDF.to_hdf(hdf5filename,
                            listname,
                            mode='a',
                            format='table',
                            append=False,
                            complevel=5,
                            complib='blosc')
    return
def compareHDF_and_newquotes(symbols_file, newquotesfirstdate,
                             newquoteslastdate):
    """Report, per symbol, whether archived quotes match freshly fetched ones.

    Loads the HDF5 archive that belongs to `symbols_file`, fetches quotes
    for the same list between the two dates, cleans the fetched series, and
    for every symbol found in either source prints whether the two series
    are identical over their common date range.

    Args:
        symbols_file: Path of the symbol list; its base name selects the
            archive key and file name.
        newquotesfirstdate: First date passed to arrayFromQuotesForList.
        newquoteslastdate: Last date passed to arrayFromQuotesForList.

    Returns:
        None. Results are printed to stdout only.
    """
    from functions.TAfunctions import cleanspikes
    from functions.TAfunctions import interpolate
    from functions.TAfunctions import cleantobeginning

    def _first_change_index(row):
        # Index at which the series first leaves its initial value, taken as
        # the later of the first rise and first fall of row / row[0] beyond
        # a 1e-5 band.  Used to skip a leading run of repeated values.
        ratio = row / row[0]
        first_up = np.argmax(np.clip(ratio, 1., 1. + 1.e-5))
        first_down = np.argmin(np.clip(ratio, 1. - 1.e-5, 1.))
        return max(first_up, first_down)

    print(" ... inside compareHDF_and_newquotes ...")
    print(" ... newquotesfirstdate = ", newquotesfirstdate)
    print(" ... newquoteslastdate = ", newquoteslastdate)

    # Derive the archive name from the symbols file name.
    directory_name, file_name = os.path.split(symbols_file)
    shortname, extension = os.path.splitext(file_name)

    print("file name for symbols = ", "_" + shortname + "_")
    print("file type for symbols = ", extension)

    # Known symbol lists map to fixed archive keys; anything else uses the
    # file's base name unchanged.
    known_listnames = {
        "symbols": "TAA-Symbols",
        "cmg_symbols": "CMG-Symbols",
        "Naz100_Symbols": "Naz100_Symbols",
        "biglist": "biglist-Symbols",
        "ETF_symbols": "ETF-Symbols",
        "ProvidentFundSymbols": "ProvidentFund-Symbols",
        "sp500_symbols": "SP500-Symbols",
    }
    listname = known_listnames.get(shortname, shortname)

    hdf5_directory = os.path.join(os.getcwd(), "symbols")
    hdf5filename = os.path.join(hdf5_directory, listname + "_.hdf5")

    print("")
    print("")
    print("symbol_directory = ", directory_name)
    print("symbols_file = ", symbols_file)
    print("shortname, extension = ", shortname, extension)
    print("hdf5filename = ", hdf5filename)

    # Existing quotes from the archive, as a (symbols x dates) array.
    # `.values` replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    dataframeFromHDF = pd.read_hdf(hdf5filename, listname)
    x_hdf = dataframeFromHDF.values.swapaxes(0, 1)
    date_hdf = dataframeFromHDF.index
    symbols_hdf = list(dataframeFromHDF.columns.values)

    # New quotes for the same symbol list.
    newadjClose, newsymbols, newdatearray = arrayFromQuotesForList(
        symbols_file, newquotesfirstdate, newquoteslastdate)
    print(" security values check: ",
          newadjClose[np.isnan(newadjClose)].shape)
    newdates = [str(d) for d in newdatearray]
    dataframeFromInternet = pd.DataFrame(newadjClose.swapaxes(0, 1),
                                         index=newdates,
                                         columns=newsymbols)

    # Clean up fetched quotes
    #  - infill interior NaN values using nearest good values to linearly
    #    interpolate
    #  - copy first valid quote to all earlier positions
    # The cleaned series is stored back (as cleanup_quotes does) so that the
    # comparison below actually sees the cleaned values.
    for isymbolupdate in list(dataframeFromInternet.columns.values):
        xupdate = dataframeFromInternet[isymbolupdate]
        print(" isymbolupdate,xupdate = ", isymbolupdate, xupdate.values)
        xupdate = cleanspikes(xupdate)
        xupdate = interpolate(xupdate)
        xupdate = cleantobeginning(xupdate)
        dataframeFromInternet[isymbolupdate] = xupdate

    x_net = dataframeFromInternet.values.swapaxes(0, 1)
    date_net = dataframeFromInternet.index
    symbols_net = list(dataframeFromInternet.columns.values)

    # Date lookups are done by position; build the lists once, not once per
    # symbol.
    dates_hdf = list(date_hdf)
    dates_net = list(date_net)

    for isymbol in sorted(set(symbols_hdf + symbols_net)):
        # Find the date range shared by the archive and the fetched quotes.
        try:
            hdf_index = symbols_hdf.index(isymbol)
            net_index = symbols_net.index(isymbol)
            firstindex_hdf = _first_change_index(x_hdf[hdf_index, :])
            firstindex_net = _first_change_index(x_net[net_index, :])
            firstDate = max(date_net[firstindex_net],
                            date_hdf[firstindex_hdf])
            lastDate = min(date_net[-1], date_hdf[-1])
            values_hdf = x_hdf[hdf_index,
                               dates_hdf.index(firstDate):
                               dates_hdf.index(lastDate) + 1]
            values_net = x_net[net_index,
                               dates_net.index(firstDate):
                               dates_net.index(lastDate) + 1]
        except (ValueError, IndexError, TypeError):
            # ValueError: symbol or date missing from one source;
            # IndexError: an empty series; TypeError: dates of the two
            # sources are not comparable.
            print(" ... **** **** symbol ", format(isymbol, '5s'),
                  " not matched in hdf and update from internet")
            continue

        # `False in a == b` is a chained comparison, (False in a) and
        # (a == b), and never tested element-wise equality; array_equal
        # does (and also reports a length mismatch as a difference).
        if not np.array_equal(values_hdf, values_net):
            print(" ... **** symbol ", format(isymbol, '5s'),
                  " is different in hdf and update from internet (",
                  firstDate, " to ", lastDate, " )")
        else:
            print(" ... symbol ", format(isymbol, '5s'),
                  " is same in hdf and update from internet (", firstDate,
                  " to ", lastDate, " )")
    return
def UpdateHDF_yf(symbol_directory, symbols_file):
    """Update the HDF5 quote archive using the yahoo_fix quote source.

    Same flow as UpdateHDF5, but quotes come from get_quotes_yf:
      1. Load the stored quotes and the last stored date.
      2. For symbols present in the list but missing from the archive,
         fetch quotes from 1991-01-01 through today.
      3. Fetch quotes for the whole list from the last stored date through
         tomorrow and merge them over the stored quotes (new values win).
      4. Clean every column.
      5. Add the new-symbol columns and a synthetic 'CASH' column.
      6. Rewrite ``<cwd>/symbols/<listname>_.hdf5``.

    Args:
        symbol_directory: Directory that contains the symbols file.
        symbols_file: File name of the symbol list inside that directory.

    Returns:
        None. The updated quotes are written to disk.
    """
    import datetime
    from time import sleep

    from functions.TAfunctions import cleanspikes
    from functions.TAfunctions import interpolate
    from functions.TAfunctions import cleantobeginning

    # NOTE(review): cleantoend is used below but is not imported in this
    # function; it is expected to be available at module level.

    def _return_quotes_array(symbolsFile,
                             start_date="2018-01-01",
                             end_date=None):
        """Fetch quotes for the symbols listed in `symbolsFile`.

        Returns (values, symbol list, dates), where dates is a numpy array
        of date strings with any time-of-day part removed.
        """
        from functions.readSymbols import readSymbolList
        from pandas_datareader import data as pdr
        import functions.fix_yahoo_finance as yf
        yf.pdr_override()

        symbol_list = readSymbolList(symbolsFile, verbose=True)

        if end_date is None:
            end_date = str(datetime.date.today())

        data = get_quotes_yf(symbol_list,
                             start_date=start_date,
                             end_date=end_date)
        try:
            # for multiple symbols
            symbolList = list(data.columns)
        except AttributeError:
            # for single symbol (result has no `columns`)
            symbolList = symbol_list

        # Keep only the date part of each index entry.
        dates = np.array([str(d).split(' ')[0] for d in data.index])
        return data.values, symbolList, dates

    print(" ... inside UpdateHDF_yf ...")
    filename = os.path.join(symbol_directory, symbols_file)
    _, _, _, quote, listname = loadQuotes_fromHDF(filename)
    print(" ... inside UpdateHDF_yf ... finished loadQuotes_fromHDF")

    # Last date already stored in the hdf5 archive.
    lastdate = getLastDateFromHDF5(symbol_directory, symbols_file)
    print(" ... inside UpdateHDF_yf ... lastdate = ", lastdate)
    # NOTE(review): fixed 3 s pause kept from the original; its purpose is
    # not evident from this function.
    sleep(3)

    # Locate symbols added to the list that aren't in the HDF5 file yet.
    symbols_in_list = readSymbolList(filename, verbose=False)
    symbols_in_HDF5 = set(quote.columns.values)
    new_symbols = [s for s in symbols_in_list if s not in symbols_in_HDF5]

    if new_symbols:
        # The quote fetcher reads its symbols from a file, so the new
        # symbols are written to a temporary list first.
        tempfilename = os.path.join(symbol_directory,
                                    "newsymbols_tempfile.txt")
        with open(tempfilename, "w") as outfile:
            for isymbol in new_symbols:
                print("new symbol = ", isymbol)
                outfile.write(str(isymbol) + "\n")

        newquotesfirstdate = datetime.date(1991, 1, 1)
        newquoteslastdate = datetime.date.today()
        print("dates for new symbol found = ", newquotesfirstdate,
              newquoteslastdate)
        print("newquotesfirstdate, newquoteslastdate = ",
              newquotesfirstdate, newquoteslastdate)

        newadjClose, newsymbols, newdatearray = _return_quotes_array(
            tempfilename,
            start_date=newquotesfirstdate,
            end_date=newquoteslastdate)
        if isinstance(newdatearray, list):
            newdatearray = np.array(newdatearray)
        print(" newadjClose.shape = ", newadjClose.shape)
        print(" len(newsymbols) = ", len(newsymbols))
        print(" len(newdatearray) = ", len(newdatearray))
        print(" security values check: ",
              newadjClose[np.isnan(newadjClose)].shape)
        newdates = [str(d) for d in newdatearray]
        print("newadjClose.shape = ", newadjClose.shape)
        print('newsymbols = ', newsymbols)
        print('newdatearray = ', newdatearray)

        # Orient the array as (dates x symbols).  The ndim guard keeps a
        # 1-D result (single symbol) from raising IndexError on shape[1].
        if newadjClose.ndim > 1 and newadjClose.shape[1] == len(newdates):
            quotes_NewSymbols = pd.DataFrame(newadjClose.swapaxes(0, 1),
                                             index=newdates,
                                             columns=newsymbols)
        else:
            quotes_NewSymbols = pd.DataFrame(newadjClose,
                                             index=newdates,
                                             columns=newsymbols)

    # Date range for the incremental update: last stored date .. tomorrow.
    if isinstance(lastdate, str):
        newquotesfirstdate = datetime.date(
            *[int(val) for val in lastdate.split('-')])
    else:
        newquotesfirstdate = lastdate
    newquoteslastdate = datetime.datetime.now() + datetime.timedelta(days=1)

    newadjClose, symbols, newdatearray = _return_quotes_array(
        filename, start_date=newquotesfirstdate, end_date=newquoteslastdate)
    print(" ...inside UpdateSymbols_inHDF5... newadjClose.shape = ",
          newadjClose.shape)
    print(" ...inside UpdateSymbols_inHDF5... len(symbols) = ",
          len(symbols))
    print(" ...inside UpdateSymbols_inHDF5... quote.shape = ", quote.shape)
    newdates = [str(d) for d in newdatearray]
    quoteupdate = pd.DataFrame(newadjClose, index=newdates, columns=symbols)
    # Freshly fetched values take precedence; stored quotes fill the rest.
    updatedquotes = quoteupdate.combine_first(quote)

    # Clean up quotes
    #  - infill interior NaN values using nearest good values to linearly
    #    interpolate
    #  - copy first valid quote to all earlier positions
    for ii, isymbolupdate in enumerate(list(updatedquotes.columns.values)):
        xupdate = updatedquotes[isymbolupdate].values
        # Report the number of NaNs, matching the label (the previous
        # expression printed the shape of the non-NaN values instead).
        print(" ... progress: ii, symbol, # nans = ", ii, isymbolupdate,
              int(np.isnan(xupdate).sum()))
        xupdate = cleanspikes(xupdate)
        xupdate = cleantobeginning(xupdate)
        xupdate = cleantoend(xupdate)
        xupdate = interpolate(xupdate, verbose=True)
        xupdate = cleantobeginning(xupdate)
        updatedquotes[isymbolupdate] = xupdate

    if new_symbols:
        # DataFrame.info() prints its own report and returns None, so it is
        # called on its own line instead of being passed to print().
        print("\n\n\n...quotes_NewSymbols = ")
        quotes_NewSymbols.info()
        print("\n\n\n...updatedquotes = ")
        updatedquotes.info()
        for isymbol in new_symbols:
            updatedquotes[isymbol] = quotes_NewSymbols[isymbol]
        print("\n\n\n...merged updatedquotes = ")
        updatedquotes.info()

    # Synthetic CASH series, built at a scale of 100000 and divided back
    # down, stepping up by .01 (scaled) every 10th row.  At i == 0 the index
    # i - 1 wraps to the last element (still 100000. at that point).  The
    # loop is kept as-is so stored values stay bit-identical.
    CASHadjClose = np.ones((len(updatedquotes.index)), float) * 100000.
    for i in range(CASHadjClose.shape[0]):
        if i % 10 == 0:
            CASHadjClose[i] = CASHadjClose[i - 1] + .01
        else:
            CASHadjClose[i] = CASHadjClose[i - 1]
    updatedquotes['CASH'] = CASHadjClose / 100000.

    # Write quotes to disk.
    dirname = os.path.join(os.getcwd(), "symbols")
    hdf5filename = os.path.join(dirname, listname + "_.hdf5")
    print("hdf5 filename = ", hdf5filename)
    updatedquotes.to_hdf(hdf5filename,
                         listname,
                         mode='a',
                         format='table',
                         append=False,
                         complevel=5,
                         complib='blosc')
    return
def cleanup_quotes(symbols_file, newquotesfirstdate, newquoteslastdate):
    """Re-clean every quote series stored in the HDF5 archive, in place.

    Reads the archive that belongs to `symbols_file`, runs each symbol's
    column through cleanspikes / interpolate / cleantobeginning, and writes
    the result back to the same HDF5 file and key.

    Args:
        symbols_file: Path of the symbol list; its base name selects the
            archive key and file name.
        newquotesfirstdate: Only echoed to stdout by this function.
        newquoteslastdate: Only echoed to stdout by this function.

    Returns:
        None. The cleaned quotes are written to
        ``<cwd>/symbols/<listname>_.hdf5``.
    """
    from functions.TAfunctions import cleanspikes
    from functions.TAfunctions import interpolate
    from functions.TAfunctions import cleantobeginning

    print(" ... inside cleanup_quotes ...")
    print(" ... newquotesfirstdate = ", newquotesfirstdate)
    print(" ... newquoteslastdate = ", newquoteslastdate)

    # Derive the archive name from the symbols file name.
    directory_name, file_name = os.path.split(symbols_file)
    shortname, extension = os.path.splitext(file_name)

    print("file name for symbols = ", "_" + shortname + "_")
    print("file type for symbols = ", extension)

    # Known symbol lists map to fixed archive keys; anything else uses the
    # file's base name unchanged.
    known_listnames = {
        "symbols": "TAA-Symbols",
        "cmg_symbols": "CMG-Symbols",
        "Naz100_Symbols": "Naz100_Symbols",
        "biglist": "biglist-Symbols",
        "ETF_symbols": "ETF-Symbols",
        "ProvidentFundSymbols": "ProvidentFund-Symbols",
        "sp500_symbols": "SP500-Symbols",
    }
    listname = known_listnames.get(shortname, shortname)

    hdf5_directory = os.path.join(os.getcwd(), "symbols")
    hdf5filename = os.path.join(hdf5_directory, listname + "_.hdf5")

    print("")
    print("")
    print("symbol_directory = ", directory_name)
    print("symbols_file = ", symbols_file)
    print("shortname, extension = ", shortname, extension)
    print("hdf5filename = ", hdf5filename)

    dataframeFromHDF = pd.read_hdf(hdf5filename, listname)

    # Clean up stored quotes
    #  - infill interior NaN values using nearest good values to linearly
    #    interpolate
    #  - copy first valid quote to all earlier positions
    for isymbolupdate in list(dataframeFromHDF.columns.values):
        xupdate = dataframeFromHDF[isymbolupdate]
        print(" ... cleanup_quotes ... symbol = ", isymbolupdate)
        xupdate = cleanspikes(xupdate)
        xupdate = interpolate(xupdate)
        xupdate = cleantobeginning(xupdate)
        dataframeFromHDF[isymbolupdate] = xupdate.copy()

    dataframeFromHDF.to_hdf(hdf5filename,
                            listname,
                            mode='a',
                            format='table',
                            append=False,
                            complevel=5,
                            complib='blosc')
    return
def UpdateHDF5(symbol_directory, symbols_file):
    """Update the HDF5 quote archive for the symbols in `symbols_file`.

    Steps:
      1. Load the stored quotes and the last stored date.
      2. For symbols present in the list but missing from the archive,
         fetch quotes from 1991-01-01 through today.
      3. Fetch quotes for the whole list from the last stored date through
         tomorrow and merge them over the stored quotes (new values win).
      4. Clean every column (spikes, interior NaNs, leading gaps).
      5. Add the new-symbol columns and a synthetic 'CASH' column.
      6. Rewrite ``<cwd>/symbols/<listname>_.hdf5``.

    Args:
        symbol_directory: Directory that contains the symbols file.
        symbols_file: File name of the symbol list inside that directory.

    Returns:
        None. The updated quotes are written to disk.
    """
    import datetime

    from functions.TAfunctions import cleanspikes
    from functions.TAfunctions import interpolate
    from functions.TAfunctions import cleantobeginning

    filename = os.path.join(symbol_directory, symbols_file)
    _, _, _, quote, listname = loadQuotes_fromHDF(filename)

    # Last date already stored in the hdf5 archive.
    lastdate = getLastDateFromHDF5(symbol_directory, symbols_file)

    # Locate symbols added to the list that aren't in the HDF5 file yet.
    symbols_in_list = readSymbolList(filename, verbose=False)
    symbols_in_HDF5 = set(quote.columns.values)
    new_symbols = [s for s in symbols_in_list if s not in symbols_in_HDF5]

    if new_symbols:
        # arrayFromQuotesForList reads its symbols from a file, so the new
        # symbols are written to a temporary list first.  The `with` block
        # guarantees the file is flushed and closed before it is read back.
        tempfilename = os.path.join(symbol_directory,
                                    "newsymbols_tempfile.txt")
        with open(tempfilename, "w") as outfile:
            for isymbol in new_symbols:
                print("new symbol = ", isymbol)
                outfile.write(str(isymbol) + "\n")

        newquotesfirstdate = datetime.date(1991, 1, 1)
        newquoteslastdate = datetime.date.today()
        print("dates for new symbol found = ", newquotesfirstdate,
              newquoteslastdate)

        newadjClose, newsymbols, newdatearray = arrayFromQuotesForList(
            tempfilename, newquotesfirstdate, newquoteslastdate)
        print(" security values check: ",
              newadjClose[np.isnan(newadjClose)].shape)
        newdates = [str(d) for d in newdatearray]
        # The fetched array is swapped so that rows are dates and columns
        # are symbols.
        quotes_NewSymbols = pd.DataFrame(newadjClose.swapaxes(0, 1),
                                         index=newdates,
                                         columns=newsymbols)

    # Date range for the incremental update: last stored date .. tomorrow.
    if isinstance(lastdate, str):
        newquotesfirstdate = datetime.date(
            *[int(val) for val in lastdate.split('-')])
    else:
        newquotesfirstdate = lastdate
    newquoteslastdate = datetime.datetime.now() + datetime.timedelta(days=1)

    newadjClose, symbols, newdatearray = arrayFromQuotesForList(
        filename, newquotesfirstdate, newquoteslastdate)
    print(" security values check: ",
          newadjClose[np.isnan(newadjClose)].shape)
    newdates = [str(d) for d in newdatearray]
    quoteupdate = pd.DataFrame(newadjClose.swapaxes(0, 1),
                               index=newdates,
                               columns=symbols)
    # Freshly fetched values take precedence; stored quotes fill the rest.
    updatedquotes = quoteupdate.combine_first(quote)

    # Clean up quotes
    #  - infill interior NaN values using nearest good values to linearly
    #    interpolate
    #  - copy first valid quote to all earlier positions
    for isymbolupdate in list(updatedquotes.columns.values):
        xupdate = updatedquotes[isymbolupdate].values
        xupdate = cleanspikes(xupdate)
        xupdate = interpolate(xupdate)
        xupdate = cleantobeginning(xupdate)
        updatedquotes[isymbolupdate] = xupdate

    if new_symbols:
        # DataFrame.info() prints its own report and returns None, so it is
        # called on its own line instead of being passed to print().
        print("\n\n\n...quotes_NewSymbols = ")
        quotes_NewSymbols.info()
        print("\n\n\n...updatedquotes = ")
        updatedquotes.info()
        for isymbol in new_symbols:
            updatedquotes[isymbol] = quotes_NewSymbols[isymbol]
        print("\n\n\n...merged updatedquotes = ")
        updatedquotes.info()

    # Synthetic CASH series: starts from 100 and steps up by .01 every 10th
    # row.  At i == 0 the index i - 1 wraps to the last element (still 100.
    # at that point), so the series begins at 100.01.  The loop is kept
    # as-is so stored values stay bit-identical.
    CASHadjClose = np.ones((len(updatedquotes.index)), float) * 100.
    for i in range(CASHadjClose.shape[0]):
        if i % 10 == 0:
            CASHadjClose[i] = CASHadjClose[i - 1] + .01
        else:
            CASHadjClose[i] = CASHadjClose[i - 1]
    updatedquotes['CASH'] = CASHadjClose

    # Write quotes to disk.
    dirname = os.path.join(os.getcwd(), "symbols")
    hdf5filename = os.path.join(dirname, listname + "_.hdf5")
    print("hdf5 filename = ", hdf5filename)
    updatedquotes.to_hdf(hdf5filename,
                         listname,
                         mode='a',
                         format='table',
                         append=False,
                         complevel=5,
                         complib='blosc')
    return
def compareHDF_and_newquotes(symbols_file, newquotesfirstdate,
                             newquoteslastdate):
    """Report, per symbol, whether archived quotes match freshly fetched ones.

    Loads the HDF5 archive that belongs to `symbols_file`, fetches quotes
    for the same list between the two dates, cleans the fetched series, and
    for every symbol found in either source prints whether the two series
    are identical over their common date range.

    Args:
        symbols_file: Path of the symbol list; its base name selects the
            archive key and file name.
        newquotesfirstdate: First date passed to arrayFromQuotesForList.
        newquoteslastdate: Last date passed to arrayFromQuotesForList.

    Returns:
        None. Results are printed to stdout only.
    """
    from functions.TAfunctions import cleanspikes
    from functions.TAfunctions import interpolate
    from functions.TAfunctions import cleantobeginning

    def _first_change_index(row):
        # Index at which the series first leaves its initial value, taken as
        # the later of the first rise and first fall of row / row[0] beyond
        # a 1e-5 band.  Used to skip a leading run of repeated values.
        ratio = row / row[0]
        first_up = np.argmax(np.clip(ratio, 1., 1. + 1.e-5))
        first_down = np.argmin(np.clip(ratio, 1. - 1.e-5, 1.))
        return max(first_up, first_down)

    print(" ... inside compareHDF_and_newquotes ...")
    print(" ... newquotesfirstdate = ", newquotesfirstdate)
    print(" ... newquoteslastdate = ", newquoteslastdate)

    # Derive the archive name from the symbols file name.
    directory_name, file_name = os.path.split(symbols_file)
    shortname, extension = os.path.splitext(file_name)

    print("file name for symbols = ", "_" + shortname + "_")
    print("file type for symbols = ", extension)

    # Known symbol lists map to fixed archive keys; anything else uses the
    # file's base name unchanged.
    known_listnames = {
        "symbols": "TAA-Symbols",
        "cmg_symbols": "CMG-Symbols",
        "Naz100_Symbols": "Naz100_Symbols",
        "biglist": "biglist-Symbols",
        "ETF_symbols": "ETF-Symbols",
        "ProvidentFundSymbols": "ProvidentFund-Symbols",
        "sp500_symbols": "SP500-Symbols",
    }
    listname = known_listnames.get(shortname, shortname)

    hdf5_directory = os.path.join(os.getcwd(), "symbols")
    hdf5filename = os.path.join(hdf5_directory, listname + "_.hdf5")

    print("")
    print("")
    print("symbol_directory = ", directory_name)
    print("symbols_file = ", symbols_file)
    print("shortname, extension = ", shortname, extension)
    print("hdf5filename = ", hdf5filename)

    # Existing quotes from the archive, as a (symbols x dates) array.
    # `.values` replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    dataframeFromHDF = pd.read_hdf(hdf5filename, listname)
    x_hdf = dataframeFromHDF.values.swapaxes(0, 1)
    date_hdf = dataframeFromHDF.index
    symbols_hdf = list(dataframeFromHDF.columns.values)

    # New quotes for the same symbol list.
    newadjClose, newsymbols, newdatearray = arrayFromQuotesForList(
        symbols_file, newquotesfirstdate, newquoteslastdate)
    print(" security values check: ",
          newadjClose[np.isnan(newadjClose)].shape)
    newdates = [str(d) for d in newdatearray]
    dataframeFromInternet = pd.DataFrame(newadjClose.swapaxes(0, 1),
                                         index=newdates,
                                         columns=newsymbols)

    # Clean up fetched quotes
    #  - infill interior NaN values using nearest good values to linearly
    #    interpolate
    #  - copy first valid quote to all earlier positions
    # The cleaned series is stored back (as cleanup_quotes does) so that the
    # comparison below actually sees the cleaned values.
    for isymbolupdate in list(dataframeFromInternet.columns.values):
        xupdate = dataframeFromInternet[isymbolupdate]
        print(" isymbolupdate,xupdate = ", isymbolupdate, xupdate.values)
        xupdate = cleanspikes(xupdate)
        xupdate = interpolate(xupdate)
        xupdate = cleantobeginning(xupdate)
        dataframeFromInternet[isymbolupdate] = xupdate

    x_net = dataframeFromInternet.values.swapaxes(0, 1)
    date_net = dataframeFromInternet.index
    symbols_net = list(dataframeFromInternet.columns.values)

    # Date lookups are done by position; build the lists once, not once per
    # symbol.
    dates_hdf = list(date_hdf)
    dates_net = list(date_net)

    for isymbol in sorted(set(symbols_hdf + symbols_net)):
        # Find the date range shared by the archive and the fetched quotes.
        try:
            hdf_index = symbols_hdf.index(isymbol)
            net_index = symbols_net.index(isymbol)
            firstindex_hdf = _first_change_index(x_hdf[hdf_index, :])
            firstindex_net = _first_change_index(x_net[net_index, :])
            firstDate = max(date_net[firstindex_net],
                            date_hdf[firstindex_hdf])
            lastDate = min(date_net[-1], date_hdf[-1])
            values_hdf = x_hdf[hdf_index,
                               dates_hdf.index(firstDate):
                               dates_hdf.index(lastDate) + 1]
            values_net = x_net[net_index,
                               dates_net.index(firstDate):
                               dates_net.index(lastDate) + 1]
        except (ValueError, IndexError, TypeError):
            # ValueError: symbol or date missing from one source;
            # IndexError: an empty series; TypeError: dates of the two
            # sources are not comparable.
            print(" ... **** **** symbol ", format(isymbol, '5s'),
                  " not matched in hdf and update from internet")
            continue

        # `False in a == b` is a chained comparison, (False in a) and
        # (a == b), and never tested element-wise equality; array_equal
        # does (and also reports a length mismatch as a difference).
        if not np.array_equal(values_hdf, values_net):
            print(" ... **** symbol ", format(isymbol, '5s'),
                  " is different in hdf and update from internet (",
                  firstDate, " to ", lastDate, " )")
        else:
            print(" ... symbol ", format(isymbol, '5s'),
                  " is same in hdf and update from internet (", firstDate,
                  " to ", lastDate, " )")
    return