def get_data_for_exchange (exchange, data_path): #data_access= da.DataAccess('norgate') #symbol_list= data_access.get_all_symbols() data_path= data_path + "/Raw/Yahoo/US/" + str (exchange) + "/" #Create path if it doesn't exist if not (os.access(data_path, os.F_OK)): os.makedirs(data_path) utils.clean_paths(data_path) symbol_list= list() print "Getting list of stocks.." try: nasdaq_params= urllib.urlencode ({'exchange':str(exchange), 'render':'download'}) nasdaq_get= urllib2.urlopen ('http://www.nasdaq.com/screening/companies-by-name.aspx?%s' % nasdaq_params) symbol_list.append (nasdaq_get.readline()) #Now we have all the data in a list- but we need only the symbols so we remove the rest while (len (symbol_list[-1]) > 0): symbol_list.append (nasdaq_get.readline()) #while ends symbol_list.pop(0) #This is just the word "symbol" and not a symbol itself symbol_list.pop(-1) #Remove the last element because its going to be blank anyway #symbol_list = map(lambda x:(x.partition(str(","))[0]),symbol_list) #Get the stuff before the first comma- which is the symbol #Unfortunately this symbol is in quotes. So we have to remove them now symbol_list = map(lambda x:(x.partition(str("\""))[2]),symbol_list) #Keep the stuff only after the first " symbol_list = map(lambda x:(x.partition(str("\""))[0]),symbol_list) #Keep the stuff before the second " except urllib2.HTTPError: print "Unable to get list of stocks from server. Please check your internet connection and retry." except: print"Unknown error occoured when getting list of stocks from server." print "Got " + str (len(symbol_list)) + " symbols. Now getting symbol data..." 
_now =datetime.datetime.now(); miss_ctr=0; #Counts how many symbols we could get for symbol in symbol_list: symbol_data=list() print "Getting " + str (symbol) try: params= urllib.urlencode ({'a':03, 'b':12, 'c':2000, 'd':_now.month, 'e':_now.day, 'f':_now.year, 's': str(symbol)}) url_get= urllib2.urlopen("http://ichart.finance.yahoo.com/table.csv?%s" % params) header= url_get.readline() symbol_data.append (url_get.readline()) while (len(symbol_data[-1]) > 0): symbol_data.append(url_get.readline()) # print str(symbol_data[-1]) symbol_data.pop(-1) #The last element is going to be the string of length zero. We don't want to write that to file. #To change adjusted close so that Yahoo data is same as Norgate data symbol_data= map (adjust, symbol_data) #Following changes so that the data looks like Norgate data and the change to cav_to_pkl.csv is minimized symbol_data = map(lambda x:(x.replace("-", "")),symbol_data) symbol_data = map(lambda x:(str(symbol) + "," + str(x)) ,symbol_data) #This means that the header is wrong but since it is ignored later anyways- this will work #now writing data to file f= open (data_path + symbol + ".csv", 'w') #Writing the header f.write (header) while (len(symbol_data) > 0): f.write (symbol_data.pop()) f.close();
def main():
    """Convert raw Norgate CSV files (exchange and index folders) into
    pickled numpy arrays under QSDATA/Processed/Norgate/Stocks/US."""
    print("Starting..." + str(time.strftime("%H:%M:%S")))
    try:
        rootdir = os.environ['QSDATA']
    except KeyError:
        print("Please be sure to set the value for QSDATA in config.sh or local.sh\n")
        # FIX: rootdir was left unbound here, causing a NameError below;
        # fail fast with the clear message instead.
        sys.exit("FAILURE")

    fileExtensionToRemove = ".csv"
    rootindir = rootdir + "/Raw/Norgate/Stocks/CSV/US"
    rootoutdir = rootdir + "/Processed/Norgate/Stocks/US"

    # Exchange folders map 1:1 from input to output; every index folder maps
    # to the single flat "/Indices/" output folder.  Order matters: input and
    # output lists must stay parallel.
    exchange_dirs = ["AMEX", "Delisted Securities", "NASDAQ", "NYSE",
                     "NYSE Arca", "OTC"]
    index_dirs = ["ADRs", "AMEX", "CBOE", "Dow Jones Americas",
                  "Dow Jones Averages", "Dow Jones Broad Market",
                  "Dow Jones Misc", "Dow Jones Style",
                  "Dow Jones US Industries", "Dow Jones US Sectors",
                  "Dow Jones US Subsectors", "Dow Jones US Supersectors",
                  "ISE", "Merrill Lynch", "Misc", "Morgan Stanley",
                  "NASDAQ", "NYSE", "NYSE Arca", "PHLX", "Russell", "S&P",
                  "S&P 500 Industries", "S&P 500 Industry Groups",
                  "S&P 500 Sectors", "S&P 500 Sub-Industries",
                  "S&P Technology", "S&P TSX", "Wilshire"]

    listOfInputPaths = [rootindir + "/" + d + "/" for d in exchange_dirs]
    listOfInputPaths += [rootindir + "/Indices/" + d + "/" for d in index_dirs]
    listOfOutputPaths = [rootoutdir + "/" + d + "/" for d in exchange_dirs]
    listOfOutputPaths += [rootoutdir + "/Indices/"] * len(index_dirs)

    # If the output paths don't exist, then create them (recursively)...
    for path in listOfOutputPaths:
        if not os.access(path, os.F_OK):
            os.makedirs(path)
    # In case there are already some files there - remove them.  This removes
    # all the pkl files from the previous run.
    utils.clean_paths(listOfOutputPaths)

    if len(listOfInputPaths) != len(listOfOutputPaths):
        print("No. of input paths not equal to the number of output paths.. quitting")
        sys.exit("FAILURE")

    use_cols = list(range(1, 7 + 1))  # use cols 1 to 7 (col 0 is the date)
    for path_ctr, path in enumerate(listOfInputPaths):
        # FIX: dircache was removed in Python 3; sorted(os.listdir(...)) is
        # the equivalent sorted directory listing.
        stocks_at_this_path = sorted(os.listdir(str(path)))
        # Next, throw away everything that is not a .csv and these are our
        # stocks!  Example: this should throw away the '$' folder in the NYSE
        # folder.  (FIX: the "Example:" sentence was a stray non-comment line.)
        filtered_names = [x for x in stocks_at_this_path
                          if str(x).find(fileExtensionToRemove) > -1]
        # Now, we remove the .csv to get the name of the stock
        filtered_names = [x.partition(fileExtensionToRemove)[0]
                          for x in filtered_names]
        for stock_ctr, stock in enumerate(filtered_names):
            print("csv_to_pkl: processing: " + str(path + stock))
            # Read the stock data from the CSV file, skipping the header row.
            # FIX: np.float was removed from numpy; builtin float is the
            # documented replacement.
            stock_data = np.loadtxt(path + stock + ".csv", float, None,
                                    ",", None, 1, use_cols)
            with open(listOfOutputPaths[path_ctr]
                      + filtered_names[stock_ctr] + ".pkl", "wb") as f:
                pkl.dump(stock_data, f, -1)
    print("Finished..." + str(time.strftime("%H:%M:%S")))
def convert ():
    '''
    @summary: Converts a Compustat CSV file to pickle files of numpy arrays.

    Reads QSDATA/Raw/Compustat/Compustat.csv row by row, keeps only the
    columns whose labels are not in lsBadLabels, groups rows by ticker
    (restricted to NYSE/NASDAQ/AMEX membership from Norgate), and writes one
    pickled numpy array per ticker.  Periodically spills arrays to disk to
    bound memory use, re-loading and deleting the on-disk file if a ticker
    reappears later in the CSV.
    '''
    print("Starting..."+ str(time.strftime("%H:%M:%S")))
    ''' Write every so often to save memory, 20k lines is usually < .5GB '''
    lSaveMem = 20000
    try:
        rootdir = os.environ['QSDATA']
    except KeyError:
        # NOTE(review): rootdir stays unbound on this path; the fFile
        # assignment below would then raise NameError — confirm intended.
        print("Please be sure to set the value for QSDATA in config.sh or local.sh\n")
    ''' Create lists of input and output paths '''
    fFile = ( rootdir + "/Raw/Compustat/Compustat.csv")
    listOfOutputPaths= []
    listOfOutputPaths.append(rootdir + "/Processed/Compustat/US/AMEX/")
    listOfOutputPaths.append(rootdir + "/Processed/Compustat/US/NASDAQ/")
    listOfOutputPaths.append(rootdir + "/Processed/Compustat/US/NYSE/")
    #If the output paths don't exist, then create them...
    for path in listOfOutputPaths:
        if not (os.access(path, os.F_OK)):
            #Path does not exist, so create it
            os.makedirs(path) #Makes paths recursively
    #done making all output paths!
    #In case there are already some files there- remove them. This will remove all the pkl fils from the previous run
    utils.clean_paths (listOfOutputPaths)
    # NOTE(review): binary mode ('rb') with csv.reader only works on
    # Python 2; Python 3 requires text mode with newline='' — confirm the
    # target interpreter for this (otherwise py3-styled) function.
    spamReader = csv.reader(open(fFile, 'rb'), delimiter=',')
    ''' 1378625 rows typical '''
    ''' Take first row as the labels '''
    for row in spamReader:
        lsLabels = row
        break
    ''' Generated from _Analyze() '''
    # Columns to drop: addresses, free-text metadata, and the specially
    # handled 'datadate'/'tic' columns.
    lsBadLabels = set(['LOC', 'ADD4', 'ADD3', 'ADD2', 'ADD1', 'ACCTCHGQ', 'WEBURL', 'IDBFLAG', 'popsrc', 'DATACQTR', 'conm', 'COSTAT', 'FINALQ', 'fdateq', 'FAX', 'RP', 'PRIROW', 'dldte', 'indfmt', 'SPCSRC', 'BUSDESC', 'ipodate', 'PHONE', 'CURCDQ', 'pdateq', 'DATAFQTR', 'PRICAN', 'EIN', 'datadate', 'tic', 'ADDZIP', 'CONML', 'consol', 'datafmt', 'cusip', 'BSPRQ', 'OGMQ', 'COMPSTQ', 'COUNTY', 'STATE', 'CURNCDQ', 'CITY', 'rdq', 'apdedateq', 'STALTQ', 'INCORP'])
    ''' get list of stocks in 3 US indexes '''
    Access = da.DataAccess( 'Norgate' )
    setNyse = set( Access.get_symbols_in_sublist("/US/NYSE") )
    setNasdaq = set( Access.get_symbols_in_sublist("/US/NASDAQ") )
    setAmex = set( Access.get_symbols_in_sublist("/US/AMEX") )
    ''' If stock appears in more than one index, remove to avoid ambiguity '''
    print('Ignoring duplicate stocks:', end=' ')
    dup1 = setNyse.intersection( setNasdaq.union(setAmex))
    dup2 = setNasdaq.intersection( setAmex )
    print(dup1.union(dup2))
    setNyse = setNyse - dup1.union(dup2)
    setAmex = setAmex - dup1.union(dup2)
    setNasdaq = setNasdaq - dup1.union(dup2)
    ''' Note the two lists below must be in the same order '''
    lsOutPaths = []
    lsOutPaths.append(rootdir + "/Processed/Compustat/US/AMEX/")
    lsOutPaths.append(rootdir + "/Processed/Compustat/US/NASDAQ/")
    lsOutPaths.append(rootdir + "/Processed/Compustat/US/NYSE/")
    # lSets[i] corresponds to lsOutPaths[i]
    lSets = [setAmex, setNasdaq, setNyse]
    #If the output paths don't exist, then create them...
    for path in lsOutPaths:
        if not (os.access(path, os.F_OK)):
            #Path does not exist, so create it
            os.makedirs(path) #Makes paths recursively
    #done making all output paths!
    #In case there are already some files there- remove them. This will remove all the pkl fils from the previous run
    utils.clean_paths (lsOutPaths)
    lDateCol = 0
    llUseCols = []
    ''' We have first row (the labels), loop through saving good label indicies '''
    for j, sElem in enumerate(lsLabels):
        if( sElem not in lsBadLabels ):
            llUseCols.append(j)
        ''' Keep track of ticker column and date, specially handled later '''
        if( sElem == 'datadate' ):
            lDateCol = j
        if( sElem == 'tic' ):
            # NOTE(review): lTicCol is only bound if a 'tic' label exists;
            # row[lTicCol] below would raise NameError otherwise — confirm
            # the CSV always has this column.
            lTicCol = j
    ''' Dict of ticker->numpy array mapping '''
    dData = dict()
    print('')
    ''' Main loop, iterate over the rows in the csv file '''
    for j, row in enumerate(spamReader):
        # NOTE(review): overwriting lsLabels each row looks vestigial;
        # lsLabels is not read again after this point.
        lsLabels = row
        sTic = row[lTicCol]
        ''' Find out what index this belongs to '''
        lIndex = -1
        for i, symSet in enumerate( lSets ):
            if sTic in symSet:
                lIndex = i
                break
        if lIndex == -1:
            # Ticker not in any of the three (de-duplicated) exchanges: skip
            continue
        sFilename = lsOutPaths[lIndex] + sTic + '.pkl'
        ''' If the file exists (temporary memory saving measure), read it in and delete file from disk '''
        if( os.path.isfile(sFilename)):
            if sTic in dData:
                print('File should not be both on disk and in dict')
                sys.exit("FAILURE")
            fIn = open( sFilename, 'rb' )
            dData[sTic] = pkl.load( fIn )
            fIn.close()
            os.remove( sFilename )
        # Date becomes a float of the form YYYYMMDD in column 0
        fDate = float( dt.datetime.strptime( row[lDateCol], "%m/%d/%Y").strftime("%Y%m%d") )
        ''' convert blanks to nans '''
        for i in llUseCols:
            if row[i] == '':
                row[i] = 'nan'
        ''' Add row if data exists, if not, create new array '''
        if sTic in dData:
            dData[sTic] = np.vstack( (dData[sTic], np.array([fDate] + [row[i] for i in llUseCols], dtype=np.float)) )
        else:
            dData[sTic]= np.array( [fDate] + [row[i] for i in llUseCols], dtype=np.float )
        # Progress indicator roughly every 1000 rows
        if( (j+1) % 1000 == 0):
            fDone = (j / 1378625.0) * 100
            print('\rApprox %.2lf%%'%((j / 1378625.0) * 100), end=' ')
        # Spill accumulated arrays to disk every lSaveMem rows
        if( (j+1) % lSaveMem == 0):
            ''' Write all the pickle files we currently have '''
            print('\nWriting %i lines to pickle files to save memory\n'%(lSaveMem))
            _dumpFiles( dData, lSets, lsOutPaths)
            ''' Remember to delete! '''
            del dData
            dData = dict()
        # Done writing files
    # Done with main loop
    print('')
    print('Writing final pickle files\n')
    _dumpFiles( dData, lSets, lsOutPaths)
    del dData
    print("Finished..."+ str(time.strftime("%H:%M:%S")))
    return
def main (): print "Starting..."+ str(time.strftime("%H:%M:%S")) try: rootdir = os.environ['QSDATA'] except KeyError: #rootdir = "/hzr71/research/QSData" print "Please be sure to set the value for QSDATA in config.sh or local.sh\n" fileExtensionToRemove = ".csv" listOfInputPaths= list() #For Gekko rootindir = rootdir + "/Raw/Norgate/Stocks/CSV/US" listOfInputPaths.append (rootindir + "/AMEX/") listOfInputPaths.append (rootindir + "/Delisted Securities/") listOfInputPaths.append (rootindir + "/NASDAQ/") listOfInputPaths.append (rootindir + "/NYSE/") listOfInputPaths.append (rootindir + "/NYSE Arca/") listOfInputPaths.append (rootindir + "/OTC/") listOfInputPaths.append (rootindir + "/Indices/ADRs/") listOfInputPaths.append (rootindir + "/Indices/AMEX/") listOfInputPaths.append (rootindir + "/Indices/CBOE/") listOfInputPaths.append (rootindir + "/Indices/Dow Jones Americas/") listOfInputPaths.append (rootindir + "/Indices/Dow Jones Averages/") listOfInputPaths.append (rootindir + "/Indices/Dow Jones Broad Market/") listOfInputPaths.append (rootindir + "/Indices/Dow Jones Misc/") listOfInputPaths.append (rootindir + "/Indices/Dow Jones Style/") listOfInputPaths.append (rootindir + "/Indices/Dow Jones US Industries/") listOfInputPaths.append (rootindir + "/Indices/Dow Jones US Sectors/") listOfInputPaths.append (rootindir + "/Indices/Dow Jones US Subsectors/") listOfInputPaths.append (rootindir + "/Indices/Dow Jones US Supersectors/") listOfInputPaths.append (rootindir + "/Indices/ISE/") listOfInputPaths.append (rootindir + "/Indices/Merrill Lynch/") listOfInputPaths.append (rootindir + "/Indices/Misc/") listOfInputPaths.append (rootindir + "/Indices/Morgan Stanley/") listOfInputPaths.append (rootindir + "/Indices/NASDAQ/") listOfInputPaths.append (rootindir + "/Indices/NYSE/") listOfInputPaths.append (rootindir + "/Indices/NYSE Arca/") listOfInputPaths.append (rootindir + "/Indices/PHLX/") listOfInputPaths.append (rootindir + "/Indices/Russell/") 
listOfInputPaths.append (rootindir + "/Indices/S&P/") listOfInputPaths.append (rootindir + "/Indices/S&P 500 Industries/") listOfInputPaths.append (rootindir + "/Indices/S&P 500 Industry Groups/") listOfInputPaths.append (rootindir + "/Indices/S&P 500 Sectors/") listOfInputPaths.append (rootindir + "/Indices/S&P 500 Sub-Industries/") listOfInputPaths.append (rootindir + "/Indices/S&P Technology/") listOfInputPaths.append (rootindir + "/Indices/S&P TSX/") listOfInputPaths.append (rootindir + "/Indices/Wilshire/") rootoutdir = rootdir + "/Processed/Norgate/Stocks/US" listOfOutputPaths= list() listOfOutputPaths.append(rootoutdir + "/AMEX/") listOfOutputPaths.append(rootoutdir + "/Delisted Securities/") listOfOutputPaths.append(rootoutdir + "/NASDAQ/") listOfOutputPaths.append(rootoutdir + "/NYSE/") listOfOutputPaths.append(rootoutdir + "/NYSE Arca/") listOfOutputPaths.append(rootoutdir + "/OTC/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") 
listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") listOfOutputPaths.append(rootoutdir + "/Indices/") #If the output paths don't exist, then create them... for path in listOfOutputPaths: if not (os.access(path, os.F_OK)): #Path does not exist, so create it os.makedirs(path) #done making all output paths! #In case there are already some files there- remove them. This will remove all the pkl fils from the previous run utils.clean_paths (listOfOutputPaths) if (len(listOfInputPaths)!= len(listOfOutputPaths)): print "No. of input paths not equal to the number of output paths.. quitting" sys.exit("FAILURE") #if ends path_ctr = -1; use_cols = range (1, 7 + 1) # will now use cols 1 to 7 for path in listOfInputPaths: path_ctr = path_ctr + 1; stocks_at_this_path = dircache.listdir(str(path)) #Next, throw away everything that is not a .csv And these are our stocks! 
Example: this should throw away the '$' folder in the NYSE folder filtered_names= filter (lambda x:(str(x).find(str(fileExtensionToRemove)) > -1), stocks_at_this_path) #Now, we remove the .csv to get the name of the stock filtered_names = map(lambda x:(x.partition(str(fileExtensionToRemove))[0]),filtered_names) stock_ctr = -1 for stock in filtered_names: stock_ctr = stock_ctr + 1 print "csv_to_pkl: processing: " + str (path + stock) #read in the stock date from the CSV file stock_data= np.loadtxt (path + stock+".csv", np.float, None, ",", None, 1, use_cols) stock_data_shape = stock_data.shape #print "stock_data_shape is: " + str(stock_data_shape) f = open (listOfOutputPaths[path_ctr] + filtered_names[stock_ctr] + ".pkl", "wb" ) pkl.dump (stock_data, f, -1) f.close() #for stock in stocks_at_this_path ends #for path in listOfInputPaths ends print "Finished..."+ str(time.strftime("%H:%M:%S"))
def get_data_for_exchange(exchange, data_path): # data_access= da.DataAccess('norgate') # symbol_list= data_access.get_all_symbols() data_path = data_path + "/Raw/Yahoo/US/" + str(exchange) + "/" # Create path if it doesn't exist if not (os.access(data_path, os.F_OK)): os.makedirs(data_path) utils.clean_paths(data_path) symbol_list = list() print "Getting list of stocks.." try: nasdaq_params = urllib.urlencode({"exchange": str(exchange), "render": "download"}) nasdaq_get = urllib2.urlopen("http://www.nasdaq.com/screening/companies-by-name.aspx?%s" % nasdaq_params) symbol_list.append( nasdaq_get.readline() ) # Now we have all the data in a list- but we need only the symbols so we remove the rest while len(symbol_list[-1]) > 0: symbol_list.append(nasdaq_get.readline()) # while ends symbol_list.pop(0) # This is just the word "symbol" and not a symbol itself symbol_list.pop(-1) # Remove the last element because its going to be blank anyway # symbol_list = map(lambda x:(x.partition(str(","))[0]),symbol_list) #Get the stuff before the first comma- which is the symbol # Unfortunately this symbol is in quotes. So we have to remove them now symbol_list = map(lambda x: (x.partition(str('"'))[2]), symbol_list) # Keep the stuff only after the first " symbol_list = map(lambda x: (x.partition(str('"'))[0]), symbol_list) # Keep the stuff before the second " except urllib2.HTTPError: print "Unable to get list of stocks from server. Please check your internet connection and retry." except: print "Unknown error occoured when getting list of stocks from server." print "Got " + str(len(symbol_list)) + " symbols. Now getting symbol data..." 
_now = datetime.datetime.now() miss_ctr = 0 # Counts how many symbols we could get for symbol in symbol_list: symbol_data = list() print "Getting " + str(symbol) try: params = urllib.urlencode( {"a": 03, "b": 12, "c": 2000, "d": _now.month, "e": _now.day, "f": _now.year, "s": str(symbol)} ) url_get = urllib2.urlopen("http://ichart.finance.yahoo.com/table.csv?%s" % params) header = url_get.readline() symbol_data.append(url_get.readline()) while len(symbol_data[-1]) > 0: symbol_data.append(url_get.readline()) # print str(symbol_data[-1]) symbol_data.pop( -1 ) # The last element is going to be the string of length zero. We don't want to write that to file. # To change adjusted close so that Yahoo data is same as Norgate data symbol_data = map(adjust, symbol_data) # Following changes so that the data looks like Norgate data and the change to cav_to_pkl.csv is minimized symbol_data = map(lambda x: (x.replace("-", "")), symbol_data) symbol_data = map( lambda x: (str(symbol) + "," + str(x)), symbol_data ) # This means that the header is wrong but since it is ignored later anyways- this will work # now writing data to file f = open(data_path + symbol + ".csv", "w") # Writing the header f.write(header) while len(symbol_data) > 0: f.write(symbol_data.pop()) f.close()
def main (): print "Starting..."+ str(time.strftime("%H:%M:%S")) try: rootdir = os.environ['QSDATA'] except KeyError: #rootdir = "/hzr71/research/QSData" print "Please be sure to set the value for QSDATA in config.sh or local.sh\n" fileExtensionToRemove = ".csv" listOfInputPaths= list() #For Gekko #listOfInputPaths.append("/hzr71/research/QSData/Processed/Norgate/raw/Delisted Securities/US Recent/") listOfInputPaths.append (rootdir + "/Raw/Yahoo/US/AMEX/") listOfInputPaths.append (rootdir + "/Raw/Yahoo/US/NASDAQ/") listOfInputPaths.append (rootdir + "/Raw/Yahoo/US/NYSE/") listOfOutputPaths= list() # listOfOutputPaths.append("C:\\test\\temp\\pkl1\\") # listOfOutputPaths.append("C:\\test\\temp\\pkl2\\") #listOfOutputPaths.append(rootdir + "/Norgate/Delisted Securities/US Recent/") listOfOutputPaths.append(rootdir + "/Processed/Yahoo/US/AMEX/") listOfOutputPaths.append(rootdir + "/Processed/Yahoo/US/NASDAQ/") listOfOutputPaths.append(rootdir + "/Processed/Yahoo/US/NYSE/") #If the output paths don't exist, then create them... for path in listOfOutputPaths: if not (os.access(path, os.F_OK)): #Path does not exist, so create it os.makedirs(path) #Makes paths recursively #done making all output paths! #In case there are already some files there- remove them. This will remove all the pkl fils from the previous run utils.clean_paths (listOfOutputPaths) if (len(listOfInputPaths)!= len(listOfOutputPaths)): print "No. of input paths not equal to the number of output paths.. quitting" sys.exit("FAILURE") #if ends path_ctr = -1; use_cols = range (1, 7 + 1) # will now use cols 1 to 7 for path in listOfInputPaths: path_ctr = path_ctr + 1; stocks_at_this_path = dircache.listdir(str(path)) #Next, throw away everything that is not a .csv And these are our stocks! 
Example: this should throw away the '$' folder in the NYSE folder filtered_names= filter (lambda x:(str(x).find(str(fileExtensionToRemove)) > -1), stocks_at_this_path) #Now, we remove the .csv to get the name of the stock filtered_names = map(lambda x:(x.partition(str(fileExtensionToRemove))[0]),filtered_names) stock_ctr = -1 for stock in filtered_names: stock_ctr = stock_ctr + 1 print "Reading file: " + str (path + stock) #read in the stock date from the CSV file stock_data= np.loadtxt (path + stock+".csv", np.float, None, ",", None, 1, use_cols) stock_data_shape = stock_data.shape #print "stock_data_shape is: " + str(stock_data_shape) # for i in range (0, stock_data_shape[0]): # print stock_data [i,: ] # print "Reading: " + str(stock) f = open (listOfOutputPaths[path_ctr] + filtered_names[stock_ctr] + ".pkl", "wb" ) pkl.dump (stock_data, f, -1) f.close() #for stock in stocks_at_this_path ends #for path in listOfInputPaths ends print "Finished..."+ str(time.strftime("%H:%M:%S"))
def main(): print "Starting..." + str(time.strftime("%H:%M:%S")) try: rootdir = os.environ['QSDATA'] except KeyError: #rootdir = "/hzr71/research/QSData" print "Please be sure to set the value for QSDATA in config.sh or local.sh\n" fileExtensionToRemove = ".csv" listOfInputPaths = list() #For Gekko #listOfInputPaths.append("/hzr71/research/QSData/Processed/Norgate/raw/Delisted Securities/US Recent/") listOfInputPaths.append(rootdir + "/Raw/Yahoo/US/AMEX/") listOfInputPaths.append(rootdir + "/Raw/Yahoo/US/NASDAQ/") listOfInputPaths.append(rootdir + "/Raw/Yahoo/US/NYSE/") listOfOutputPaths = list() # listOfOutputPaths.append("C:\\test\\temp\\pkl1\\") # listOfOutputPaths.append("C:\\test\\temp\\pkl2\\") #listOfOutputPaths.append(rootdir + "/Norgate/Delisted Securities/US Recent/") listOfOutputPaths.append(rootdir + "/Processed/Yahoo/US/AMEX/") listOfOutputPaths.append(rootdir + "/Processed/Yahoo/US/NASDAQ/") listOfOutputPaths.append(rootdir + "/Processed/Yahoo/US/NYSE/") #If the output paths don't exist, then create them... for path in listOfOutputPaths: if not (os.access(path, os.F_OK)): #Path does not exist, so create it os.makedirs(path) #Makes paths recursively #done making all output paths! #In case there are already some files there- remove them. This will remove all the pkl fils from the previous run utils.clean_paths(listOfOutputPaths) if (len(listOfInputPaths) != len(listOfOutputPaths)): print "No. of input paths not equal to the number of output paths.. quitting" sys.exit("FAILURE") #if ends path_ctr = -1 use_cols = range(1, 7 + 1) # will now use cols 1 to 7 for path in listOfInputPaths: path_ctr = path_ctr + 1 stocks_at_this_path = dircache.listdir(str(path)) #Next, throw away everything that is not a .csv And these are our stocks! 
Example: this should throw away the '$' folder in the NYSE folder filtered_names = filter( lambda x: (str(x).find(str(fileExtensionToRemove)) > -1), stocks_at_this_path) #Now, we remove the .csv to get the name of the stock filtered_names = map( lambda x: (x.partition(str(fileExtensionToRemove))[0]), filtered_names) stock_ctr = -1 for stock in filtered_names: stock_ctr = stock_ctr + 1 print "Reading file: " + str(path + stock) #read in the stock date from the CSV file stock_data = np.loadtxt(path + stock + ".csv", np.float, None, ",", None, 1, use_cols) stock_data_shape = stock_data.shape #print "stock_data_shape is: " + str(stock_data_shape) # for i in range (0, stock_data_shape[0]): # print stock_data [i,: ] # print "Reading: " + str(stock) f = open( listOfOutputPaths[path_ctr] + filtered_names[stock_ctr] + ".pkl", "wb") pkl.dump(stock_data, f, -1) f.close() #for stock in stocks_at_this_path ends #for path in listOfInputPaths ends print "Finished..." + str(time.strftime("%H:%M:%S"))
def convert ():
    '''
    @summary: Converts a Compustat CSV file to pickle files of numpy arrays.

    Reads QSDATA/Raw/Compustat/Compustat.csv row by row, keeps only the
    columns whose labels are not in lsBadLabels, groups rows by ticker
    (restricted to NYSE/NASDAQ/AMEX membership from Norgate), and writes one
    pickled numpy array per ticker.  Periodically spills arrays to disk to
    bound memory use, re-loading and deleting the on-disk file if a ticker
    reappears later in the CSV.
    '''
    print "Starting..."+ str(time.strftime("%H:%M:%S"))
    ''' Write every so often to save memory, 20k lines is usually < .5GB '''
    lSaveMem = 20000
    try:
        rootdir = os.environ['QSDATA']
    except KeyError:
        # NOTE(review): rootdir stays unbound on this path; the fFile
        # assignment below would then raise NameError — confirm intended.
        print "Please be sure to set the value for QSDATA in config.sh or local.sh\n"
    ''' Create lists of input and output paths '''
    fFile = ( rootdir + "/Raw/Compustat/Compustat.csv")
    listOfOutputPaths= []
    listOfOutputPaths.append(rootdir + "/Processed/Compustat/US/AMEX/")
    listOfOutputPaths.append(rootdir + "/Processed/Compustat/US/NASDAQ/")
    listOfOutputPaths.append(rootdir + "/Processed/Compustat/US/NYSE/")
    #If the output paths don't exist, then create them...
    for path in listOfOutputPaths:
        if not (os.access(path, os.F_OK)):
            #Path does not exist, so create it
            os.makedirs(path) #Makes paths recursively
    #done making all output paths!
    #In case there are already some files there- remove them. This will remove all the pkl fils from the previous run
    utils.clean_paths (listOfOutputPaths)
    spamReader = csv.reader(open(fFile, 'rb'), delimiter=',')
    ''' 1378625 rows typical '''
    ''' Take first row as the labels '''
    for row in spamReader:
        lsLabels = row
        break
    ''' Generated from _Analyze() '''
    # Columns to drop: addresses, free-text metadata, and the specially
    # handled 'datadate'/'tic' columns.
    lsBadLabels = set(['LOC', 'ADD4', 'ADD3', 'ADD2', 'ADD1', 'ACCTCHGQ', 'WEBURL', 'IDBFLAG', 'popsrc', 'DATACQTR', 'conm', 'COSTAT', 'FINALQ', 'fdateq', 'FAX', 'RP', 'PRIROW', 'dldte', 'indfmt', 'SPCSRC', 'BUSDESC', 'ipodate', 'PHONE', 'CURCDQ', 'pdateq', 'DATAFQTR', 'PRICAN', 'EIN', 'datadate', 'tic', 'ADDZIP', 'CONML', 'consol', 'datafmt', 'cusip', 'BSPRQ', 'OGMQ', 'COMPSTQ', 'COUNTY', 'STATE', 'CURNCDQ', 'CITY', 'rdq', 'apdedateq', 'STALTQ', 'INCORP'])
    ''' get list of stocks in 3 US indexes '''
    Access = da.DataAccess( 'Norgate' )
    setNyse = set( Access.get_symbols_in_sublist("/US/NYSE") )
    setNasdaq = set( Access.get_symbols_in_sublist("/US/NASDAQ") )
    setAmex = set( Access.get_symbols_in_sublist("/US/AMEX") )
    ''' If stock appears in more than one index, remove to avoid ambiguity '''
    print 'Ignoring duplicate stocks:',
    dup1 = setNyse.intersection( setNasdaq.union(setAmex))
    dup2 = setNasdaq.intersection( setAmex )
    print dup1.union(dup2)
    setNyse = setNyse - dup1.union(dup2)
    setAmex = setAmex - dup1.union(dup2)
    setNasdaq = setNasdaq - dup1.union(dup2)
    ''' Note the two lists below must be in the same order '''
    lsOutPaths = []
    lsOutPaths.append(rootdir + "/Processed/Compustat/US/AMEX/")
    lsOutPaths.append(rootdir + "/Processed/Compustat/US/NASDAQ/")
    lsOutPaths.append(rootdir + "/Processed/Compustat/US/NYSE/")
    # lSets[i] corresponds to lsOutPaths[i]
    lSets = [setAmex, setNasdaq, setNyse]
    #If the output paths don't exist, then create them...
    for path in lsOutPaths:
        if not (os.access(path, os.F_OK)):
            #Path does not exist, so create it
            os.makedirs(path) #Makes paths recursively
    #done making all output paths!
    #In case there are already some files there- remove them. This will remove all the pkl fils from the previous run
    utils.clean_paths (lsOutPaths)
    lDateCol = 0
    llUseCols = []
    ''' We have first row (the labels), loop through saving good label indicies '''
    for j, sElem in enumerate(lsLabels):
        if( sElem not in lsBadLabels ):
            llUseCols.append(j)
        ''' Keep track of ticker column and date, specially handled later '''
        if( sElem == 'datadate' ):
            lDateCol = j
        if( sElem == 'tic' ):
            # NOTE(review): lTicCol is only bound if a 'tic' label exists;
            # row[lTicCol] below would raise NameError otherwise — confirm
            # the CSV always has this column.
            lTicCol = j
    ''' Dict of ticker->numpy array mapping '''
    dData = dict()
    print ''
    ''' Main loop, iterate over the rows in the csv file '''
    for j, row in enumerate(spamReader):
        # NOTE(review): overwriting lsLabels each row looks vestigial;
        # lsLabels is not read again after this point.
        lsLabels = row
        sTic = row[lTicCol]
        ''' Find out what index this belongs to '''
        lIndex = -1
        for i, symSet in enumerate( lSets ):
            if sTic in symSet:
                lIndex = i
                break
        if lIndex == -1:
            # Ticker not in any of the three (de-duplicated) exchanges: skip
            continue
        sFilename = lsOutPaths[lIndex] + sTic + '.pkl'
        ''' If the file exists (temporary memory saving measure), read it in and delete file from disk '''
        if( os.path.isfile(sFilename)):
            if dData.has_key(sTic):
                print 'File should not be both on disk and in dict'
                sys.exit("FAILURE")
            fIn = open( sFilename, 'rb' )
            dData[sTic] = pkl.load( fIn )
            fIn.close()
            os.remove( sFilename )
        # Date becomes a float of the form YYYYMMDD in column 0
        fDate = float( dt.datetime.strptime( row[lDateCol], "%m/%d/%Y").strftime("%Y%m%d") )
        ''' convert blanks to nans '''
        for i in llUseCols:
            if row[i] == '':
                row[i] = 'nan'
        ''' Add row if data exists, if not, create new array '''
        if dData.has_key(sTic):
            dData[sTic] = np.vstack( (dData[sTic], np.array([fDate] + [row[i] for i in llUseCols], dtype=np.float)) )
        else:
            dData[sTic]= np.array( [fDate] + [row[i] for i in llUseCols], dtype=np.float )
        # Progress indicator roughly every 1000 rows
        if( (j+1) % 1000 == 0):
            fDone = (j / 1378625.0) * 100
            print '\rApprox %.2lf%%'%((j / 1378625.0) * 100),
        # Spill accumulated arrays to disk every lSaveMem rows
        if( (j+1) % lSaveMem == 0):
            ''' Write all the pickle files we currently have '''
            print '\nWriting %i lines to pickle files to save memory\n'%(lSaveMem)
            _dumpFiles( dData, lSets, lsOutPaths)
            ''' Remember to delete! '''
            del dData
            dData = dict()
        # Done writing files
    # Done with main loop
    print ''
    print 'Writing final pickle files\n'
    _dumpFiles( dData, lSets, lsOutPaths)
    del dData
    print "Finished..."+ str(time.strftime("%H:%M:%S"))
    return