Example #1
def get_data_for_exchange (exchange, data_path):
    
    #data_access= da.DataAccess('norgate')
    #symbol_list= data_access.get_all_symbols()
    data_path= data_path + "/Raw/Yahoo/US/" + str (exchange) + "/"
    
    #Create path if it doesn't exist
    if not (os.access(data_path, os.F_OK)):
        os.makedirs(data_path)
        
    utils.clean_paths(data_path)    
    
    symbol_list= list()
    
    print "Getting list of stocks.."
    
    try:
        nasdaq_params= urllib.urlencode ({'exchange':str(exchange), 'render':'download'})
        nasdaq_get= urllib2.urlopen ('http://www.nasdaq.com/screening/companies-by-name.aspx?%s' % nasdaq_params)
        symbol_list.append (nasdaq_get.readline()) #Now we have all the data in a list- but we need only the symbols so we remove the rest
        while (len (symbol_list[-1]) > 0):
            symbol_list.append (nasdaq_get.readline())
            #while ends
        symbol_list.pop(0) #This is just the word "symbol" and not a symbol itself
        symbol_list.pop(-1) #Remove the last element because it's going to be blank anyway
        #symbol_list = map(lambda x:(x.partition(str(","))[0]),symbol_list) #Get the stuff before the first comma- which is the symbol
        
        #Unfortunately this symbol is in quotes. So we have to remove them now
        symbol_list = map(lambda x:(x.partition(str("\""))[2]),symbol_list) #Keep the stuff only after the first "
        symbol_list = map(lambda x:(x.partition(str("\""))[0]),symbol_list) #Keep the stuff before the second "
        
    except urllib2.HTTPError:
        print "Unable to get list of stocks from server. Please check your internet connection and retry."
    except:
        print"Unknown error occoured when getting list of stocks from server."
    
    print "Got " + str (len(symbol_list)) + " symbols. Now getting symbol data..."
    
    _now = datetime.datetime.now()
    miss_ctr = 0 #Counts how many symbols we could not get
    for symbol in symbol_list:
        symbol_data=list()
        print "Getting " + str (symbol)
        
        try:
            params= urllib.urlencode ({'a':3, 'b':12, 'c':2000, 'd':_now.month, 'e':_now.day, 'f':_now.year, 's': str(symbol)})
            url_get= urllib2.urlopen("http://ichart.finance.yahoo.com/table.csv?%s" % params)
            
            header= url_get.readline()
            symbol_data.append (url_get.readline())
            while (len(symbol_data[-1]) > 0):
                symbol_data.append(url_get.readline())
#                print str(symbol_data[-1])
            
            symbol_data.pop(-1) #The last element is going to be the string of length zero. We don't want to write that to file.

            #To change adjusted close so that Yahoo data is same as Norgate data
            symbol_data= map (adjust, symbol_data)
            
            #Following changes so that the data looks like Norgate data and the change to csv_to_pkl.py is minimized
            symbol_data = map(lambda x:(x.replace("-", "")),symbol_data) 
            symbol_data = map(lambda x:(str(symbol) + "," + str(x)) ,symbol_data) #This means that the header is wrong but since it is ignored later anyways- this will work
            
            #now writing data to file
            f= open (data_path + symbol + ".csv", 'w')
            
            #Writing the header
            f.write (header)
            
            while (len(symbol_data) > 0):
                f.write (symbol_data.pop())
             
            f.close()

        except urllib2.HTTPError:
            #The original listing is truncated here; a minimal assumed handler:
            miss_ctr = miss_ctr + 1
            print "Unable to fetch data for " + str(symbol)
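The adjust() function mapped over symbol_data above is not included in this listing. A plausible sketch, assuming it rescales open/high/low/close by the ratio of adjusted close to close so the Yahoo rows line up with Norgate-style adjusted data:

def adjust(line):
    #Hypothetical reconstruction; the real adjust() is not shown on this page
    #A raw Yahoo CSV row is: Date,Open,High,Low,Close,Volume,Adj Close
    vals = line.strip().split(",")
    ratio = float(vals[6]) / float(vals[4]) #Adj Close / Close
    for i in (1, 2, 3, 4): #scale Open, High, Low and Close
        vals[i] = str(float(vals[i]) * ratio)
    return ",".join(vals) + "\n"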
Example #2
def main ():
    
    print("Starting..."+ str(time.strftime("%H:%M:%S")))
    
    try:
        rootdir = os.environ['QSDATA']
    except KeyError:
        #rootdir = "/hzr71/research/QSData"
        print("Please be sure to set the value for QSDATA in config.sh or local.sh\n")
        sys.exit("FAILURE") #rootdir is required below
    
    fileExtensionToRemove = ".csv"
    
    listOfInputPaths= list()

    
    #For Gekko
    rootindir = rootdir + "/Raw/Norgate/Stocks/CSV/US"
    listOfInputPaths.append (rootindir + "/AMEX/")
    listOfInputPaths.append (rootindir + "/Delisted Securities/")
    listOfInputPaths.append (rootindir + "/NASDAQ/")
    listOfInputPaths.append (rootindir + "/NYSE/")
    listOfInputPaths.append (rootindir + "/NYSE Arca/")
    listOfInputPaths.append (rootindir + "/OTC/")
    listOfInputPaths.append (rootindir + "/Indices/ADRs/")
    listOfInputPaths.append (rootindir + "/Indices/AMEX/")
    listOfInputPaths.append (rootindir + "/Indices/CBOE/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones Americas/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones Averages/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones Broad Market/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones Misc/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones Style/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones US Industries/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones US Sectors/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones US Subsectors/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones US Supersectors/")
    listOfInputPaths.append (rootindir + "/Indices/ISE/")
    listOfInputPaths.append (rootindir + "/Indices/Merrill Lynch/")
    listOfInputPaths.append (rootindir + "/Indices/Misc/")
    listOfInputPaths.append (rootindir + "/Indices/Morgan Stanley/")
    listOfInputPaths.append (rootindir + "/Indices/NASDAQ/")
    listOfInputPaths.append (rootindir + "/Indices/NYSE/")
    listOfInputPaths.append (rootindir + "/Indices/NYSE Arca/")
    listOfInputPaths.append (rootindir + "/Indices/PHLX/")
    listOfInputPaths.append (rootindir + "/Indices/Russell/")
    listOfInputPaths.append (rootindir + "/Indices/S&P/")
    listOfInputPaths.append (rootindir + "/Indices/S&P 500 Industries/")
    listOfInputPaths.append (rootindir + "/Indices/S&P 500 Industry Groups/")
    listOfInputPaths.append (rootindir + "/Indices/S&P 500 Sectors/")
    listOfInputPaths.append (rootindir + "/Indices/S&P 500 Sub-Industries/")
    listOfInputPaths.append (rootindir + "/Indices/S&P Technology/")
    listOfInputPaths.append (rootindir + "/Indices/S&P TSX/")
    listOfInputPaths.append (rootindir + "/Indices/Wilshire/")

    rootoutdir = rootdir + "/Processed/Norgate/Stocks/US"
    listOfOutputPaths= list()
    listOfOutputPaths.append(rootoutdir + "/AMEX/")
    listOfOutputPaths.append(rootoutdir + "/Delisted Securities/")
    listOfOutputPaths.append(rootoutdir + "/NASDAQ/")
    listOfOutputPaths.append(rootoutdir + "/NYSE/")
    listOfOutputPaths.append(rootoutdir + "/NYSE Arca/")
    listOfOutputPaths.append(rootoutdir + "/OTC/")    
    #All 29 index input paths above map to the single Indices output directory
    for _ in range(29):
        listOfOutputPaths.append(rootoutdir + "/Indices/")
    
    #If the output paths don't exist, then create them...
    for path in listOfOutputPaths:
        if not (os.access(path, os.F_OK)):
            #Path does not exist, so create it
            os.makedirs(path)
    #done making all output paths!
    
    #In case there are already some files there- remove them. This will remove all the pkl files from the previous run
    utils.clean_paths (listOfOutputPaths)
    
    if (len(listOfInputPaths)!= len(listOfOutputPaths)):
        print("No. of input paths not equal to the number of output paths.. quitting")
        sys.exit("FAILURE")
        #if ends
    
    path_ctr = -1
    use_cols = list(range(1, 7 + 1)) # will now use cols 1 to 7
    for path in listOfInputPaths:
        path_ctr = path_ctr + 1
        stocks_at_this_path = dircache.listdir(str(path))
        #Next, throw away everything that is not a .csv; these are our stocks. For example, this should throw away the '$' folder in the NYSE folder
        filtered_names= [x for x in stocks_at_this_path if (str(x).find(str(fileExtensionToRemove)) > -1)]
        #Now, we remove the .csv to get the name of the stock
        filtered_names = [(x.partition(str(fileExtensionToRemove))[0]) for x in filtered_names]
        stock_ctr = -1
        for stock in filtered_names:
            stock_ctr = stock_ctr + 1
            print("csv_to_pkl: processing: " + str (path + stock))
            #read in the stock data from the CSV file
            stock_data = np.loadtxt(path + stock + ".csv", dtype=np.float, comments=None, delimiter=",", skiprows=1, usecols=use_cols)
            stock_data_shape = stock_data.shape
            #print "stock_data_shape is: " + str(stock_data_shape)
            f = open (listOfOutputPaths[path_ctr] + filtered_names[stock_ctr] + ".pkl", "wb" )
            pkl.dump (stock_data, f, -1)
            f.close()
        #for stock in stocks_at_this_path ends
    #for path in listOfInputPaths ends
    print("Finished..."+ str(time.strftime("%H:%M:%S")))
Example #3
def convert ():
    '''
    @summary: Converts a Compustat CSV file to pickle files of numpy arrays.
    '''
    
    print("Starting..."+ str(time.strftime("%H:%M:%S")))
    
    ''' Write every so often to save memory, 20k lines is usually < .5GB '''
    lSaveMem = 20000
    
    try:
        rootdir = os.environ['QSDATA']
    except KeyError:
        print("Please be sure to set the value for QSDATA in config.sh or local.sh\n")
        sys.exit("FAILURE") #rootdir is required below
    
    ''' Create lists of input and output paths '''
    fFile = ( rootdir + "/Raw/Compustat/Compustat.csv")

    listOfOutputPaths= []
    listOfOutputPaths.append(rootdir + "/Processed/Compustat/US/AMEX/")
    listOfOutputPaths.append(rootdir + "/Processed/Compustat/US/NASDAQ/")
    listOfOutputPaths.append(rootdir + "/Processed/Compustat/US/NYSE/")    
    
    #If the output paths don't exist, then create them...
    for path in listOfOutputPaths:
        if not (os.access(path, os.F_OK)):
            #Path does not exist, so create it
            os.makedirs(path) #Makes paths recursively
    #done making all output paths!
    
    #In case there are already some files there- remove them. This will remove all the pkl files from the previous run
    utils.clean_paths (listOfOutputPaths)
      
    spamReader = csv.reader(open(fFile, 'r'), delimiter=',')
    ''' 1378625 rows typical '''    
    
    ''' Take first row as the labels '''
    for row in spamReader:
        lsLabels = row
        break
   
    ''' Generated from _Analyze() '''
    lsBadLabels = set(['LOC', 'ADD4', 'ADD3', 'ADD2', 'ADD1', 'ACCTCHGQ', 'WEBURL', 'IDBFLAG', 'popsrc', 'DATACQTR', 'conm', 'COSTAT', 'FINALQ', 'fdateq', 'FAX', 'RP', 'PRIROW', 'dldte', 'indfmt', 'SPCSRC', 'BUSDESC', 'ipodate', 'PHONE', 'CURCDQ', 'pdateq', 'DATAFQTR', 'PRICAN', 'EIN', 'datadate', 'tic', 'ADDZIP', 'CONML', 'consol', 'datafmt', 'cusip', 'BSPRQ', 'OGMQ', 'COMPSTQ', 'COUNTY', 'STATE', 'CURNCDQ', 'CITY', 'rdq', 'apdedateq', 'STALTQ', 'INCORP'])

    ''' get list of stocks in 3 US indexes '''
    Access = da.DataAccess( 'Norgate' )
    setNyse = set( Access.get_symbols_in_sublist("/US/NYSE") )
    setNasdaq = set( Access.get_symbols_in_sublist("/US/NASDAQ") )
    setAmex = set( Access.get_symbols_in_sublist("/US/AMEX") )
    
    ''' If stock appears in more than one index, remove to avoid ambiguity '''
    print('Ignoring duplicate stocks:', end=' ')
    dup1 =  setNyse.intersection( setNasdaq.union(setAmex))
    dup2 =  setNasdaq.intersection( setAmex )
    print(dup1.union(dup2))

    setNyse   = setNyse - dup1.union(dup2)
    setAmex   = setAmex - dup1.union(dup2)
    setNasdaq = setNasdaq - dup1.union(dup2)
    
    ''' Note the two lists below must be in the same order '''
    lsOutPaths = []
    lsOutPaths.append(rootdir + "/Processed/Compustat/US/AMEX/")
    lsOutPaths.append(rootdir + "/Processed/Compustat/US/NASDAQ/")
    lsOutPaths.append(rootdir + "/Processed/Compustat/US/NYSE/")  
    
    lSets = [setAmex, setNasdaq, setNyse]  
    
    #If the output paths don't exist, then create them...
    for path in lsOutPaths:
        if not (os.access(path, os.F_OK)):
            #Path does not exist, so create it
            os.makedirs(path) #Makes paths recursively
    #done making all output paths!
    
    #In case there are already some files there- remove them. This will remove all the pkl fils from the previous run
    utils.clean_paths (lsOutPaths)
        
    lDateCol = 0
    llUseCols = []

    
    ''' We have first row (the labels), loop through saving good label indices '''
    for j, sElem in enumerate(lsLabels):
        if( sElem not in lsBadLabels ):
            llUseCols.append(j)
        
        ''' Keep track of ticker column and date, specially handled later '''
        if( sElem == 'datadate' ):
            lDateCol = j 
        if( sElem == 'tic' ):
            lTicCol = j 
        
        
    ''' Dict of ticker->numpy array mapping '''
    dData = dict()
    print('')
    
    
    ''' Main loop, iterate over the rows in the csv file '''
    for j, row in enumerate(spamReader):
        lsLabels = row
        sTic = row[lTicCol]
        
        ''' Find out what index this belongs to '''
        lIndex = -1
        for i, symSet in enumerate( lSets ):
            if sTic in symSet:
                lIndex = i
                break
        if lIndex == -1:
            continue
        
        sFilename = lsOutPaths[lIndex] + sTic + '.pkl'
        
        ''' If the file exists (temporary memory saving measure), read it in and delete file from disk '''
        if( os.path.isfile(sFilename)):
            if sTic in dData:
               print('File should not be both on disk and in dict')
               sys.exit("FAILURE")
            
            fIn = open( sFilename, 'rb' )
            dData[sTic] = pkl.load( fIn )
            fIn.close()
            os.remove( sFilename )
            
        fDate = float( dt.datetime.strptime( row[lDateCol], "%m/%d/%Y").strftime("%Y%m%d") )
        
        ''' convert blanks to nans '''
        for i in llUseCols:
            if row[i] == '':
                row[i] = 'nan'
        
        ''' Add row if data exists, if not, create new array '''
        if sTic in dData:       
            dData[sTic] = np.vstack( (dData[sTic], np.array([fDate] + [row[i] for i in llUseCols], dtype=np.float)) )
        else:
            dData[sTic]= np.array( [fDate] + [row[i] for i in llUseCols], dtype=np.float )
        
        if( (j+1) % 1000 == 0):
            fDone = (j / 1378625.0) * 100
            print('\rApprox %.2lf%%' % fDone, end=' ')
            
        if( (j+1) % lSaveMem == 0):
            ''' Write all the pickle files we currently have '''
            
            print('\nWriting %i lines to pickle files to save memory\n'%(lSaveMem))
            _dumpFiles( dData, lSets, lsOutPaths)
            ''' Remember to delete! '''
            del dData
            dData = dict()
            
        # Done writing files
    # Done with main loop

    print('')
    print('Writing final pickle files\n')       
    _dumpFiles( dData, lSets, lsOutPaths)
    del dData
    
    print("Finished..."+ str(time.strftime("%H:%M:%S")))
    
    return
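_dumpFiles() is called by convert() but not included in this listing. A plausible sketch, assuming it writes each ticker's accumulated array to the output directory of whichever exchange set contains it:

def _dumpFiles(dData, lSets, lsOutPaths):
    #Hypothetical reconstruction; the real _dumpFiles is not shown on this page
    for sTic, naData in dData.items():
        for i, symSet in enumerate(lSets):
            if sTic in symSet:
                fOut = open(lsOutPaths[i] + sTic + '.pkl', 'wb')
                pkl.dump(naData, fOut, -1)
                fOut.close()
                break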
Example #4
def main ():
    
    print "Starting..."+ str(time.strftime("%H:%M:%S"))
    
    try:
        rootdir = os.environ['QSDATA']
    except KeyError:
        #rootdir = "/hzr71/research/QSData"
        print "Please be sure to set the value for QSDATA in config.sh or local.sh\n"
        sys.exit("FAILURE") #rootdir is required below
    
    fileExtensionToRemove = ".csv"
    
    listOfInputPaths= list()

    
    #For Gekko
    rootindir = rootdir + "/Raw/Norgate/Stocks/CSV/US"
    listOfInputPaths.append (rootindir + "/AMEX/")
    listOfInputPaths.append (rootindir + "/Delisted Securities/")
    listOfInputPaths.append (rootindir + "/NASDAQ/")
    listOfInputPaths.append (rootindir + "/NYSE/")
    listOfInputPaths.append (rootindir + "/NYSE Arca/")
    listOfInputPaths.append (rootindir + "/OTC/")
    listOfInputPaths.append (rootindir + "/Indices/ADRs/")
    listOfInputPaths.append (rootindir + "/Indices/AMEX/")
    listOfInputPaths.append (rootindir + "/Indices/CBOE/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones Americas/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones Averages/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones Broad Market/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones Misc/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones Style/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones US Industries/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones US Sectors/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones US Subsectors/")
    listOfInputPaths.append (rootindir + "/Indices/Dow Jones US Supersectors/")
    listOfInputPaths.append (rootindir + "/Indices/ISE/")
    listOfInputPaths.append (rootindir + "/Indices/Merrill Lynch/")
    listOfInputPaths.append (rootindir + "/Indices/Misc/")
    listOfInputPaths.append (rootindir + "/Indices/Morgan Stanley/")
    listOfInputPaths.append (rootindir + "/Indices/NASDAQ/")
    listOfInputPaths.append (rootindir + "/Indices/NYSE/")
    listOfInputPaths.append (rootindir + "/Indices/NYSE Arca/")
    listOfInputPaths.append (rootindir + "/Indices/PHLX/")
    listOfInputPaths.append (rootindir + "/Indices/Russell/")
    listOfInputPaths.append (rootindir + "/Indices/S&P/")
    listOfInputPaths.append (rootindir + "/Indices/S&P 500 Industries/")
    listOfInputPaths.append (rootindir + "/Indices/S&P 500 Industry Groups/")
    listOfInputPaths.append (rootindir + "/Indices/S&P 500 Sectors/")
    listOfInputPaths.append (rootindir + "/Indices/S&P 500 Sub-Industries/")
    listOfInputPaths.append (rootindir + "/Indices/S&P Technology/")
    listOfInputPaths.append (rootindir + "/Indices/S&P TSX/")
    listOfInputPaths.append (rootindir + "/Indices/Wilshire/")

    rootoutdir = rootdir + "/Processed/Norgate/Stocks/US"
    listOfOutputPaths= list()
    listOfOutputPaths.append(rootoutdir + "/AMEX/")
    listOfOutputPaths.append(rootoutdir + "/Delisted Securities/")
    listOfOutputPaths.append(rootoutdir + "/NASDAQ/")
    listOfOutputPaths.append(rootoutdir + "/NYSE/")
    listOfOutputPaths.append(rootoutdir + "/NYSE Arca/")
    listOfOutputPaths.append(rootoutdir + "/OTC/")    
    #All 29 index input paths above map to the single Indices output directory
    for _ in range(29):
        listOfOutputPaths.append(rootoutdir + "/Indices/")
    
    #If the output paths don't exist, then create them...
    for path in listOfOutputPaths:
        if not (os.access(path, os.F_OK)):
            #Path does not exist, so create it
            os.makedirs(path)
    #done making all output paths!
    
    #In case there are already some files there- remove them. This will remove all the pkl files from the previous run
    utils.clean_paths (listOfOutputPaths)
    
    if (len(listOfInputPaths)!= len(listOfOutputPaths)):
        print "No. of input paths not equal to the number of output paths.. quitting"
        sys.exit("FAILURE")
        #if ends
    
    path_ctr = -1
    use_cols = range (1, 7 + 1) # will now use cols 1 to 7
    for path in listOfInputPaths:
        path_ctr = path_ctr + 1
        stocks_at_this_path = dircache.listdir(str(path))
        #Next, throw away everything that is not a .csv; these are our stocks. For example, this should throw away the '$' folder in the NYSE folder
        filtered_names= filter (lambda x:(str(x).find(str(fileExtensionToRemove)) > -1), stocks_at_this_path)
        #Now, we remove the .csv to get the name of the stock
        filtered_names = map(lambda x:(x.partition(str(fileExtensionToRemove))[0]),filtered_names)
        stock_ctr = -1
        for stock in filtered_names:
            stock_ctr = stock_ctr + 1
            print "csv_to_pkl: processing: " + str (path + stock)
            #read in the stock data from the CSV file
            stock_data = np.loadtxt(path + stock + ".csv", dtype=np.float, comments=None, delimiter=",", skiprows=1, usecols=use_cols)
            stock_data_shape = stock_data.shape
            #print "stock_data_shape is: " + str(stock_data_shape)
            f = open (listOfOutputPaths[path_ctr] + filtered_names[stock_ctr] + ".pkl", "wb" )
            pkl.dump (stock_data, f, -1)
            f.close()
        #for stock in stocks_at_this_path ends
    #for path in listOfInputPaths ends
    print "Finished..."+ str(time.strftime("%H:%M:%S"))
Example #5
def get_data_for_exchange(exchange, data_path):

    # data_access= da.DataAccess('norgate')
    # symbol_list= data_access.get_all_symbols()
    data_path = data_path + "/Raw/Yahoo/US/" + str(exchange) + "/"

    # Create path if it doesn't exist
    if not (os.access(data_path, os.F_OK)):
        os.makedirs(data_path)

    utils.clean_paths(data_path)

    symbol_list = list()

    print "Getting list of stocks.."

    try:
        nasdaq_params = urllib.urlencode({"exchange": str(exchange), "render": "download"})
        nasdaq_get = urllib2.urlopen("http://www.nasdaq.com/screening/companies-by-name.aspx?%s" % nasdaq_params)
        symbol_list.append(
            nasdaq_get.readline()
        )  # Now we have all the data in a list- but we need only the symbols so we remove the rest
        while len(symbol_list[-1]) > 0:
            symbol_list.append(nasdaq_get.readline())
            # while ends
        symbol_list.pop(0)  # This is just the word "symbol" and not a symbol itself
        symbol_list.pop(-1)  # Remove the last element because it's going to be blank anyway
        # symbol_list = map(lambda x:(x.partition(str(","))[0]),symbol_list) #Get the stuff before the first comma- which is the symbol

        # Unfortunately this symbol is in quotes. So we have to remove them now
        symbol_list = map(lambda x: (x.partition(str('"'))[2]), symbol_list)  # Keep the stuff only after the first "
        symbol_list = map(lambda x: (x.partition(str('"'))[0]), symbol_list)  # Keep the stuff before the second "

    except urllib2.HTTPError:
        print "Unable to get list of stocks from server. Please check your internet connection and retry."
    except:
        print "Unknown error occoured when getting list of stocks from server."

    print "Got " + str(len(symbol_list)) + " symbols. Now getting symbol data..."

    _now = datetime.datetime.now()
    miss_ctr = 0  # Counts how many symbols we could not get
    for symbol in symbol_list:
        symbol_data = list()
        print "Getting " + str(symbol)

        try:
            params = urllib.urlencode(
                {"a": 03, "b": 12, "c": 2000, "d": _now.month, "e": _now.day, "f": _now.year, "s": str(symbol)}
            )
            url_get = urllib2.urlopen("http://ichart.finance.yahoo.com/table.csv?%s" % params)

            header = url_get.readline()
            symbol_data.append(url_get.readline())
            while len(symbol_data[-1]) > 0:
                symbol_data.append(url_get.readline())
            #                print str(symbol_data[-1])

            symbol_data.pop(
                -1
            )  # The last element is going to be the string of length zero. We don't want to write that to file.

            # To change adjusted close so that Yahoo data is same as Norgate data
            symbol_data = map(adjust, symbol_data)

            # Following changes so that the data looks like Norgate data and the change to csv_to_pkl.py is minimized
            symbol_data = map(lambda x: (x.replace("-", "")), symbol_data)
            symbol_data = map(
                lambda x: (str(symbol) + "," + str(x)), symbol_data
            )  # This means that the header is wrong but since it is ignored later anyways- this will work

            # now writing data to file
            f = open(data_path + symbol + ".csv", "w")

            # Writing the header
            f.write(header)

            while len(symbol_data) > 0:
                f.write(symbol_data.pop())

            f.close()

        except urllib2.HTTPError:
            # The original listing is truncated here; a minimal assumed handler:
            miss_ctr = miss_ctr + 1
            print "Unable to fetch data for " + str(symbol)
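Neither listing shows how get_data_for_exchange is driven. A hypothetical entry point, assuming QSDATA names the data root as in the other examples on this page:

if __name__ == '__main__':
    #Hypothetical driver; the exchange names match the Raw/Yahoo/US
    #subdirectories used elsewhere on this page
    root = os.environ['QSDATA']
    for exchange in ('AMEX', 'NASDAQ', 'NYSE'):
        get_data_for_exchange(exchange, root)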
Example #6
def main ():
    
    print "Starting..."+ str(time.strftime("%H:%M:%S"))
    
    try:
        rootdir = os.environ['QSDATA']
    except KeyError:
        #rootdir = "/hzr71/research/QSData"
        print "Please be sure to set the value for QSDATA in config.sh or local.sh\n"
        sys.exit("FAILURE") #rootdir is required below
    
    fileExtensionToRemove = ".csv"
    
    listOfInputPaths= list()

    
    #For Gekko
    #listOfInputPaths.append("/hzr71/research/QSData/Processed/Norgate/raw/Delisted Securities/US Recent/")
    listOfInputPaths.append (rootdir + "/Raw/Yahoo/US/AMEX/")
    listOfInputPaths.append (rootdir + "/Raw/Yahoo/US/NASDAQ/")
    listOfInputPaths.append (rootdir + "/Raw/Yahoo/US/NYSE/")
    
    
    
    listOfOutputPaths= list()
#    listOfOutputPaths.append("C:\\test\\temp\\pkl1\\")
#    listOfOutputPaths.append("C:\\test\\temp\\pkl2\\")    
    
    #listOfOutputPaths.append(rootdir + "/Norgate/Delisted Securities/US Recent/")
    listOfOutputPaths.append(rootdir + "/Processed/Yahoo/US/AMEX/")
    listOfOutputPaths.append(rootdir + "/Processed/Yahoo/US/NASDAQ/")
    listOfOutputPaths.append(rootdir + "/Processed/Yahoo/US/NYSE/")    
    
    #If the output paths don't exist, then create them...
    for path in listOfOutputPaths:
        if not (os.access(path, os.F_OK)):
            #Path does not exist, so create it
            os.makedirs(path) #Makes paths recursively
    #done making all output paths!
    
    #In case there are already some files there- remove them. This will remove all the pkl files from the previous run
    utils.clean_paths (listOfOutputPaths)
    
    
    if (len(listOfInputPaths)!= len(listOfOutputPaths)):
        print "No. of input paths not equal to the number of output paths.. quitting"
        sys.exit("FAILURE")
        #if ends
    
    path_ctr = -1
    use_cols = range (1, 7 + 1) # will now use cols 1 to 7
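    #For the Yahoo files written by get_data_for_exchange earlier on this page,
    #a row is Symbol,Date,Open,High,Low,Close,Volume,Adj Close, so cols 1 to 7
    #are Date (as YYYYMMDD) through Adj Close (inferred from the writer code,
    #not stated in the original)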
    for path in listOfInputPaths:
        path_ctr = path_ctr + 1
        stocks_at_this_path = dircache.listdir(str(path))
        #Next, throw away everything that is not a .csv; these are our stocks. For example, this should throw away the '$' folder in the NYSE folder
        filtered_names= filter (lambda x:(str(x).find(str(fileExtensionToRemove)) > -1), stocks_at_this_path)
        #Now, we remove the .csv to get the name of the stock
        filtered_names = map(lambda x:(x.partition(str(fileExtensionToRemove))[0]),filtered_names)
        stock_ctr = -1
        for stock in filtered_names:
            stock_ctr = stock_ctr + 1
            print "Reading file: " + str (path + stock)
            #read in the stock data from the CSV file
            stock_data = np.loadtxt(path + stock + ".csv", dtype=np.float, comments=None, delimiter=",", skiprows=1, usecols=use_cols)
            
            stock_data_shape = stock_data.shape
            #print "stock_data_shape is: " + str(stock_data_shape)
        
            
#            for i in range (0, stock_data_shape[0]):
#                print stock_data [i,: ]
            
#            print "Reading: " + str(stock)
            f = open (listOfOutputPaths[path_ctr] + filtered_names[stock_ctr] + ".pkl", "wb" )
            pkl.dump (stock_data, f, -1)
            f.close()
        #for stock in stocks_at_this_path ends
    #for path in listOfInputPaths ends
    print "Finished..."+ str(time.strftime("%H:%M:%S"))
Example #7
def main():

    print "Starting..." + str(time.strftime("%H:%M:%S"))

    try:
        rootdir = os.environ['QSDATA']
    except KeyError:
        #rootdir = "/hzr71/research/QSData"
        print "Please be sure to set the value for QSDATA in config.sh or local.sh\n"
        sys.exit("FAILURE") #rootdir is required below

    fileExtensionToRemove = ".csv"

    listOfInputPaths = list()

    #For Gekko
    #listOfInputPaths.append("/hzr71/research/QSData/Processed/Norgate/raw/Delisted Securities/US Recent/")
    listOfInputPaths.append(rootdir + "/Raw/Yahoo/US/AMEX/")
    listOfInputPaths.append(rootdir + "/Raw/Yahoo/US/NASDAQ/")
    listOfInputPaths.append(rootdir + "/Raw/Yahoo/US/NYSE/")

    listOfOutputPaths = list()
    #    listOfOutputPaths.append("C:\\test\\temp\\pkl1\\")
    #    listOfOutputPaths.append("C:\\test\\temp\\pkl2\\")

    #listOfOutputPaths.append(rootdir + "/Norgate/Delisted Securities/US Recent/")
    listOfOutputPaths.append(rootdir + "/Processed/Yahoo/US/AMEX/")
    listOfOutputPaths.append(rootdir + "/Processed/Yahoo/US/NASDAQ/")
    listOfOutputPaths.append(rootdir + "/Processed/Yahoo/US/NYSE/")

    #If the output paths don't exist, then create them...
    for path in listOfOutputPaths:
        if not (os.access(path, os.F_OK)):
            #Path does not exist, so create it
            os.makedirs(path)  #Makes paths recursively
    #done making all output paths!

    #In case there are already some files there- remove them. This will remove all the pkl files from the previous run
    utils.clean_paths(listOfOutputPaths)

    if (len(listOfInputPaths) != len(listOfOutputPaths)):
        print "No. of input paths not equal to the number of output paths.. quitting"
        sys.exit("FAILURE")
        #if ends

    path_ctr = -1
    use_cols = range(1, 7 + 1)  # will now use cols 1 to 7
    for path in listOfInputPaths:
        path_ctr = path_ctr + 1
        stocks_at_this_path = dircache.listdir(str(path))
        #Next, throw away everything that is not a .csv; these are our stocks. For example, this should throw away the '$' folder in the NYSE folder
        filtered_names = filter(
            lambda x: (str(x).find(str(fileExtensionToRemove)) > -1),
            stocks_at_this_path)
        #Now, we remove the .csv to get the name of the stock
        filtered_names = map(
            lambda x: (x.partition(str(fileExtensionToRemove))[0]),
            filtered_names)
        stock_ctr = -1
        for stock in filtered_names:
            stock_ctr = stock_ctr + 1
            print "Reading file: " + str(path + stock)
            #read in the stock data from the CSV file
            stock_data = np.loadtxt(path + stock + ".csv", dtype=np.float, comments=None,
                                    delimiter=",", skiprows=1, usecols=use_cols)

            stock_data_shape = stock_data.shape
            #print "stock_data_shape is: " + str(stock_data_shape)

            #            for i in range (0, stock_data_shape[0]):
            #                print stock_data [i,: ]

            #            print "Reading: " + str(stock)
            f = open(
                listOfOutputPaths[path_ctr] + filtered_names[stock_ctr] +
                ".pkl", "wb")
            pkl.dump(stock_data, f, -1)
            f.close()
        #for stock in stocks_at_this_path ends
    #for path in listOfInputPaths ends
    print "Finished..." + str(time.strftime("%H:%M:%S"))
Example #8
def convert ():
    '''
    @summary: Converts a Compustat CSV file to pickle files of numpy arrays.
    '''
    
    print "Starting..."+ str(time.strftime("%H:%M:%S"))
    
    ''' Write every so often to save memory, 20k lines is usually < .5GB '''
    lSaveMem = 20000
    
    try:
        rootdir = os.environ['QSDATA']
    except KeyError:
        print "Please be sure to set the value for QSDATA in config.sh or local.sh\n"
        sys.exit("FAILURE") #rootdir is required below
    
    ''' Create lists of input and output paths '''
    fFile = ( rootdir + "/Raw/Compustat/Compustat.csv")

    listOfOutputPaths= []
    listOfOutputPaths.append(rootdir + "/Processed/Compustat/US/AMEX/")
    listOfOutputPaths.append(rootdir + "/Processed/Compustat/US/NASDAQ/")
    listOfOutputPaths.append(rootdir + "/Processed/Compustat/US/NYSE/")    
    
    #If the output paths don't exist, then create them...
    for path in listOfOutputPaths:
        if not (os.access(path, os.F_OK)):
            #Path does not exist, so create it
            os.makedirs(path) #Makes paths recursively
    #done making all output paths!
    
    #In case there are already some files there- remove them. This will remove all the pkl files from the previous run
    utils.clean_paths (listOfOutputPaths)
      
    spamReader = csv.reader(open(fFile, 'rb'), delimiter=',')
    ''' 1378625 rows typical '''    
    
    ''' Take first row as the labels '''
    for row in spamReader:
        lsLabels = row
        break
   
    ''' Generated from _Analyze() '''
    lsBadLabels = set(['LOC', 'ADD4', 'ADD3', 'ADD2', 'ADD1', 'ACCTCHGQ', 'WEBURL', 'IDBFLAG', 'popsrc', 'DATACQTR', 'conm', 'COSTAT', 'FINALQ', 'fdateq', 'FAX', 'RP', 'PRIROW', 'dldte', 'indfmt', 'SPCSRC', 'BUSDESC', 'ipodate', 'PHONE', 'CURCDQ', 'pdateq', 'DATAFQTR', 'PRICAN', 'EIN', 'datadate', 'tic', 'ADDZIP', 'CONML', 'consol', 'datafmt', 'cusip', 'BSPRQ', 'OGMQ', 'COMPSTQ', 'COUNTY', 'STATE', 'CURNCDQ', 'CITY', 'rdq', 'apdedateq', 'STALTQ', 'INCORP'])

    ''' get list of stocks in 3 US indexes '''
    Access = da.DataAccess( 'Norgate' )
    setNyse = set( Access.get_symbols_in_sublist("/US/NYSE") )
    setNasdaq = set( Access.get_symbols_in_sublist("/US/NASDAQ") )
    setAmex = set( Access.get_symbols_in_sublist("/US/AMEX") )
    
    ''' If stock appears in more than one index, remove to avoid ambiguity '''
    print 'Ignoring duplicate stocks:',
    dup1 =  setNyse.intersection( setNasdaq.union(setAmex))
    dup2 =  setNasdaq.intersection( setAmex )
    print dup1.union(dup2)

    setNyse   = setNyse - dup1.union(dup2)
    setAmex   = setAmex - dup1.union(dup2)
    setNasdaq = setNasdaq - dup1.union(dup2)
    
    ''' Note the two lists below must be in the same order '''
    lsOutPaths = []
    lsOutPaths.append(rootdir + "/Processed/Compustat/US/AMEX/")
    lsOutPaths.append(rootdir + "/Processed/Compustat/US/NASDAQ/")
    lsOutPaths.append(rootdir + "/Processed/Compustat/US/NYSE/")  
    
    lSets = [setAmex, setNasdaq, setNyse]  
    
    #If the output paths don't exist, then create them...
    for path in lsOutPaths:
        if not (os.access(path, os.F_OK)):
            #Path does not exist, so create it
            os.makedirs(path) #Makes paths recursively
    #done making all output paths!
    
    #In case there are already some files there- remove them. This will remove all the pkl files from the previous run
    utils.clean_paths (lsOutPaths)
        
    lDateCol = 0
    llUseCols = []

    
    ''' We have first row (the labels), loop through saving good label indices '''
    for j, sElem in enumerate(lsLabels):
        if( sElem not in lsBadLabels ):
            llUseCols.append(j)
        
        ''' Keep track of ticker column and date, specially handled later '''
        if( sElem == 'datadate' ):
            lDateCol = j 
        if( sElem == 'tic' ):
            lTicCol = j 
        
        
    ''' Dict of ticker->numpy array mapping '''
    dData = dict()
    print ''
    
    
    ''' Main loop, iterate over the rows in the csv file '''
    for j, row in enumerate(spamReader):
        lsLabels = row
        sTic = row[lTicCol]
        
        ''' Find out what index this belongs to '''
        lIndex = -1
        for i, symSet in enumerate( lSets ):
            if sTic in symSet:
                lIndex = i
                break
        if lIndex == -1:
            continue
        
        sFilename = lsOutPaths[lIndex] + sTic + '.pkl'
        
        ''' If the file exists (temporary memory saving measure), read it in and delete file from disk '''
        if( os.path.isfile(sFilename)):
            if dData.has_key(sTic):
               print 'File should not be both on disk and in dict'
               sys.exit("FAILURE")
            
            fIn = open( sFilename, 'rb' )
            dData[sTic] = pkl.load( fIn )
            fIn.close()
            os.remove( sFilename )
            
        fDate = float( dt.datetime.strptime( row[lDateCol], "%m/%d/%Y").strftime("%Y%m%d") )
        
        ''' convert blanks to nans '''
        for i in llUseCols:
            if row[i] == '':
                row[i] = 'nan'
        
        ''' Add row if data exists, if not, create new array '''
        if dData.has_key(sTic):       
            dData[sTic] = np.vstack( (dData[sTic], np.array([fDate] + [row[i] for i in llUseCols], dtype=np.float)) )
        else:
            dData[sTic]= np.array( [fDate] + [row[i] for i in llUseCols], dtype=np.float )
        
        if( (j+1) % 1000 == 0):
            fDone = (j / 1378625.0) * 100
            print '\rApprox %.2lf%%' % fDone,
            
        if( (j+1) % lSaveMem == 0):
            ''' Write all the pickle files we currently have '''
            
            print '\nWriting %i lines to pickle files to save memory\n'%(lSaveMem)
            _dumpFiles( dData, lSets, lsOutPaths)
            ''' Remember to delete! '''
            del dData
            dData = dict()
            
        # Done writing files
    # Done with main loop

    print ''
    print 'Writing final pickle files\n'       
    _dumpFiles( dData, lSets, lsOutPaths)
    del dData
    
    print "Finished..."+ str(time.strftime("%H:%M:%S"))
    
    return