Пример #1
0
def scrape_all ():
    """
    Scrape the downloaded Yahoo Finance pages for detailed fund data.
    """
    import time, os
    import psycopg2.extras
    from modules import db 
    from modules import common
    print "*********************************************"
    print "Scraping the detailed data on all stock funds"
    
    # Get list of symbols
    conn = db.connect (60) # Start database connection
    cur1 = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    list_symbols = db.get_symbols (cur1)
    conn.close () # Close database connection
    
    conn2 = db.connect (1800) # Start database connection
    cur2 = conn2.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

    i = 0 # Number of funds completed
    i_max = len (list_symbols) # Total number of funds
    start = time.time ()
    
    for symbol in list_symbols:
        try:
            scrape_fund (conn2, cur2, symbol)
        except:
            pass
        i += 1
        
        now = time.time ()
        t_elapsed = now - start
        try:
            rate_s = i / t_elapsed # Stocks/second
            remain_s = (i_max - i)/rate_s
            remain_m = round(remain_s/60, 1)
            if i == 10 or i % 100 == 0:
                print "Scraping completion: " + str(i) + '/' + str(i_max)
                print "Minutes remaining: " + str(remain_m)
        except:
            pass
    
    conn2.close () # Close database connection
    
    now = time.time ()
    t_elapsed_sec = now - start
    t_elapsed_min = t_elapsed_sec/60
    print "Finished scraping the detailed stock fund data"
    print "Completed in " + str(t_elapsed_min) + " minutes"
    print "**********************************************"
Пример #2
0
def download ():
    """
    Download detailed data on funds from Yahoo Finance
    """
    import time, os
    import psycopg2.extras
    from modules import db 
    from modules import common
    
    dir_output = os.environ ['BSF_OUTPUT']
    file_csv_output = dir_output + '/fund_unfiltered.csv'
    conn = db.connect (60) # Start database connection
    cur2 = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    print "Creating unfiltered list of funds at:"
    print file_csv_output
    db.print_csv (conn, cur2, file_csv_output)
    conn.close () # Close database connection
    
    print "Filtering the list of funds"
    conn = db.connect (60) # Start database connection
    cur1 = conn.cursor ()
    
    
    # Filter the list of funds
    db.filter_by_fundtype (conn, cur1)
    db.filter_by_obj (conn, cur1)
    db.filter_by_name (conn, cur1)
    
    db.renumber (conn, cur1) # Reset ID numbers, DOES NOT WORK
    
    conn.close () # Close database connection
    
    print "************************************************"
    print "Downloading the detailed data on all stock funds"
    print "NOTE: This may be a VERY long process."
    
    # Get list of symbols
    conn = db.connect (60) # Start database connection
    cur2 = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    list_symbols = db.get_symbols (cur2)
    conn.close () # Close database connection
    
    i = 0 # Number of funds completed
    i_max = len (list_symbols) # Total number of funds
    start = time.time ()
    
    for symbol in list_symbols:
        
        dir_detailed = os.environ ['BSF_DETAILED']
        common.create_dir (dir_detailed + '/' + symbol)
		
        url1 = 'http://finance.yahoo.com/q/pr?s=' + symbol + '+Profile' 
        url2 = 'http://finance.yahoo.com/q/hl?s=' + symbol + '+Holdings'
        url3 = 'http://finance.yahoo.com/d/quotes.csv?s=' + symbol + '&f=l1'
		
        file1 = dir_detailed + '/' + symbol + '/profile.html'
        file2 = dir_detailed + '/' + symbol + '/holdings.html'
        file3 = dir_detailed + '/' + symbol + '/quote.csv'
        
        common.download_file (url1, file1, 164, .2)
        common.download_file (url2, file2, 164, .2)
        common.download_file (url3, file3, 20, .002)
        
        i += 1
        now = time.time ()
        t_elapsed = now - start
        try:
            rate_s = i / t_elapsed # Stocks/second
            remain_s = (i_max - i)/rate_s
            remain_m = round(remain_s/60, 1)
            if i == 10 or i % 100 == 0:
                print "Download completion: " + str(i) + '/' + str(i_max)
                print "Minutes remaining: " + str(remain_m)
        except:
            pass
    print "Finished downloading detailed data on stock funds"
    print "*************************************************"