def scrape_all():
    """
    Scrape the downloaded Yahoo Finance pages for detailed fund data.

    Fetches the list of fund symbols with ``db.get_symbols`` and then calls
    ``scrape_fund(conn, cursor, symbol)`` once per symbol on a second,
    separate database connection.  A progress line with an estimate of the
    remaining time is printed after the 10th fund and after every 100th fund.

    The run is best-effort: a failure while scraping one fund is reported on
    stdout and the loop moves on to the next symbol.  KeyboardInterrupt and
    SystemExit are NOT swallowed, so the run can be stopped with Ctrl-C.

    Returns None.
    """
    import time
    import psycopg2.extras
    from modules import db

    # NOTE: print is called with one parenthesised argument throughout so the
    # output is identical under the Python 2 print statement.
    print("*********************************************")
    print("Scraping the detailed data on all stock funds")

    # Get list of symbols on a short-lived connection.
    # NOTE(review): the argument to db.connect looks like a timeout - confirm.
    conn = db.connect(60)
    try:
        cur1 = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        list_symbols = db.get_symbols(cur1)
    finally:
        conn.close()  # Always release the connection, even on error

    total = len(list_symbols)  # Total number of funds
    start = time.time()

    conn2 = db.connect(1800)
    try:
        cur2 = conn2.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        # 'done' counts every fund attempted, whether or not it succeeded.
        for done, symbol in enumerate(list_symbols, 1):
            try:
                scrape_fund(conn2, cur2, symbol)
            except Exception as err:
                # Keep going, but make the failure visible instead of hiding
                # it.
                # NOTE(review): if scrape_fund failed inside a database
                # transaction the connection may need a rollback before the
                # next fund - confirm against scrape_fund.
                print("Failed to scrape %r: %r" % (symbol, err))

            if done == 10 or done % 100 == 0:
                elapsed = time.time() - start
                try:
                    rate_s = done / elapsed  # Stocks/second
                    remain_m = round((total - done) / rate_s / 60, 1)
                except ZeroDivisionError:
                    # Clock did not advance; skip this progress report.
                    continue
                print("Scraping completion: " + str(done) + '/' + str(total))
                print("Minutes remaining: " + str(remain_m))
    finally:
        conn2.close()  # Always release the connection, even on error

    t_elapsed_min = (time.time() - start) / 60
    print("Finished scraping the detailed stock fund data")
    print("Completed in " + str(t_elapsed_min) + " minutes")
    print("**********************************************")
def download():
    """
    Download detailed data on funds from Yahoo Finance

    Steps, in order:
      1. Write the unfiltered fund list to
         ``$BSF_OUTPUT/fund_unfiltered.csv`` via ``db.print_csv``.
      2. Filter the fund list in the database (by fund type, objective and
         name) and renumber it.
      3. For every remaining symbol, create ``$BSF_DETAILED/<symbol>`` and
         download the profile page, the holdings page and the quote CSV into
         it with ``common.download_file``.

    A progress line with an estimate of the remaining time is printed after
    the 10th fund and after every 100th fund.

    Raises KeyError if the BSF_OUTPUT or BSF_DETAILED environment variable is
    not set.  Returns None.
    """
    import os
    import time
    import psycopg2.extras
    from modules import db
    from modules import common

    dir_output = os.environ['BSF_OUTPUT']
    file_csv_output = dir_output + '/fund_unfiltered.csv'

    # NOTE: print is called with one parenthesised argument throughout so the
    # output is identical under the Python 2 print statement.
    # NOTE(review): the argument to db.connect looks like a timeout - confirm.
    conn = db.connect(60)
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        print("Creating unfiltered list of funds at:")
        print(file_csv_output)
        db.print_csv(conn, cur, file_csv_output)
    finally:
        conn.close()  # Always release the connection, even on error

    print("Filtering the list of funds")
    conn = db.connect(60)
    try:
        cur = conn.cursor()
        # Filter the list of funds
        db.filter_by_fundtype(conn, cur)
        db.filter_by_obj(conn, cur)
        db.filter_by_name(conn, cur)
        # Reset ID numbers.  The original author's note says this step
        # "DOES NOT WORK".
        db.renumber(conn, cur)
    finally:
        conn.close()

    print("************************************************")
    print("Downloading the detailed data on all stock funds")
    print("NOTE: This may be a VERY long process.")

    # Get list of symbols
    conn = db.connect(60)
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        list_symbols = db.get_symbols(cur)
    finally:
        conn.close()

    total = len(list_symbols)  # Total number of funds
    # Loop-invariant, so look it up once instead of once per symbol.
    dir_detailed = os.environ['BSF_DETAILED']
    start = time.time()

    # 'done' is the number of funds completed so far.
    for done, symbol in enumerate(list_symbols, 1):
        fund_dir = dir_detailed + '/' + symbol
        common.create_dir(fund_dir)

        url1 = 'http://finance.yahoo.com/q/pr?s=' + symbol + '+Profile'
        url2 = 'http://finance.yahoo.com/q/hl?s=' + symbol + '+Holdings'
        url3 = 'http://finance.yahoo.com/d/quotes.csv?s=' + symbol + '&f=l1'
        file1 = fund_dir + '/profile.html'
        file2 = fund_dir + '/holdings.html'
        file3 = fund_dir + '/quote.csv'

        # NOTE(review): the meaning of the last two numeric arguments is
        # defined by common.download_file and is not visible here - confirm
        # before changing them.
        common.download_file(url1, file1, 164, .2)
        common.download_file(url2, file2, 164, .2)
        common.download_file(url3, file3, 20, .002)

        if done == 10 or done % 100 == 0:
            elapsed = time.time() - start
            try:
                rate_s = done / elapsed  # Stocks/second
                remain_m = round((total - done) / rate_s / 60, 1)
            except ZeroDivisionError:
                # Clock did not advance; skip this progress report.
                continue
            print("Download completion: " + str(done) + '/' + str(total))
            print("Minutes remaining: " + str(remain_m))

    print("Finished downloading detailed data on stock funds")
    print("*************************************************")