def get_stock_summary(stockSoup): """ Gets the stock summary information from the company summary page including Name, Price<br>, Market Cap, Price Earnings Ratio, Price Change, Ticker, Earnings per Share, Net Tangible Assets, Net DPS, Gross DPS, Beta Value, Price/NTA, Net Yield, Gross Yield, Sharpe Ratio Args: stockSoup (BeautifulSoup): The parsed page source of the summary page Returns: summaryDict (Dict): A dictionary which contains all the information captured on this page """ summaryDict = {} summaryDict["Name"] = (stockSoup.find('h1').text).split(' -')[0] summaryDict["Price"] = float( stockSoup.find( 'td', text='Market Price').find_next_sibling('td').text.split('$')[1]) summaryDict["Market Cap"] = float( stockSoup.find('td', text='Marketcap').find_next_sibling( 'td').text.split('$')[1].replace(',', '')) try: summaryDict["Price Change"] = float( stockSoup.find( 'td', text='Price Change').find_next('td').text.split('$')[1]) except IndexError: summaryDict["Price Change"] = float(0) summaryDict["Ticker"] = stockSoup.find( 'td', text='Ticker').find_next_sibling('td').text logger.debug(summaryDict) return summaryDict
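# A minimal usage sketch for get_stock_summary(), assuming a locally saved
# copy of a company summary page. 'summary.html' is a placeholder file name,
# not part of the scraper itself.
from bs4 import BeautifulSoup

with open('summary.html', 'r') as pageFile:
    summarySoup = BeautifulSoup(pageFile.read(), 'lxml')

summary = get_stock_summary(summarySoup)
print(summary["Ticker"], summary["Price"], summary["Market Cap"])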
def get_stock_historical_prices(stockHistoricalPricesCSV):
    """
    Reads in the csv and outputs a dictionary for storage in the Stock class

    Args:
        stockHistoricalPricesCSV (String): Location where file is located

    Returns:
        (Dict): dictionary of historical prices
    """
    logger.debug(pandas.read_csv(stockHistoricalPricesCSV))
    return pandas.read_csv(stockHistoricalPricesCSV).to_dict('records')
def get_stock_historical_dividends(stockHistoricalDividendsCSV):
    """
    Reads in the csv and outputs a dictionary for storage in the Stock class

    Args:
        stockHistoricalDividendsCSV (String): Location where file is located

    Returns:
        (Dict): dictionary of historical dividends, or None when the company
            has no dividend information
    """
    logger.debug(pandas.read_csv(stockHistoricalDividendsCSV))
    dividendDF = pandas.read_csv(stockHistoricalDividendsCSV)
    dividendDF = dividendDF.dropna()
    try:
        # Keep only the ex-dividend date and gross amount columns
        dividendDF = dividendDF[['Ex Date', 'Gross Amount']]
        dividendDF.columns = ['Date', 'Dividend Paid']
        dividendDF = dividendDF[dividendDF['Dividend Paid'] != '-']
        return dividendDF.to_dict('records')
    except KeyError:
        logger.warning("No dividend information")
        return None
def get_stock_summary(stockSoup):
    """
    Gets the stock summary information including
        - Name
        - Price
        - Market Cap
        - Price Earnings Ratio
        - Price Change
        - Ticker
        - Earnings per Share
        - Net Tangible Assets
        - Net DPS
        - Gross DPS
        - Beta Value
        - Price/NTA
        - Net Yield
        - Gross Yield
        - Sharpe Ratio

    Args:
        stockSoup (BeautifulSoup): The parsed page source of the summary page.

    Returns:
        dict: A dictionary which contains all the information captured on
            this page
    """
    summaryDict = {}
    summaryDict["Name"] = (stockSoup.find('h1').text).split(' -')[0]
    summaryDict["Price"] = stockSoup.find(
        'td', text='Market Price').find_next_sibling('td').text
    summaryDict["Market Cap"] = stockSoup.find(
        'td', text='Marketcap').find_next_sibling('td').text
    summaryDict["Price Earnings Ratio"] = stockSoup.find(
        'td', text='P/E ratio').find_next_sibling('td').text
    summaryDict["Price Change"] = stockSoup.find(
        'td', text='Price Change').find_next('td').text
    summaryDict["Ticker"] = stockSoup.find(
        'td', text='Ticker').find_next_sibling('td').text
    summaryDict["EPS"] = stockSoup.find(
        'td', text='EPS').find_next('td').text
    summaryDict["NTA"] = stockSoup.find(
        'td', text='NTA').find_next_sibling('td').text
    summaryDict["Net DPS"] = stockSoup.find(
        'td', text='Net DPS').find_next_sibling('td').text
    summaryDict["Gross DPS"] = stockSoup.find(
        'td', text='Gross DPS').find_next_sibling('td').text
    summaryDict["Beta Value"] = stockSoup.find(
        'td', text='Beta Value').find_next_sibling('td').text
    summaryDict["Price/NTA"] = stockSoup.find(
        'td', text='Price/NTA').find_next_sibling('td').text
    summaryDict["Net Yield"] = stockSoup.find(
        'td', text='Net Yield').find_next_sibling('td').text
    summaryDict["Gross Yield"] = stockSoup.find(
        'td', text='Gross Yield').find_next_sibling('td').text
    summaryDict["Sharpe Ratio"] = stockSoup.find(
        'td', text='Sharpe Ratio').find_next_sibling('td').text
    logger.debug(summaryDict)
    return summaryDict
def get_stock_historical_prices(stockHistoricalPricesCSV):
    """
    Reads in the csv and outputs a dictionary for storage in the Stock class

    Args:
        stockHistoricalPricesCSV (String): Location where file is located

    Returns:
        (Dict): dictionary of historical prices
    """
    with warnings.catch_warnings():
        # Silence pandas FutureWarnings raised while reading the download
        warnings.simplefilter(action='ignore', category=FutureWarning)
        prices = pandas.read_csv(stockHistoricalPricesCSV).to_dict('records')
    pricesReturn = []
    logger.debug(prices)
    for price in prices:
        # Rename the '$ Value Traded' key so downstream code does not have to
        # deal with the dollar sign
        price['Dollar Value Traded'] = price.pop('$ Value Traded')
        pricesReturn.append(price)
    return pricesReturn
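# The per-row pop() above could equally be done once on the DataFrame before
# converting to records. A minimal sketch of that alternative, assuming the
# same CSV layout; the function name here is illustrative only.
def get_stock_historical_prices_renamed(stockHistoricalPricesCSV):
    pricesDF = pandas.read_csv(stockHistoricalPricesCSV)
    # Rename the awkward '$ Value Traded' header in one pass
    pricesDF = pricesDF.rename(
        columns={'$ Value Traded': 'Dollar Value Traded'})
    return pricesDF.to_dict('records')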
def get_stock_historical_dividends(stockHistoricalDividendsCSV):
    """
    Reads in the csv and outputs a dictionary for storage in the Stock class

    Args:
        stockHistoricalDividendsCSV (String): Location where file is located

    Returns:
        (Dict): dictionary of historical dividends, or None when the company
            has no dividend information
    """
    logger.debug(pandas.read_csv(stockHistoricalDividendsCSV))
    dividendDF = pandas.read_csv(stockHistoricalDividendsCSV)
    dividendDF = dividendDF.dropna()
    try:
        # Keep only the ex-dividend date and gross amount columns
        dividendDF = dividendDF[['Ex Date', 'Gross Amount']]
        dividendDF.columns = ['Date', 'Dividend Paid']
        dividendDF = dividendDF[dividendDF['Dividend Paid'] != '-']
        return dividendDF.to_dict('records')
    except KeyError:
        logger.warning("No dividend information")
        return None
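# A minimal sketch of the transformation above, assuming a small CSV on disk
# containing only the two columns the function keeps (the real download has
# more columns). The file name and values are placeholders for illustration.
sampleCSVPath = "sample_dividends.csv"
with open(sampleCSVPath, 'w') as sampleFile:
    sampleFile.write("Ex Date,Gross Amount\n"
                     "01 Mar 2018,0.10\n"
                     "01 Sep 2018,-\n")
print(get_stock_historical_dividends(sampleCSVPath))
# Expected shape: [{'Date': '01 Mar 2018', 'Dividend Paid': '0.10'}]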
def print_financial_profile_sheet(workbook, stock, formats):
    """
    Writes the financial profile key/value pairs to their own worksheet
    """
    logger.info("    Printing Financial Profile for " +
                stock.stockSummaryDict["Ticker"])
    row = 0
    col = 0
    # Create sheet
    worksheet = workbook.add_worksheet(stock.stockSummaryDict["Ticker"] +
                                       "_FinancialProfile")
    # Print Headers & Values
    logger.debug(stock.stockFinancialProfileDict.items())
    for key, value in stock.stockFinancialProfileDict.items():
        worksheet.write_string(row, col, key)
        worksheet.write_string(row, col + 1, value)
        row += 1
    # Link back to the company's summary sheet
    worksheet.write_url(0, 13,
                        "internal:" + stock.stockSummaryDict["Ticker"] +
                        "_Summary!A1",
                        string="BACK")
def print_historical_dividends_sheet(workbook, stock, formats):
    """
    Writes the historical dividends to their own worksheet, if any exist
    """
    if stock.stockHistoricalDividendsDict is not None:
        logger.info("    Printing Historical Dividends for " +
                    stock.stockSummaryDict["Ticker"])
        row = 0
        col = 0
        # Create sheet
        worksheet = workbook.add_worksheet(stock.stockSummaryDict["Ticker"] +
                                           "_HistoricalDividends")
        # Print Headers
        keys = stock.stockHistoricalDividendsDict[0].keys()
        logger.debug(stock.stockHistoricalDividendsDict[0].keys())
        for key in keys:
            worksheet.write_string(row, col, key)
            col += 1
        # Link back to the company's summary sheet
        worksheet.write_url(row, col + 13,
                            "internal:" + stock.stockSummaryDict["Ticker"] +
                            "_Summary!A1",
                            string="BACK")
        row = 1
        col = 0
        # Print Items
        for rowItems in stock.stockHistoricalDividendsDict:
            logger.debug(rowItems)
            for key, value in rowItems.items():
                logger.debug(value)
                if key == 'Date':
                    worksheet.write_datetime(
                        row, col, datetime.strptime(value, '%d %b %Y'),
                        formats['dateFormat'])
                else:
                    worksheet.write_number(row, col, float(value))
                col += 1
            row += 1
            col = 0
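# A minimal sketch of how these sheet printers might be driven with
# xlsxwriter, assuming stockDataArray holds Stock objects as returned by the
# earlier version of scrape_company(). The workbook file name and the keys of
# the formats dict are assumptions based on how formats['dateFormat'] is used
# above.
import xlsxwriter

workbook = xlsxwriter.Workbook('stock_report.xlsx')  # placeholder file name
formats = {
    'dateFormat': workbook.add_format({'num_format': 'dd mmm yyyy'})
}
for stock in stockDataArray:
    print_financial_profile_sheet(workbook, stock, formats)
    print_historical_dividends_sheet(workbook, stock, formats)
workbook.close()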
def scrape_company(browser, stock):
    """
    Contains the logic behind the scraping of an entire company's data
    Navigating to pages, downloading files

    Args:
        browser (Selenium.WebDriver): The automated Chrome browser
        stock (String): The stock ticker currently being scraped

    Returns:
        stockData (Stock): Class containing dictionaries of data
    """
    logger.info("Current Stock: " + stock)

    # Arrive at Summary & Ratios page and pull information
    browser.find_element_by_link_text(stock).click()
    summarySoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling ratio information")
    stockSummaryDict = get_stock_summary(summarySoup)

    # Arrive at Company Directory and pull directors information
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Company Directory')]").click()
    directorSoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling Director's information")
    stockDirectorDict = get_director_information(directorSoup)
    browser.execute_script("window.history.go(-1)")  # Go back to summary page

    # Arrive at Company Profile and pull description information
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Company Profile')]").click()
    profileSoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling company description")
    stockProfileDict = get_company_profile(profileSoup)
    logger.debug(stockProfileDict)
    browser.execute_script("window.history.go(-1)")  # Go back to summary page

    # Arrive at Financial Profile and pull debt-equity information
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Financial Profile')]").click()
    stockSoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling financial profile information")
    stockFinancialProfileDict = get_financial_profile(stockSoup)
    browser.execute_script("window.history.go(-1)")  # Go back to summary page

    # Arrive at Annual Reports and pull latest annual report
    # ? May require refactor of xpath to shorten it (Looks nicer)
    # TODO change dl directory outside temp
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Annual Reports')]").click()
    logger.info("Pulling annual report")
    browser.find_element_by_xpath(
        r"""//*[@id="content"]/center/table/tbody/tr[3]/td/table/tbody/tr[2]/td[2]/table/tbody/tr/td/table[2]/tbody/tr[1]/td[1]/table/tbody/tr[1]/td[2]/form/input"""
    ).click()
    # sleep(10)
    browser.execute_script("window.history.go(-1)")  # Go back to summary page

    # Arrive at Tear Sheet and pull latest tear sheet
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Tear Sheet')]").click()
    logger.info("Pulling tear sheet")
    browser.find_element_by_xpath(
        r"""//*[@id="content"]/center/table/tbody/tr[3]/td/table/tbody/tr[2]/td[2]/table/tbody/tr/td/table/tbody/tr/td/p[2]/a"""
    ).click()
    # sleep(10)
    browser.execute_script("window.history.go(-1)")  # Go back

    # Create csv link for historical prices and pull it into a temporary folder
    csvLink = create_historical_prices_csv_link(stockSummaryDict["Ticker"])
    logger.info("Pulling historical prices information")
    browser.get(csvLink)
    sleep(3)
    stockHistoricalPricesDict = get_stock_historical_prices(
        tempDirectory + stockSummaryDict["Ticker"] + " Historical Prices.csv")

    # Create csv link for dividends and pull it into a temporary folder
    csvLink = create_historical_dividends_csv_link(stockSummaryDict["Ticker"])
    logger.info("Pulling historical dividends information")
    browser.get(csvLink)
    sleep(3)
    stockHistoricalDividendsDict = get_stock_historical_dividends(
        tempDirectory + stockSummaryDict["Ticker"] +
        " Historical Dividends.csv")

    # Go back to the stock ticker page
    logger.info("Back to company listings")
    browser.execute_script("window.history.go(-1)")

    # Create the stock obj and store it in an array
    stockData = Stock(stockSummaryDict, stockHistoricalPricesDict,
                      stockHistoricalDividendsDict, stockFinancialProfileDict,
                      stockProfileDict, stockDirectorDict)
    return stockData
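# The TODO above mentions moving downloads out of the temp folder. A minimal
# sketch of pointing Chrome's downloads at tempDirectory when the driver is
# created; the preference keys are standard ChromeOptions settings, and where
# the driver gets built is an assumption since that code sits elsewhere.
from selenium import webdriver

chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option(
    'prefs', {'download.default_directory': tempDirectory,
              'download.prompt_for_download': False})
browser = webdriver.Chrome(options=chromeOptions)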
def save_data(stockDataArray, success):
    """
    Constructs a dictionary of company information, converts it to JSON,
    and sends it externally using send_to_server()

    Args:
        stockDataArray (List): dictionary of all company information
        success (Boolean): To indicate whether the scraping was successful,
            to identify if processing needs to occur
    """
    currentTimeStamp = datetime.now().strftime('%Y/%m/%d')
    scrapeInsert = {currentTimeStamp: {'Date': currentTimeStamp}}
    if success:
        logger.info("Saving data")
        print("Saving data")
        dividendInsert = {'Data': {}, 'Name': 'HistoricalDividends'}
        priceInsert = {'Data': {}, 'Name': 'HistoricalPrices'}
        stockIteration = 0
        # Select stock
        for stock in stockDataArray:
            currentStockTicker = stock['Summary']['Ticker']
            logger.info("Saving data for: " + currentStockTicker)
            stockInsert = {}
            # Create stock dict from scraped data
            for sectionKey, sectionData in stock.items():
                logger.info(sectionKey)
                sectionInsert = {}
                if sectionKey == 'HistoricalPrices':
                    for line in sectionData:
                        logger.debug(line)
                        dateString = line.pop('Date')
                        dateString = (datetime.strptime(
                            dateString, '%d %b %Y')).strftime("%Y-%m-%d")
                        sectionInsert[dateString] = line
                    stockInsert[sectionKey] = sectionInsert
                elif sectionKey == 'HistoricalDividends':
                    try:
                        for line in sectionData:
                            logger.debug(line)
                            dateString = line.pop('Date')
                            dateString = (datetime.strptime(
                                dateString, '%d %b %Y')).strftime("%Y-%m-%d")
                            sectionInsert[dateString] = line.pop(
                                'Dividend Paid')
                        stockInsert[sectionKey] = sectionInsert
                    except TypeError:
                        # No dividend information for this company
                        pass
                else:
                    for elementKey, elementValue in sectionData.items():
                        sectionInsert[elementKey] = elementValue
                    stockInsert[sectionKey] = sectionInsert
            scrapeInsert[currentTimeStamp][stock['Summary']
                                           ['Ticker']] = stockInsert
            stockIteration += 1
            printProgressBar(
                stockIteration,
                len(stockDataArray),
                prefix='Saving {} data'.format(stock['Summary']['Ticker']),
                suffix='of {} companies completed'.format(len(stockDataArray)))
        with open('data.txt', 'w') as outfile:
            json.dump(scrapeInsert, outfile, indent=4)
        # save_result_to_pastebin(scrapeInsert, currentTimeStamp)
        send_to_server(scrapeInsert)
        send_files_to_server()
    else:
        scrapeInsert[currentTimeStamp] = {}
        # save_result_to_pastebin(scrapeInsert, currentTimeStamp)
        send_to_server(scrapeInsert)
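# For reference, a hand-written sketch of the structure save_data() builds
# before dumping it to data.txt. The ticker, dates, field names and values
# below are illustrative placeholders, not real scraped data.
scrapeInsertExample = {
    "2019/01/01": {
        "Date": "2019/01/01",
        "ABC": {
            "Summary": {"Ticker": "ABC", "Price": "1.00"},
            "HistoricalPrices": {"2018-12-31": {"Open": "1.00"}},
            "HistoricalDividends": {"2018-09-01": "0.05"},
        }
    }
}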
def get_stock_historical_prices(stockHistoricalPricesCSV):
    """
    Reads in the csv and outputs a dictionary for storage in the Stock class

    Args:
        stockHistoricalPricesCSV (String): Location where file is located

    Returns:
        (Dict): dictionary of historical prices
    """
    logger.debug(pandas.read_csv(stockHistoricalPricesCSV))
    return pandas.read_csv(stockHistoricalPricesCSV).to_dict('records')
def scrape_company(browser, stock):
    """
    Contains the logic behind the scraping of an entire company's data
    Navigating to pages, downloading files

    Args:
        browser (Selenium.WebDriver): The automated Chrome browser
        stock (String): The stock ticker currently being scraped

    Returns:
        stockData (Dict): Dictionary containing the dictionaries of scraped
            data
    """
    logger.info("Current Stock: " + stock)
    stockInnerIteration = 0
    numFuncs = 10
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))

    # Arrive at Summary & Ratios page and pull information
    browser.find_element_by_link_text(stock).click()
    summarySoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling ratio information")
    stockSummaryDict = get_stock_summary(summarySoup)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    stockRatioDict = get_ratios(summarySoup)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))

    # Create csv link for historical prices and pull it into a temporary folder
    csvLink = create_historical_prices_csv_link(stock)
    logger.info("Pulling historical prices information")
    browser.get(csvLink)

    # Create csv link for dividends and pull it into a temporary folder
    csvLink = create_historical_dividends_csv_link(stock)
    logger.info("Pulling historical dividends information")
    browser.get(csvLink)

    # Arrive at Annual Reports and pull latest annual report
    # TODO May require refactor of xpath to shorten it (Looks nicer)
    # TODO change dl directory outside temp
    # Try the current year's annual report first, then fall back to last year
    try:
        logger.info("Pulling annual report")
        year = int(datetime.now().strftime('%Y'))
        annualReportLink = create_annual_report_link(stock, str(year))
        browser.get(annualReportLink)
        if browser.find_element_by_xpath(
                ".//title[contains(text(), '404 Not Found')]"):
            browser.execute_script(
                "window.history.go(-1)")  # Go back to summary page
            annualReportLink = create_annual_report_link(stock, str(year - 1))
            browser.get(annualReportLink)
            if browser.find_element_by_xpath(
                    ".//title[contains(text(), '404 Not Found')]"):
                browser.execute_script(
                    "window.history.go(-1)")  # Go back to summary page
    except Exception:
        # find_element_by_xpath raises when no '404 Not Found' title exists,
        # which means the report downloaded successfully
        pass
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    # browser.execute_script("window.history.go(-1)")  # Go back to summary page

    # Create and get the tear sheet for the company
    tearSheetLink = ('https://companyresearch-nzx-com.ezproxy.aut.ac.nz/tearsheets/'
                     + stock + '.pdf')
    browser.get(tearSheetLink)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))

    # Arrive at Company Directory and pull directors information
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Company Directory')]").click()
    directorSoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling Director's information")
    stockDirectorDict = get_director_information(directorSoup)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    browser.execute_script("window.history.go(-1)")  # Go back to summary page

    # Arrive at Company Profile and pull description information
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Company Profile')]").click()
    profileSoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling company description")
    stockProfileDict = get_company_profile(profileSoup)
    logger.debug(stockProfileDict)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    browser.execute_script("window.history.go(-1)")  # Go back to summary page

    # Arrive at Financial Profile and pull debt-equity information
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Financial Profile')]").click()
    stockSoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling financial profile information")
    stockFinancialProfileDict = get_financial_profile(stockSoup)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    browser.execute_script("window.history.go(-1)")  # Go back to summary page

    # Read in the prices csv
    stockHistoricalPricesDict = get_stock_historical_prices(
        tempDirectory + stock + " Historical Prices.csv")
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))

    # Read in dividends csv
    stockHistoricalDividendsDict = get_stock_historical_dividends(
        tempDirectory + stock + " Historical Dividends.csv")
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))

    # Go back to the stock ticker page
    logger.info("Back to company listings")
    browser.execute_script("window.history.go(-1)")
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))

    # Create the stock obj and store it in an array
    stockData = {
        'Summary': stockSummaryDict,
        'Ratio': stockRatioDict,
        'HistoricalPrices': stockHistoricalPricesDict,
        'HistoricalDividends': stockHistoricalDividendsDict,
        'FinancialProfile': stockFinancialProfileDict,
        'Profile': stockProfileDict,
        'Directors': stockDirectorDict
    }
    return stockData