def create_historical_prices_csv_link(stockSummaryDict):
    """Build the CSV download URL for a company's historical prices.

    The window covered runs from three years ago until today.

    NOTE(review): a later definition with the same name (taking a plain
    ticker string) also exists in this module and shadows this one at
    import time -- confirm which version callers expect.

    Args:
        stockSummaryDict (dict): Company summary; only the "Ticker" key is read.

    Returns:
        str: URL that serves the historical-prices CSV.
    """
    fromDate = (datetime.now() - timedelta(days=365 * 3)).strftime('%Y-%m-%d')
    toDate = datetime.now().strftime('%Y-%m-%d')
    base = "https://companyresearch-nzx-com.ezproxy.aut.ac.nz/deep_ar/functions/csv_prices.php?"
    query = "default={}&fd={}&td={}".format(
        stockSummaryDict["Ticker"], fromDate, toDate)
    csvLink = base + query
    logger.info("Pulling historical price data from: " + csvLink)
    return csvLink
def send_files_to_server():
    """ Upload every PDF in the local ``temp`` folder to the scrape endpoint.

    Non-PDF entries are skipped but still advance the progress bar; the
    suffix counter reports how many PDFs have actually been posted.
    """
    tempFiles = os.listdir("temp")
    sentCount = 0  # PDFs posted so far
    targetURL = getDestinationURL()
    logger.info("Sending files to: " + targetURL)
    for position, name in enumerate(tempFiles, start=1):
        payload = {}
        if name.endswith(".pdf"):
            sentCount += 1
            # Keep the handle open while requests streams it out.
            with open(os.path.join(r'temp', name), 'rb') as handle:
                payload[name] = handle
                r = requests.post(targetURL, files=payload)
                logger.info("Sent file: " + name)
        printProgressBar(
            position, len(tempFiles),
            prefix='Saving {} data'.format(name).ljust(24),
            suffix='| {} files completed'.format(sentCount),
            length=10)
def get_browser():
    """ Creates a chrome driver which will be used by selenium to conduct the website navigation

    Sets the following options to aid in webscraping
    - Auto file download into ``downloadDirectory``
    - Removal of the images (faster page loads)
    - Disables internal pdf viewer (PDFs download instead of rendering)

    Navigates to the AUT library NZX landing page and waits up to 15
    seconds for it to become ready before returning.

    Returns:
        webdriver: Driver for site navigation
    """
    # Set up driver options
    chromeOptions = Options()
    chromeOptions.add_argument('log-level=3')  # Remove warnings
    chromeOptions.add_argument('--disable-gpu')
    chromeOptions.add_argument('headless')
    # Bypass any system proxy so requests go straight out.
    chromeOptions.add_argument("--proxy-server='direct://'")
    chromeOptions.add_argument("--proxy-bypass-list=*")
    chromeOptions.add_argument('--no-proxy-server')
    prefs = {
        "download.default_directory": downloadDirectory,  # Sets default directory for downloads
        "directory_upgrade": True,  # Provides write permissions to the directory
        "plugins.always_open_pdf_externally": True,  # Disables the built-in pdf viewer (Helps with pdf download)
        "safebrowsing.enabled": True,  # Tells driver all file downloads and sites are safe
        "download.prompt_for_download": False,  # Auto downloads files into default directory
        "profile.managed_default_content_settings.images": 2
    }  # Removes images for faster load times
    chromeOptions.add_experimental_option("prefs", prefs)
    browser = webdriver.Chrome(chromeDriverLocation,
                               chrome_options=chromeOptions)  # Apply options
    # Register the raw Chromium "send_command" endpoint so headless mode can
    # be told to allow downloads (not exposed by vanilla Selenium).
    browser.command_executor._commands["send_command"] = (
        "POST", '/session/$sessionId/chromium/send_command')
    params = {
        'cmd': 'Page.setDownloadBehavior',
        'params': {
            'behavior': 'allow',
            'downloadPath': downloadDirectory
        }
    }
    browser.execute("send_command", params)
    homeURL = "https://library.aut.ac.nz/databases/nzx-deep-archive"
    browser.get(homeURL)
    delay = 15  # seconds
    # Wait 15 seconds for the driver to get started and get to the landing page
    try:
        myElem = WebDriverWait(browser, delay).until(
            EC.presence_of_element_located((By.CLASS_NAME, "form-field")))
        logger.info("Browser is ready!")
    except TimeoutException:
        # NOTE(review): the timeout is only logged; the browser is returned
        # regardless, so callers may receive a page that never loaded.
        logger.error("Loading took too much time!")
    logger.info("get_browser() complete")
    print("Chromium open")
    return browser
def score_companies(stockDataArray):
    """ Scores each company based on their own values compared to other companies.

    Each ratio is normalised against the ranges observed across the whole
    scrape, then the four index values are combined with a geometric
    average (4th root of the product) for a more accurate representation.

    Args:
        stockDataArray (List): dictionary of all company information
    """
    ranges = find_normal_ranges(stockDataArray)
    for company in stockDataArray:
        debtIdx = findDebtEquityIndexValue(
            company, ranges['Debt Equity Max'], ranges['Debt Equity Min'])
        yieldIdx = findNetDividendYield(
            company, ranges['Dividend Yield Max'],
            ranges['Dividend Yield Min'])
        sharpeIdx = findSharpeRatioIndexValue(
            company, ranges['Sharpe Ratio Max'], ranges['Sharpe Ratio Min'])
        roeIdx = findReturnOnEquityIndexValue(
            company, ranges['Return on Equity Max'],
            ranges['Return on Equity Min'])
        # Geometric average to make score more accurate
        finalScore = (debtIdx * sharpeIdx * roeIdx * yieldIdx) ** 0.25
        company['Summary']['Score'] = finalScore
        ticker = company['Summary']['Ticker']
        logger.info("{} | Score: {}".format(ticker, finalScore))
        print("{} got a score of: {}".format(ticker, finalScore))
def create_historical_dividends_csv_link(stockTicker):
    """ Creates a csv link used to download the historical dividends csv.

    Args:
        stockTicker (String): Contains the ticker of the company

    Returns:
        csvLink (String): url which holds the csv
    """
    csvLink = ("https://companyresearch-nzx-com.ezproxy.aut.ac.nz"
               "/deep_ar/divhistory_csv.php?selection={}".format(stockTicker))
    logger.info("Pulling historical dividend data from: " + csvLink)
    return csvLink
def print_overview_sheet(workbook, stockDataArray, formats):
    """Create the "Overview" worksheet with one internal link per company.

    Each row links through to that company's summary sheet. ``formats`` is
    accepted for signature consistency with the other print_* helpers but
    is not used here.
    """
    logger.info("Printing Overview")
    sheet = workbook.add_worksheet("Overview")
    sheet.write_string(0, 0, "Stocks")
    for rowIndex, stock in enumerate(stockDataArray, start=1):
        target = "internal:" + stock.stockSummaryDict["Ticker"] + "_Summary!A1"
        sheet.write_url(rowIndex, 0, target,
                        string=stock.stockSummaryDict["Name"])
def send_to_server(scrapeInsert):
    """ Sends the given JSON object to the appropriate URL

    Args:
        scrapeInsert (JSON): JSON object with all company information
    """
    endpoint = getDestinationURL()
    requestHeaders = {'Content-type': 'application/json',
                      'Accept': 'text/plain'}
    response = requests.post(endpoint,
                             data=json.dumps(scrapeInsert),
                             headers=requestHeaders)
    logger.info("Sent JSON data to {}".format(endpoint))
    logger.info("Received response {}".format(response.status_code))
def findSharpeRatioIndexValue(stock, max, min):
    """Normalise the company's Sharpe ratio into the [0, 1] range.

    NOTE: the ``max``/``min`` parameter names shadow the builtins; they are
    kept unchanged for call compatibility.

    Args:
        stock (Dict): dictionary of company information
        max (Float): the maximum sharpe ratio within this scrape + 1
        min (Float): the minimum sharpe ratio within this scrape - 1

    Returns:
        index (Float): The normalised value of the company's sharpe ratio
        (Always between 0 and 1)
    """
    spread = max - min
    sharpeIndex = (stock['Ratio']['Sharpe Ratio'] - min) / spread
    stock['Summary']['Sharpe Ratio Index'] = sharpeIndex
    logger.info("{} | Sharpe: {}".format(stock['Summary']['Ticker'],
                                         sharpeIndex))
    return sharpeIndex
def findReturnOnEquityIndexValue(stock, max, min):
    """Normalise the company's return on equity into the [0, 1] range.

    NOTE: the ``max``/``min`` parameter names shadow the builtins; they are
    kept unchanged for call compatibility.

    Args:
        stock (Dict): dictionary of company information
        max (Float): the maximum return on equity within this scrape + 1
        min (Float): the minimum return on equity within this scrape - 1

    Returns:
        index (Float): The normalised value of the company's return on
        equity (Always between 0 and 1)
    """
    spread = max - min
    roeIndex = (stock['Ratio']['Return on Equity'] - min) / spread
    stock['Summary']['Return on Equity Index'] = roeIndex
    logger.info("{} | RoE Index: {}".format(stock['Summary']['Ticker'],
                                            roeIndex))
    return roeIndex
def findNetDividendYield(stock, max, min):
    """Normalise the company's net dividend yield into the [0, 1] range.

    NOTE: the ``max``/``min`` parameter names shadow the builtins; they are
    kept unchanged for call compatibility.

    Args:
        stock (Dict): dictionary of company information
        max (Float): the maximum dividend yield within this scrape + 1
        min (Float): the minimum dividend yield within this scrape - 1

    Returns:
        index (Float): The normalised value of the company's dividend yield
        (Always between 0 and 1)
    """
    spread = max - min
    yieldIndex = (stock['Ratio']['Net Yield'] - min) / spread
    stock['Summary']['Net Dividend Yield Index'] = yieldIndex
    logger.info("{} | Yield: {}".format(stock['Summary']['Ticker'],
                                        yieldIndex))
    return yieldIndex
def findDebtEquityIndexValue(stock, max, min):
    """Normalise the company's debt/equity ratio into the [0, 1] range.

    The scale is inverted (1 - normalised value) because a lower
    debt/equity ratio is the better outcome.

    NOTE: the ``max``/``min`` parameter names shadow the builtins; they are
    kept unchanged for call compatibility.

    Args:
        stock (Dict): dictionary of company information
        max (Float): the maximum debt equity within this scrape + 1
        min (Float): the minimum debt equity within this scrape - 1

    Returns:
        index (Float): The normalised value of the company's debt equity
        (Always between 0 and 1)
    """
    spread = max - min
    debtIndex = 1 - ((stock['Ratio']['Debt Equity'] - min) / spread)
    stock['Summary']['Debt Equity Index'] = debtIndex
    logger.info("{} | Debt Equity: {}".format(stock['Summary']['Ticker'],
                                              debtIndex))
    return debtIndex
def analyse_company_risk(stockDataArray):
    """ Score each company's risk as the standard deviation of its prices.

    For each company, collects the 'Last' price from every historical
    record and uses the sample standard deviation of that series as an
    indicator for risk. The result is saved into the Summary dictionary
    under the 'Risk' key.

    Args:
        stockDataArray (List): dictionary of all company information

    Note:
        ``statistics.stdev`` raises ``StatisticsError`` for fewer than two
        price points; callers supply multi-year daily price histories.
    """
    for stock in stockDataArray:
        # Pull the closing ('Last') price out of each historical record.
        priceList = [price['Last'] for price in stock['HistoricalPrices']]
        risk = statistics.stdev(priceList)
        logger.info("{} | Risk: {}".format(stock['Summary']['Ticker'], risk))
        stock['Summary']['Risk'] = risk
def create_historical_prices_csv_link(stockTicker):
    """ Creates a csv link used to download the historical prices csv.

    Uses today's date and the date three years prior as the window.

    Args:
        stockTicker (String): Contains the ticker of the company

    Returns:
        csvLink (String): url which holds the csv
    """
    fromDate = (datetime.now() - timedelta(days=365 * 3)).strftime('%Y-%m-%d')
    toDate = datetime.now().strftime('%Y-%m-%d')
    csvLink = (
        "https://companyresearch-nzx-com.ezproxy.aut.ac.nz/deep_ar/functions/csv_prices.php?"
        "default={}&fd={}&td={}".format(stockTicker, fromDate, toDate))
    logger.info("Pulling historical price data from: " + csvLink)
    return csvLink
def print_summary_sheet(workbook, stock, formats):
    """Write one company's summary key/value pairs to its own worksheet.

    Keys go in column A, values in column C, one pair per row. Navigation
    links back to the Overview sheet and across to the company's
    historical-prices sheet are placed on the first row. ``formats`` is
    accepted for signature consistency but unused.
    """
    ticker = stock.stockSummaryDict["Ticker"]
    logger.info(" Printing Summary & Ratios for " + ticker)
    worksheet = workbook.add_worksheet(ticker + "_Summary")
    for rowIndex, (label, text) in enumerate(stock.stockSummaryDict.items()):
        worksheet.write_string(rowIndex, 0, label)
        worksheet.write_string(rowIndex, 2, text)
    worksheet.write_url(0, 5, "internal:Overview!A1", string="BACK")
    worksheet.write_url(0, 7,
                        "internal:" + ticker + "_HistoricalPrices!A1",
                        string="Historical Prices")
def save_log_to_pastebin():
    """ Back up the local log file to Pastebin.

    This method is used to retrieve logs from Heroku, where it would
    otherwise be impossible. It sends any log files to Pastebin, where we
    can monitor how it is functioning. The account is capped at 10 pastes,
    so the oldest paste is deleted first when the cap is reached.

    SECURITY NOTE(review): the Pastebin dev/user API keys are hard-coded
    below; they should be rotated and moved to configuration/environment.
    """
    pastebinApiURL = 'https://pastebin.com/api/api_post.php'
    # Hard-coded credentials -- see security note in the docstring.
    dev_key = '5f996bee7fa49af7481927ddce874367'
    user_key = '77787566e1fa286ab849d7b0e22169c9'
    # Check number of pastes
    dataList = {}
    dataList['api_dev_key'] = dev_key
    dataList['api_option'] = 'list'
    dataList['api_user_key'] = user_key
    r = requests.post(pastebinApiURL, data=dataList)
    # The list endpoint returns bare <paste> elements; wrap them in a root
    # tag so ElementTree can parse the response as a single document.
    pastesString = "<pastes>" + r.text + "</pastes>"
    root = ElementTree.fromstring(pastesString)
    numPastes = len(root.findall('paste'))
    # Find the oldest paste then delete it
    if numPastes == 10:
        logger.info("10 Pastes found, deleting one to make paste for next one")
        # NOTE(review): assumes child [0] is the paste key and child [1] the
        # creation timestamp -- confirm against the Pastebin API schema.
        oldestPaste = root[0][0].text
        oldestDate = int(root[0][1].text)
        for paste in root:
            if int(paste[1].text) < oldestDate:
                oldestPaste = paste[0].text
                oldestDate = int(paste[1].text)
        dataDelete = {}
        dataDelete['api_dev_key'] = dev_key
        dataDelete['api_option'] = 'delete'
        dataDelete['api_user_key'] = user_key
        dataDelete['api_paste_key'] = oldestPaste
        r = requests.post(pastebinApiURL, data=dataDelete)
        logger.info("Deleted {} paste".format(oldestPaste))
    logger.info("Sending logs to Pastebin")
    logger.info("Bye, Felicia")
    dataPaste = {}
    dataPaste['api_dev_key'] = dev_key
    dataPaste['api_option'] = 'paste'
    with open("python_logging.log", "r") as logging_file:
        dataPaste['api_paste_code'] = logging_file.read()
    dataPaste['api_user_key'] = user_key
    dataPaste['api_paste_name'] = 'JSON Scrape Backup ' + str(datetime.now())
    # dataPaste['api_paste_format'] = 'json'
    dataPaste['api_paste_private'] = '2'  # presumably private visibility -- see Pastebin API docs
    dataPaste['api_paste_expire_date'] = '6M'  # expire after six months
    r = requests.post(pastebinApiURL, data=dataPaste)
    print("New Paste at: " + r.text)
def print_financial_profile_sheet(workbook, stock, formats):
    """Write the company's financial-profile key/value pairs to a sheet.

    Keys go in column A, values in column B, plus a BACK link to the
    company's summary sheet. ``formats`` is accepted for signature
    consistency but unused.
    """
    ticker = stock.stockSummaryDict["Ticker"]
    logger.info(" Printing Financial Profile for " + ticker)
    # Create sheet
    worksheet = workbook.add_worksheet(ticker + "_FinancialProfile")
    # Print Headers & Values
    logger.debug(stock.stockFinancialProfileDict.items())
    for rowIndex, (label, text) in enumerate(
            stock.stockFinancialProfileDict.items()):
        worksheet.write_string(rowIndex, 0, label)
        worksheet.write_string(rowIndex, 1, text)
    worksheet.write_url(0, 13,
                        "internal:" + ticker + "_Summary!A1",
                        string="BACK")
def print_historical_dividends_sheet(workbook, stock, formats):
    """Write a company's dividend history to its own worksheet.

    Companies with no dividend data (dict is None) are skipped. The header
    row is taken from the keys of the first record; 'Date' cells are
    written as real dates using the shared date format and every other
    column is written as a number.
    """
    if stock.stockHistoricalDividendsDict is None:
        return
    ticker = stock.stockSummaryDict["Ticker"]
    logger.info(" Printing Historical Dividends for " + ticker)
    # Create sheet
    worksheet = workbook.add_worksheet(ticker + "_HistoricalDividends")
    # Print Headers
    headerKeys = stock.stockHistoricalDividendsDict[0].keys()
    logger.debug(stock.stockHistoricalDividendsDict[0].keys())
    for colIndex, header in enumerate(headerKeys):
        worksheet.write_string(0, colIndex, header)
    worksheet.write_url(0, len(headerKeys) + 13,
                        "internal:" + ticker + "_Summary!A1",
                        string="BACK")
    # Print Items
    for rowIndex, record in enumerate(stock.stockHistoricalDividendsDict,
                                      start=1):
        logger.debug(record)
        for colIndex, (fieldName, fieldValue) in enumerate(record.items()):
            logger.debug(fieldValue)
            if fieldName == 'Date':
                worksheet.write_datetime(
                    rowIndex, colIndex,
                    datetime.strptime(fieldValue, '%d %b %Y'),
                    formats['dateFormat'])
            else:
                worksheet.write_number(rowIndex, colIndex, float(fieldValue))
def print_excel(stockDataArray):
    """Render the full scrape into ``StockDB.xlsx``.

    Builds the shared cell formats, writes the overview sheet, then one
    set of per-company sheets, and finally the ratios database sheet.
    """
    logger.info("Printing excel document")
    # Create excel workbook
    workbook = xlsxwriter.Workbook('StockDB.xlsx')
    # Excel Cell Formats shared across the individual sheets
    formats = {
        'dateFormat': workbook.add_format({'num_format': 'd mmm yyyy'}),
        'moneyFormat': workbook.add_format({'num_format': '$#,##0'}),
        'number2decFormat': workbook.add_format({'num_format': '#.##'}),
    }
    print_overview_sheet(workbook, stockDataArray, formats)
    for stock in stockDataArray:
        print_summary_sheet(workbook, stock, formats)
        print_historical_prices_sheet(workbook, stock, formats)
        print_Directors(workbook, stock, formats)
        print_company_profile(workbook, stock, formats)
        print_historical_dividends_sheet(workbook, stock, formats)
        print_financial_profile_sheet(workbook, stock, formats)
    print_ratios_db(workbook, stockDataArray, formats)
    workbook.close()
def find_normal_ranges(stockDataArray):
    """ Finds the max and min for each index.

    To prevent a company from receiving a perfect score or a zero, a
    buffer of 1 is added above every max and below every min. As a side
    effect this also computes and stores each company's
    'Return on Equity' and 'Debt Equity' ratios.

    Note: every range starts at 0, so a max never drops below 0 and a min
    never rises above 0 even when all observed values sit on one side of
    zero -- this mirrors the original behaviour.

    Args:
        stockDataArray (List): dictionary of all company information

    Returns:
        Dict: Contains the max and min of each index
    """
    normalisationRanges = {
        "Dividend Yield Max": 0,
        "Dividend Yield Min": 0,
        "Return on Equity Max": 0,
        "Return on Equity Min": 0,
        "Sharpe Ratio Max": 0,
        "Sharpe Ratio Min": 0,
        "Debt Equity Max": 0,
        "Debt Equity Min": 0,
    }

    def _widen(name, value):
        # Stretch the named range so `value` sits strictly inside it
        # (the +/- 1 buffer prevents perfect 1.0 or 0.0 index scores).
        if value >= normalisationRanges[name + ' Max']:
            normalisationRanges[name + ' Max'] = value + 1
        if value <= normalisationRanges[name + ' Min']:
            normalisationRanges[name + ' Min'] = value - 1

    for stock in stockDataArray:
        balance = stock['FinancialProfile']['Data']['Balance']

        # Dividend Yield Ranges
        _widen('Dividend Yield', stock['Ratio']['Net Yield'])

        # Return on Equity = net income / shareholder equity, as a percentage
        netIncome = stock['FinancialProfile']['Data']['Income']['Net Income']
        stock['Ratio']['Return on Equity'] = (
            netIncome / balance['Total Equity']) * 100
        _widen('Return on Equity', stock['Ratio']['Return on Equity'])

        # Sharpe Ratio
        _widen('Sharpe Ratio', stock['Ratio']['Sharpe Ratio'])

        # Debt Equity = total liabilities / total equity
        stock['Ratio']['Debt Equity'] = (
            balance['Total Liabilities'] / balance['Total Equity'])
        _widen('Debt Equity', stock['Ratio']['Debt Equity'])

    logger.info(normalisationRanges)
    return normalisationRanges
def save_data(stockDataArray, success):
    """ Constructs a dictionary of company information.

    Converts it to JSON, saves a local copy to ``data.txt``, and sends it
    externally using send_to_server(). When the scrape failed, an empty
    entry for today's date is sent instead so the server still records the
    run.

    WARNING(review): a zero-argument ``save_data()`` is also defined later
    in this module and shadows this definition at import time -- confirm
    which one is intended to survive.

    Args:
        stockDataArray (List): dictionary of all company information
        success (Boolean): To indicate whether the scraping was succesful,
            to identify if processing needs to occur
    """
    currentTimeStamp = datetime.now().strftime('%Y/%m/%d')
    # Payload is keyed by scrape date; each company is nested underneath.
    scrapeInsert = {currentTimeStamp: {'Date': currentTimeStamp}}
    if success:
        logger.info("Saving data")
        print("Saving data")
        # NOTE(review): these two inserts are built but never used below.
        dividendInsert = {'Data': {}, 'Name': 'HistoricalDividends'}
        priceInsert = {'Data': {}, 'Name': 'HistoricalPrices'}
        stockIteration = 0
        # Select stock
        for stock in stockDataArray:
            currentStockTicker = stock['Summary']['Ticker']
            logger.info("Saving data for: " + currentStockTicker)
            stockInsert = {}
            # Create stock dict from scraped data
            for sectionKey, sectionData in stock.items():
                logger.info(sectionKey)
                sectionInsert = {}
                if sectionKey == 'HistoricalPrices':
                    # Re-key each price row by its ISO-formatted date.
                    for line in sectionData:
                        logger.debug(line)
                        dateString = line.pop('Date')
                        dateString = (datetime.strptime(
                            dateString, '%d %b %Y')).strftime("%Y-%m-%d")
                        sectionInsert[dateString] = line
                    stockInsert[sectionKey] = sectionInsert
                elif sectionKey == 'HistoricalDividends':
                    # Dividend data may be None (company pays none);
                    # iterating None raises TypeError, which is skipped.
                    try:
                        for line in sectionData:
                            logger.debug(line)
                            dateString = line.pop('Date')
                            dateString = (datetime.strptime(
                                dateString, '%d %b %Y')).strftime("%Y-%m-%d")
                            sectionInsert[dateString] = line.pop(
                                'Dividend Paid')
                        stockInsert[sectionKey] = sectionInsert
                    except TypeError:
                        pass
                else:
                    # All other sections are flat dicts; copy them as-is.
                    for elementKey, elementValue in sectionData.items():
                        sectionInsert[elementKey] = elementValue
                    stockInsert[sectionKey] = sectionInsert
            scrapeInsert[currentTimeStamp][stock['Summary']
                                           ['Ticker']] = stockInsert
            stockIteration += 1
            printProgressBar(
                stockIteration,
                len(stockDataArray),
                prefix='Saving {} data'.format(stock['Summary']['Ticker']),
                suffix='of {} companies completed'.format(len(stockDataArray)))
        # Keep a local JSON copy before shipping it off.
        with open('data.txt', 'w') as outfile:
            json.dump(scrapeInsert, outfile, indent=4)
        # save_result_to_pastebin(scrapeInsert, currentTimeStamp)
        send_to_server(scrapeInsert)
        send_files_to_server()
    else:
        # Scrape failed: record an empty entry for today.
        scrapeInsert[currentTimeStamp] = {}
        # save_result_to_pastebin(scrapeInsert, currentTimeStamp)
        send_to_server(scrapeInsert)
from nzxscraper import logger

# --- Script entry: scrape every listed company, then clean up. ---
startTime = time()
browser = get_browser()
try:
    stockTickersList = list_companies(browser)
    # Initialise the array which is going to store Stock class objects
    stockDataArray = []
    # For each ticker in the list, find the link to the respective summary page
    for stock in stockTickersList:
        stockData = scrape_company(browser, stock)
        stockDataArray.append(stockData)
    logger.info("Scraping complete")
finally:
    # Always close Chrome, even when scraping raised.
    browser.quit()
logger.info("Temporary files deleted")
shutil.rmtree(downloadDirectory)
# print_excel(stockDataArray)
# logger.info("Excel ready")
if DEBUG:
    # Report total and per-company timings when debugging.
    endTime = time()
    logger.info("That took a total of: " + str(round(endTime - startTime)) +
                " seconds.")
    logger.info(
        str(round((endTime - startTime) / COMPANIES)) +
        " seconds per company.")
def create_historical_dividends_csv_link(stockTicker):
    """Return the URL serving a company's historical-dividends CSV.

    NOTE(review): this duplicates an earlier definition of the same name
    in this module -- one of the two is redundant.

    Args:
        stockTicker (String): the company's ticker symbol.

    Returns:
        str: URL which holds the csv.
    """
    baseURL = ("https://companyresearch-nzx-com.ezproxy.aut.ac.nz"
               "/deep_ar/divhistory_csv.php?selection=")
    csvLink = baseURL + stockTicker
    logger.info("Pulling historical dividend data from: " + csvLink)
    return csvLink
def scrape_company(browser, stock):
    """ Contains the logic behind the scraping of an entire company's data

    Navigating to pages, downloading files. Ten sub-tasks are performed
    (summary, ratios, prices CSV, dividends CSV, annual report, tear sheet,
    directors, profile, financial profile, CSV ingestion) and the progress
    bar is advanced after each.

    Args:
        browser (Selenium.WebDriver): The automated Chrome browser
        stock (String): The stock ticker currently being scraped

    Returns:
        stockData (Stock): Class containing dictionaries of data
    """
    logger.info("Current Stock: " + stock)
    stockInnerIteration = 0
    numFuncs = 10  # total number of progress-bar steps
    printProgressBar(stockInnerIteration, numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    # Arrive at Summary & Ratios page and pull information
    browser.find_element_by_link_text(stock).click()
    summarySoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling ratio information")
    stockSummaryDict = get_stock_summary(summarySoup)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration, numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    stockRatioDict = get_ratios(summarySoup)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration, numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    # Create csv link for historical prices and pull it into a temporary folder
    csvLink = create_historical_prices_csv_link(stock)
    logger.info("Pulling historical prices information")
    browser.get(csvLink)
    # Create csv link for dividends and pull it into a temporary folder
    csvLink = create_historical_dividends_csv_link(stock)
    logger.info("Pulling historical dividends information")
    browser.get(csvLink)
    # Arrive at Annual Reports and pull latest annual report
    # TODO May require refactor of xpath to shorten it (Looks nicer)
    # TODO change dl directory outside temp
    # Try the current year's report first; if the server answers
    # "404 Not Found", fall back to the previous year's report.
    try:
        logger.info("Pulling annual report")
        year = int(datetime.now().strftime('%Y'))
        annualReportLink = create_annual_report_link(stock, str(year))
        browser.get(annualReportLink)
        if browser.find_element_by_xpath(
                ".//title[contains(text(), '404 Not Found')]"):
            browser.execute_script(
                "window.history.go(-1)")  # Go back to summary page
            annualReportLink = create_annual_report_link(stock, str(year - 1))
            browser.get(annualReportLink)
            if browser.find_element_by_xpath(
                    ".//title[contains(text(), '404 Not Found')]"):
                browser.execute_script(
                    "window.history.go(-1)")  # Go back to summary page
    except:
        # NOTE(review): bare except silently swallows every error here,
        # including WebDriver failures -- consider narrowing it to
        # NoSuchElementException.
        pass
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration, numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    # browser.execute_script("window.history.go(-1)")  # Go back to summary page
    # Create and get the tear sheet for the company
    tearSheetLink = 'https://companyresearch-nzx-com.ezproxy.aut.ac.nz/tearsheets/' + stock + '.pdf'
    browser.get(tearSheetLink)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration, numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    # Arrive at Company Directory and pull directors information
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Company Directory')]").click()
    directorSoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling Director's information")
    stockDirectorDict = get_director_information(directorSoup)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration, numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    browser.execute_script("window.history.go(-1)")  # Go back to summary page
    # Arrive at Company Profile and pull description information
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Company Profile')]").click()
    profileSoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling company description")
    stockProfileDict = get_company_profile(profileSoup)
    logger.debug(stockProfileDict)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration, numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    browser.execute_script("window.history.go(-1)")  # Go back to summary page
    # Arrive at Financial Profile and pull debt-equity information
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Financial Profile')]").click()
    stockSoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling financial profile information")
    stockFinancialProfileDict = get_financial_profile(stockSoup)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration, numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    browser.execute_script("window.history.go(-1)")  # Go back to summary page
    # Read in the prices csv (downloaded earlier into the temp directory)
    stockHistoricalPricesDict = get_stock_historical_prices(
        tempDirectory + stock + " Historical Prices.csv")
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration, numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    # Read in dividends csv
    stockHistoricalDividendsDict = get_stock_historical_dividends(
        tempDirectory + stock + " Historical Dividends.csv")
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration, numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    # Go back to the stock ticker page
    logger.info("Back to company listings")
    browser.execute_script("window.history.go(-1)")
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration, numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    # Create the stock obj and store it in an array
    # (plain dict keyed by section name; consumed by save_data/score_companies)
    stockData = {
        'Summary': stockSummaryDict,
        'Ratio': stockRatioDict,
        'HistoricalPrices': stockHistoricalPricesDict,
        'HistoricalDividends': stockHistoricalDividendsDict,
        'FinancialProfile': stockFinancialProfileDict,
        'Profile': stockProfileDict,
        'Directors': stockDirectorDict
    }
    return stockData
def list_companies(browser): """ Creates a list which will be used to iterate through selected companies Args: browser (Selenium.WebDriver): The automated Chrome browser Returns: stockNames (List): list of company tickers to be scraped """ # Login browser.find_element_by_xpath('//*[@id="username"]').send_keys(username) browser.find_element_by_xpath('//*[@id="password"]').send_keys(password) browser.find_element_by_xpath('//*[@id="login"]/section[4]/button').click() logger.info("Logged into NZX System") # Arrive at Market Activity Page browser.find_element_by_xpath( ".//a[contains(text(), 'Company Research')]").click() logger.info("Arrived at Market Activity Page") # Click "View all" for main market browser.find_elements_by_xpath( ".//a[contains(text(), 'view all')]")[0].click() logger.info("Arrived at Market Overview Page") # Sort in descending order by clicking the 26th "a" tag browser.find_elements_by_css_selector('td > a')[25].click() logger.info( "Arrived at Market Overview sorted by marketcap in descending order") # Parse the page source into BeautifulSoup # The page is the list of stocks in Descending order of Market Cap html = browser.page_source htmlSoup = BeautifulSoup(html, 'lxml') logger.info("Market Overview Page parsed") # Put all the stock tickers into a list stocksSoup = htmlSoup.find_all('a', {'class': 'text'}, limit=COMPANIES) stockNames = [] for stock in stocksSoup: stockNames.append(stock.getText()) logger.info("List of companies to scrape finalised") return stockNames
def save_data():
    """Log a completion message.

    WARNING(review): this zero-argument stub shares its name with the
    earlier save_data(stockDataArray, success); whichever is defined last
    in the module shadows the other. Confirm this is not a leftover and
    remove or rename it.
    """
    logger.info("Im free")