def test_indicators_from_json(self):
    """Check that ``Indicators.from_json`` makes a new company queryable.

    The first payload carries a symbol and a name that are expected not to
    exist yet; afterwards a ``Company`` with that symbol must be found.
    """
    # Create new company
    symbol = "ZZZZZ"
    name = "Fake Name"
    d = {'symbol': symbol, "roe": 0.25, "fcf": 100.0, "name": name}
    Indicators.from_json(d)
    # filter_by() only builds a Query object, which is never None; fetch the
    # row with first() so the assertion fails when no company was created.
    c = Company.query.filter_by(symbol=symbol).first()
    self.assertIsNotNone(c)

    # use existing company
    # NOTE(review): this payload has no 'symbol' key and nothing is asserted
    # about the result -- confirm what from_json is expected to do here.
    d = {"roe": 0.25, "fcf": 100.0}
    Indicators.from_json(d)
def get_ratio_data():
    """Scrape Yahoo Finance key statistics and store them via ``Indicators``.

    Ticker symbols are taken from the first comma-separated field of every
    non-empty line of the ``10_stocks`` file.  For each symbol the
    key-statistics page is rendered in PhantomJS, the rendered page is dumped
    to ``<symbol>.out``, the configured indicators are extracted from the
    markup and the result is committed through ``db.session``.

    ``db``, ``IntegrityError`` and ``UnmappedInstanceError`` are expected to
    be available at module level.
    """
    import re
    import time

    from bs4 import BeautifulSoup
    from selenium.webdriver import PhantomJS

    from app.models import Indicators
    from app.utils import cash_to_float, depercentize

    # Per indicator: element attribute to match, attribute value pattern to
    # look for, and an optional transform applied to the element text.
    indicators = {
        'roe': {
            'attribute': 'data-reactid',
            'value': re.compile(r".*RETURN_ON_EQUITY\.1$"),
            'transform': depercentize,
        },
        'fcf': {
            'attribute': 'data-reactid',
            'value': re.compile(r".*LEVERED_FREE_CASH_FLOW\.1$"),
            'transform': cash_to_float,
        },
    }

    with open("10_stocks", "r") as symbols_file:
        data = symbols_file.read()
    symbols = [line.split(",")[0] for line in data.split("\n") if line]

    print("Iterate through symbols")
    for symbol in symbols:
        print("{} Fetching {} :".format(time.strftime("%H:%M:%S"), symbol))

        driver = PhantomJS()
        try:
            driver.set_window_size(1120, 550)
            driver.get("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol))
            page_source = driver.page_source
        finally:
            # One browser is started per symbol; always shut it down so the
            # PhantomJS processes do not pile up.
            driver.quit()

        # Keep a copy of the rendered page next to the script for inspection.
        # The text is encoded explicitly, so the file is opened in binary mode.
        with open("{}.out".format(symbol), "wb") as dump:
            dump.write(page_source.encode('utf-8'))

        soup = BeautifulSoup(page_source, "lxml")
        d = {'symbol': symbol}
        for indicator, spec in indicators.items():
            matches = soup.find_all(attrs={spec['attribute']: spec['value']})
            print("{} {}".format(indicator, matches))
            transform = spec.get('transform')
            # When several elements match, the last one wins.
            for element in matches:
                if transform is not None:
                    d[indicator] = transform(element.text)
                else:
                    d[indicator] = element.text

        try:
            db.session.add(Indicators.from_json(d))
            db.session.commit()
        except (IntegrityError, UnmappedInstanceError) as e:
            # Roll back so the session stays usable for the next symbol.
            print("Caught {}".format(e))
            db.session.rollback()

        print("indicators {}".format(d))
def get_ratio_data():
    """Scrape Yahoo Finance key statistics for every known company.

    Symbols come from the ``Company`` table.  For each symbol a PhantomJS
    browser is started (retrying on ``URLError``), the key-statistics page is
    loaded, and once the return-on-equity element is present the configured
    indicators are extracted from the markup and committed through
    ``db.session``.  Symbols whose browser cannot be started, or whose page
    does not show the expected element within 10 seconds, are skipped.

    ``db``, ``PhantomJS``, ``URLError``, ``WebDriverWait``, ``EC``, ``By``,
    ``TimeoutException``, ``IntegrityError`` and ``UnmappedInstanceError`` are
    expected to be available at module level.
    """
    import re
    import time

    from bs4 import BeautifulSoup

    from app.models import Company, Indicators
    from app.utils import cash_to_float, depercentize

    # Per indicator: element attribute to match, attribute value pattern to
    # look for, and an optional transform applied to the element text.
    indicators = {
        'roe': {
            'attribute': 'data-reactid',
            'value': re.compile(r".*RETURN_ON_EQUITY\.1$"),
            'transform': depercentize,
        },
        'fcf': {
            'attribute': 'data-reactid',
            'value': re.compile(r".*LEVERED_FREE_CASH_FLOW\.1$"),
            'transform': cash_to_float,
        },
        'ev2ebitda': {
            'attribute': 'data-reactid',
            'value': re.compile(r".*ENTERPRISE_VALUE_TO_EBITDA\.1$"),
        },
    }

    # XPath 1.0 has no ends-with(), so the suffix test on data-reactid is
    # spelled out with substring()/string-length().
    roe_xpath = (
        "//*[substring(@data-reactid, "
        "string-length(@data-reactid) - string-length('RETURN_ON_EQUITY.1') +1)"
        " = 'RETURN_ON_EQUITY.1']"
    )

    retry_limit = 5

    companies = Company.query.with_entities(Company.symbol).all()
    symbols = [company[0] for company in companies]

    print("Iterate through symbols")
    for symbol in symbols:
        print("{} Fetching {} :".format(time.strftime("%H:%M:%S"), symbol))

        # Start the browser, backing off quadratically between failed
        # attempts.  Stop at the first success so only one browser is
        # launched per symbol.
        driver = None
        for attempt in range(retry_limit):
            try:
                driver = PhantomJS()
                break
            except URLError:
                time.sleep(attempt ** 2)
        if driver is None:
            print("Failed to start PhantomJS for {}, skipping".format(symbol))
            continue

        try:
            driver.set_window_size(1120, 550)
            driver.get("http://finance.yahoo.com/quote/{}/key-statistics?p={}".format(symbol, symbol))
            try:
                # Wait for the statistics to be rendered before reading the page.
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, roe_xpath)))
            except TimeoutException as e:
                print("Caught {}".format(e))
                print(driver.title)
                continue
            page_source = driver.page_source
        finally:
            # Runs on the timeout path as well, so no browser is left behind.
            driver.quit()

        soup = BeautifulSoup(page_source, "lxml")
        d = {'symbol': symbol}
        for indicator, spec in indicators.items():
            matches = soup.find_all(attrs={spec['attribute']: spec['value']})
            print("{} {}".format(indicator, matches))
            transform = spec.get('transform')
            # When several elements match, the last one wins.
            for element in matches:
                if transform is not None:
                    d[indicator] = transform(element.text)
                else:
                    d[indicator] = element.text

        try:
            db.session.add(Indicators.from_json(d))
            db.session.commit()
        except (IntegrityError, UnmappedInstanceError) as e:
            # Roll back so the session stays usable for the next symbol.
            print("Caught {}".format(e))
            db.session.rollback()

        print("indicators {}".format(d))