class CongressCache: def __init__(self): self.db = SqlExecutor() def store_congress_session(self, session_num, start_date, end_date): sql = 'INSERT INTO `CONGRESSIONAL_SESSION` (NUMBER, START_DATE, END_DATE) ' \ 'VALUES(?, ?, ?)' self.db.exec_insert(sql, (session_num, start_date, end_date)) def get_all_congress_session(self): sql = 'SELECT * FROM `CONGRESSIONAL_SESSION`' result = self.db.exec_select(sql).fetchall() return result # Check if congressional_session indeed has information about a specific session def get_congress_session(self, session_num): sql = 'SELECT * FROM `CONGRESSIONAL_SESSION` WHERE NUMBER=?' result = self.db.exec_select(sql, (session_num, )).fetchone() return result
class CommitteeCache: def __init__(self): self.db = SqlExecutor() def store_committee_data(self, comm_name, comm_id, year_created, comm_purpose): sql = 'INSERT INTO `COMMITTEE` (NAME, ID, YEAR_CREATED, PURPOSE) VALUES(?, ?, ?, ?)' self.db.exec_insert(sql, (comm_name, comm_id, year_created, comm_purpose)) def get_all_committee_data(self): sql = 'SELECT * FROM `COMMITTEE`' result = self.db.exec_select(sql).fetchall() return result def get_committee_data_using_name(self, name): sql = 'SELECT * FROM `COMMITTEE` WHERE NAME=?' result = self.db.exec_select(sql, (name, )).fetchall() return result def get_committee_data_using_id(self, comm_id): sql = 'SELECT * FROM `COMMITTEE` WHERE ID=?' result = self.db.exec_select(sql, (comm_id, )).fetchall() return result def store_memberships(self, mem_id, comm_id, session_num): sql = 'INSERT INTO `COMMITTEE_MEMBERSHIP` (MEMBER_ID, COMMITTEE_ID, SESSION) VALUES(?, ?, ?)' self.db.exec_insert(sql, (mem_id, comm_id, session_num)) def get_all_current_committee_members_comm_id(self, comm_id, session_num): sql = 'SELECT * FROM `COMMITTEE_MEMBERSHIP` INNER JOIN `MEMBER` ON `COMMITTEE_MEMBERSHIP`.MEMBER_ID=`MEMBER`.ID ' \ 'WHERE COMMITTEE_ID=?' result = self.db.exec_select(sql, (comm_id, session_num)).fetchall() return result def get_all_committees_of_member(self, mem_id, session_num): sql = 'SELECT * FROM `COMMITTEE_MEMBERSHIP` INNER JOIN `COMMITTEE` ' \ 'ON `COMMITTEE_MEMBERSHIP`.COMMITTEE_ID=`COMMITTEE`.ID ' \ 'WHERE MEMBER_ID=?' result = self.db.exec_select(sql, (mem_id, session_num)).fetchall() return result
class PresidentCache: def __init__(self): self.db = SqlExecutor() def store_president(self, number, name, birth_year, death_year, took_office, left_office, party): sql = 'INSERT INTO `PRESIDENT` (NUMBER, NAME, BIRTH_YEAR, DEATH_YEAR, TOOK_OFFICE, LEFT_OFFICE, PARTY)' \ 'VALUES(?, ?, ?, ?, ?, ?, ?)' self.db.exec_insert(sql, (number, name, birth_year, death_year, took_office, left_office, party)) def get_president_by_num(self, number): sql = 'SELECT * FROM `PRESIDENT` WHERE NUMBER=?' result = self.db.exec_select(sql, (number, )).fetchone() return result def store_president_session(self, session_number, president_number): sql = 'INSERT INTO `PRESIDENT_TO_SESSION` (SESSION_NUMBER, PRESIDENT_NUMBER) VALUES(?, ?)' self.db.exec_insert(sql, (session_number, president_number)) def get_president_from_session(self, session_number): sql = 'SELECT * FROM `PRESIDENT_TO_SESSION` INNER JOIN `PRESIDENT` ON ' \ '`PRESIDENT_TO_SESSION`.PRESIDENT_NUMBER=`PRESIDENT`.NUMBER ' \ 'WHERE SESSION_NUMBER=?' result = self.db.exec_select(sql, (session_number, )).fetchall() return result def get_session_from_president(self, president_number): sql = 'SELECT * FROM `PRESIDENT_TO_SESSION` INNER JOIN `CONGRESSIONAL_SESSION` ON ' \ '`PRESIDENT_TO_SESSION`.SESSION_NUMBER=`CONGRESSIONAL_SESSION`.NUMBER ' \ 'WHERE PRESIDENT_NUMBER=?' result = self.db.exec_select(sql, (president_number, )).fetchall() return result def check_president_session_cache(self, president_number, session_number): sql = 'SELECT * FROM `PRESIDENT_TO_SESSION` WHERE PRESIDENT_NUMBER=? AND SESSION_NUMBER=?' result = self.db.exec_select( sql, (president_number, session_number)).fetchone() return result
class AMemberCache: _member_table = None _session_table = None _db = None def _setup(self, member_table, session_table): self._member_table = member_table self._session_table = session_table self._db = SqlExecutor() def store_member_to_session(self, session_num, member_id): sql = 'INSERT INTO `%s` (SESSION_NUMBER, MEMBER_ID) VALUES(?, ?)' % self._session_table self._db.exec_insert(sql, (session_num, member_id)) def get_all(self): sql = 'SELECT * FROM `%s`' % self._member_table result = self._db.exec_select(sql).fetchall() return result def get_by_id(self, member_id): sql = 'SELECT * FROM `%s` WHERE ID=?' % self._member_table result = self._db.exec_select(sql, (member_id, )).fetchone() return result def get_by_last_name(self, member_last_name): sql = 'SELECT * FROM `%s` WHERE LAST_NAME=?' % self._member_table result = self._db.exec_select(sql, (member_last_name, )).fetchall() return result def get_by_first_name(self, member_first_name): sql = 'SELECT * FROM `%s` WHERE FIRST_NAME=?' % self._member_table result = self._db.exec_select(sql, (member_first_name, )).fetchall() return result def get_by_district(self, district): sql = 'SELECT * FROM `%s` WHERE DISTRICT=?' % self._member_table result = self._db.exec_select(sql, (district, )).fetchall() return result def get_from_session(self, session_number): sql = 'SELECT * FROM `' + self._session_table + '` INNER JOIN `' + self._member_table + '` ON ' \ '`' + self._session_table + '`.MEMBER_ID=`' + self._member_table + '`.ID ' \ 'WHERE SESSION_NUMBER=?' result = self._db.exec_select(sql, (session_number, )).fetchall() return result def get_session_by_id(self, member_id): sql = 'SELECT * FROM `' + self._session_table + '` INNER JOIN `CONGRESSIONAL_SESSION` ON ' \ '`' + self._session_table + '`.SESSION_NUMBER=`CONGRESSIONAL_SESSION`.NUMBER ' \ 'WHERE MEMBER_ID=?' result = self._db.exec_select(sql, (member_id, )).fetchall() return result
class APIUsageTracker: def __init__(self): self._executor = SqlExecutor() def get_use_count(self, key): api_key_hash = self.hash_key(key) result = self._executor.exec_select('SELECT * FROM `API_META` WHERE API_KEY=?', (api_key_hash,)).fetchone() if result is None: return None else: # if the data is not from today, delete it if datetime.datetime.strptime(result[2], '%Y-%m-%d').day != datetime.date.today().day: self.clear_metadata_for_key(key) return None return result[1] def update_use_count(self, key, value): api_key_hash = self.hash_key(key) result = self.get_use_count(key) if result is None: self._executor.exec_insert('INSERT INTO `API_META` (API_KEY, USAGES) VALUES (?, ?)', (api_key_hash, value)) else: self._executor.exec_insert('UPDATE `API_META` SET USAGES=? WHERE API_KEY=?', (value, api_key_hash)) return value def clear_metadata_for_key(self, key): api_key_hash = self.hash_key(key) self._executor.exec_insert('DELETE FROM `API_META` WHERE API_KEY=?', (api_key_hash,)) @staticmethod def hash_key(key): key = key.encode('utf-8') m = hashlib.sha256() m.update(key) return m.hexdigest()
class AVCache: def __init__(self): self.db = SqlExecutor() def store_result(self, ticker, req_class, payload): sql = 'INSERT INTO `AV_CACHE` (TICKER, CLASS, PAYLOAD) VALUES (?, ?, ?)' self.db.exec_insert(sql, (ticker, req_class, payload)) def check_cache(self, ticker, req_class, force_reload=False): sql = 'SELECT * FROM `AV_CACHE` WHERE TICKER=? and CLASS=?' result = self.db.exec_select(sql, (ticker, req_class)).fetchone() if result is None: return None # if the data is not from today, delete it and return none if datetime.datetime.strptime(result[3], "%Y-%m-%d %H:%M:%S").day != datetime.date.today().day or force_reload: self.delete_old(ticker, req_class) return None return {'ticker': result[0], 'payload': result[2]} def delete_old(self, ticker, req_class): sql = 'DELETE FROM `AV_CACHE` WHERE TICKER=? and CLASS=?' self.db.exec_insert(sql, (ticker, req_class))
class HAVCache: def __init__(self): self.db = SqlExecutor(db_name='gpp-long-term.db') # check if we already have cached data for the provided date def has_data_for_date(self, ticker, date, no_update_if_today=False): found_date = self.get_last_retrieved(ticker) # if no found date, then it isn't in the cache at all if found_date is None: return False # found_date is saturday or sunday and is today, don't update cache if DateUtil.dates_match(date, found_date) and DateUtil.is_weekend( date) and DateUtil.is_today(date): return True # if the date is today and it isn't the weekend, we need to update our cache always if DateUtil.is_today(date) and not no_update_if_today: return False # if the date in the metadata is greater than the requested date # we already have data for this date, otherwise we need to go get it return found_date > date or (no_update_if_today and DateUtil.is_today(date)) def store_result_meta_data(self, ticker, last_retrieved): found = self.get_last_retrieved(ticker) # if there's already a metadata record, just update it if found is not None: sql = 'UPDATE `HISTORIC_META_DATA` SET LAST_RETRIEVED=? WHERE TICKER=?' self.db.exec_insert(sql, (last_retrieved, ticker)) else: sql = 'INSERT INTO `HISTORIC_META_DATA` (TICKER, LAST_RETRIEVED) VALUES (?, ?)' self.db.exec_insert(sql, (ticker, last_retrieved)) def store_result_data(self, ticker, date, payload): sql = 'INSERT INTO `HISTORIC_DATA` (TICKER, DATE, OPEN, HIGH, LOW, CLOSE, VOLUME) ' \ 'VALUES(?, ?, ?, ?, ?, ?, ?)' # check to make sure we're not overwriting something data = self.get_daily_quote(ticker, date) if data is not None: self.db.exec_insert( 'DELETE FROM `HISTORIC_DATA` WHERE `TICKER`=? AND `DATE`=?', (ticker, date)) to_send = (ticker, date) for item in payload: to_send = to_send + (item, ) self.db.exec_insert(sql, to_send) # Checks whether specific date is actually in the cache def check_cache(self, ticker, date): # don't try the DB before we know if the data will be there if not self.has_data_for_date(ticker, date): return None result = self.get_daily_quote(ticker, date) if result is None: return None return { 'ticker': result[0], 'date': result[1], 'open': result[2], 'high': result[3], 'low': result[4], 'close': result[5], 'volume': result[6] } def get_last_retrieved(self, ticker): sql = 'SELECT * FROM `HISTORIC_META_DATA` WHERE TICKER=?' result = self.db.exec_select(sql, (ticker, )).fetchone() if result is None: return None found_timestamp = result[1] return found_timestamp def get_all_data(self, ticker): sql = 'SELECT * FROM `HISTORIC_DATA` WHERE TICKER=?' result = self.db.exec_select(sql, (ticker, )).fetchall() return result def get_rolling_window_quotes(self, ticker, end_date, num_desired): if not self.has_data_for_date( ticker, end_date, no_update_if_today=True): return None sql = 'SELECT * FROM `HISTORIC_DATA` WHERE TICKER=? AND DATE <= ? ORDER BY DATE DESC LIMIT ?' result = self.db.exec_select( sql, (ticker, end_date, num_desired)).fetchall() return result def get_daily_quote(self, ticker, date): sql = 'SELECT * FROM `HISTORIC_DATA` WHERE TICKER=? AND DATE=?' result = self.db.exec_select(sql, (ticker, date)).fetchone() return result def flush(self, ticker): sql = 'DELETE FROM `HISTORIC_DATA` WHERE TICKER=?' self.db.exec_insert(sql, (ticker, ))
from db.SqlExecutor import SqlExecutor # TRANSCRIBE_FILE = '../../data/tickers/compiled/all_tickers.txt' TRANSCRIBE_FILE = '../../data/tickers/raw/etfs/etfs.txt' INSERT_SQL = "INSERT INTO `COMPANY` (TICKER) VALUES (?);" executor = SqlExecutor(debug=True) with open(TRANSCRIBE_FILE) as all_tickers: for ticker_text in all_tickers: ticker = ticker_text.strip() result = executor.exec_select( "SELECT COUNT(*) AS ROW_COUNT FROM `COMPANY` WHERE `TICKER`=?", (ticker, )) if result.fetchone()[0] == 1: print("Already found record for ticker %s, continuing..." % (ticker, )) continue print(ticker) executor.exec_insert(INSERT_SQL, (ticker, )) rows = executor.exec_select('SELECT COUNT(*) AS rowcount FROM COMPANY;') for row in rows: print(row[0])
from db.SqlExecutor import SqlExecutor INSERT_SQL = "INSERT INTO `COMPANY` (TICKER) VALUES ('$$REP$$');" executor = SqlExecutor(debug=True) with open('../../data/tickers/compiled/all_tickers.txt') as all_tickers: for ticker_text in all_tickers: ticker = ticker_text.strip() sql = INSERT_SQL.replace('$$REP$$', ticker) executor.exec_insert(sql) rows = executor.exec_select('SELECT COUNT(*) AS rowcount FROM COMPANY;') for row in rows: print(row[0])
class BillCache: def __init__(self): self.db = SqlExecutor() self.congress_cache = CongressCache() def store_keyword_to_bill(self, key_word, bill_id, weight): sql = 'INSERT INTO `BILL_KEYWORDS` (KEYWORD, BILL_ID, WEIGHT) VALUES (?, ?, ?)' self.db.exec_insert(sql, (key_word, bill_id, weight)) def store_bill(self, id, title, short_title, congress_session, intro_date, bill_url, active, enacted, vetoed, summary, latest_major_action): sql = 'INSERT INTO `BILL` (ID, TITLE, SHORT_TITLE,' \ 'CONGRESS_SESSION, INTRODUCED_DATE, BILL_URL,' \ 'ACTIVE, ENACTED, VETOED, SUMMARY, LATEST_MAJOR_ACTION) VALUES (?, ?, ?, ?, ?, ' \ '?, ?, ?, ?, ?, ?)' self.db.exec_insert( sql, (id, title, short_title, congress_session, intro_date, bill_url, active, enacted, vetoed, summary, latest_major_action)) # Note: make sure data is in tuple form def get_all_bills(self): sql = 'SELECT * FROM `BILL`' result = self.db.exec_select(sql).fetchall() return result def get_bill_from_session(self, session): sql = 'SELECT * FROM `BILL` WHERE CONGRESS_SESSION=?' result = self.db.exec_select(sql, (session, )).fetchall() return result def get_bill_from_bill_id(self, bill_id): sql = 'SELECT * FROM `BILL` WHERE ID=?' result = self.db.exec_select(sql, (bill_id, )).fetchone() return result def get_all_cosponsors_for_bill(self, bill_id): sql = 'SELECT * FROM `COSPONSOR` INNER JOIN `MEMBER` ON `COSPONSOR`.MEMBER_ID=`MEMBER`.ID WHERE BILL_ID=?' result = self.db.exec_select(sql, (bill_id, )).fetchall() return result def get_all_cosponsored_bills_for_member(self, member_id): sql = 'SELECT * FROM `COSPONSOR` INNER JOIN `BILL` ON `COSPONSOR`.BILL_ID=`BILL`.ID WHERE MEMBER_ID=?' result = self.db.exec_select(sql, (member_id, )).fetchall() return result def get_bills_from_keyword(self, keyword): sql = 'SELECT * FROM `BILL_KEYWORDS` WHERE KEYWORD=?' result = self.db.exec_select(sql, (keyword, )).fetchall() return result def get_top_bills_from_keywords(self, keywords): bill_keyword = {} for keyword in keywords: result_list = self.get_bills_from_keyword(keyword) for result_dict in result_list: bill_id = result_dict.get('BILL_ID') weight = result_dict.get('WEIGHT') if bill_id not in bill_keyword: bill_keyword.update({bill_id: weight}) else: old_weight = bill_keyword.get(bill_id) bill_keyword.update({bill_id: weight + old_weight}) return sorted(bill_keyword.items(), key=lambda x: x[1], reverse=True) def get_all_keywords(self): sql = 'SELECT * FROM `BILL_KEYWORDS`' result = self.db.exec_select(sql).fetchall() return result def check_cache(self, bill_id): session_num = int(bill_id[len(bill_id) - 3:len(bill_id)]) result = self.congress_cache.get_congress_session(session_num) if result is not None: return self.get_bill(bill_id) return None # If no metadata found due to congressional_session, then bill is not there
def get_ticker(ticker): executor = SqlExecutor() result = executor.exec_select('SELECT * FROM COMPANY WHERE TICKER=?', (ticker, )) return result.fetchone()
def get_n_stocks(n=5): executor = SqlExecutor() result = executor.exec_select( 'SELECT * FROM COMPANY WHERE CLASS=? AND NAME IS NOT NULL LIMIT ?', (Ticker.STOCK, n)) return result.fetchall()
def get_ticker_with_class(ticker, fd_class): executor = SqlExecutor() result = executor.exec_select( 'SELECT * FROM COMPANY WHERE TICKER=? AND CLASS=?', (ticker, fd_class)) return result.fetchone()
class MarketWatchScraper: STOCK_URL_PATTERN = 'https://www.marketwatch.com/investing/stock/$$TICKER$$/profile' FUND_URL_PATTERN = 'https://www.marketwatch.com/investing/fund/$$TICKER$$' TICKER_PATTERN = '$$TICKER$$' DESCRIPTION_PATTERN = '$$DESC$$' def __init__(self): self.executor = SqlExecutor() @staticmethod def _curl_for_html(url): # slow down crawls to prevent lockout by marketwatch time.sleep(0.5) html_out = os.popen('curl -s "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (K HTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36" ' + url) return html_out.read() @staticmethod def _get_name(body, name_locator, ticker, resource_type): if '<div class="important">There were no matches found for ' in body: print("No data found at all for %s ticker %s, skipping..." % (resource_type, ticker)) return False name = HTMLParser.grep_for(body, name_locator) if name == '': print("No name found for %s ticker %s, skipping..." % (resource_type, ticker)) return False return name def update_stock_metadata(self, ticker): update_sql_template = \ "UPDATE `COMPANY` SET NAME=?, DESCRIPTION=?, INDUSTRY=?, SECTOR=?, REVENUE=?, NET_INCOME=?, EMPLOYEES=?, " \ "RESOURCE_URL=?, CLASS='STOCK' WHERE TICKER='$$TICKER$$';" resource_url = MarketWatchScraper.STOCK_URL_PATTERN.replace(MarketWatchScraper.TICKER_PATTERN, ticker) raw_html = MarketWatchScraper._curl_for_html(resource_url) name = MarketWatchScraper._get_name(raw_html, 'id="instrumentname"', ticker, 'stock') if not name: return False description = HTMLParser.grep_for(raw_html, 'div class="full"', 2) industry = HTMLParser.grep_for(raw_html, '>Industry<', 1) sector = HTMLParser.grep_for(raw_html, 'Sector', 1) employees = HTMLParser.grep_for(raw_html, 'Employees', 2) net_income = HTMLParser.grep_for(raw_html, 'Net Income', 1) revenue = HTMLParser.grep_for(raw_html, '>Revenue<', 1) sql = update_sql_template.replace(MarketWatchScraper.TICKER_PATTERN, ticker) self.executor.exec_insert(sql, (name, description, industry, sector, revenue, net_income, employees, resource_url)) print('%s: %s - found and loaded!' % (ticker, name)) return True def update_fund_metadata(self, ticker): update_sql_template = \ "UPDATE `COMPANY` SET NAME=?, DESCRIPTION=?, RESOURCE_URL=?, CLASS='FUND' WHERE TICKER='$$TICKER$$';" resource_url = MarketWatchScraper.FUND_URL_PATTERN.replace(MarketWatchScraper.TICKER_PATTERN, ticker) raw_html = MarketWatchScraper._curl_for_html(resource_url) name = MarketWatchScraper._get_name(raw_html, 'class="company__name"', ticker, 'fund') if not name: return False description = HTMLParser.grep_for(raw_html, 'class="description__text"', 2) sql = update_sql_template.replace(MarketWatchScraper.TICKER_PATTERN, ticker) self.executor.exec_insert(sql, (name, description, resource_url)) print('%s: %s - found and loaded!' % (ticker, name)) def update_all_tickers(self): rows = self.executor.exec_select('SELECT * FROM COMPANY WHERE NAME IS NULL') all_rows = rows.fetchall() for row in all_rows: success = self.update_stock_metadata(row[0]) if not success: self.update_fund_metadata(row[0])