def train(self): LOGGER.info("Starting to train...") ticker_names = get_ticker_names(self.db, "TSX")[:10] random.shuffle(ticker_names) sdate = datetime.today().date() - timedelta(days=2*365) edate = sdate + timedelta(days=365) data, targets = [], [] for t in ticker_names: c = self.rc.calculate_return(t, sdate, edate) if c is None: continue d = self.ff.fetch(t, sdate, edate) d = list(itertools.chain.from_iterable(d)) targets.append(c) data.append(d) train_tickers = ceil(len(targets) * self.training_perc) self.training_data = data[:train_tickers] self.training_target = targets[:train_tickers] self.goal_data = data[train_tickers:] self.goal_target = targets[train_tickers:] LOGGER.info("Starting to train...") print("TEST", self.training_data, self.training_target) self.lr.fit(self.training_data, self.training_target)
def ticker_history(self, start, end, ticker, info='quote', start_dic={}):
    """
    Gets and returns the historic prices for a given ticker between the
    time period provided. Inclusive. `info` is one of 'quote', 'dividend'
    or 'split'.
    """
    start_str = start.strftime('%Y%m%d')
    end_str = end.strftime('%Y%m%d')
    try:
        data = yqd.load_yahoo_quote(ticker, start_str, end_str, info=info)
    except (HTTPError, URLError, gaierror) as e:
        LOGGER.info("Yahoo request failed ({}). Blocked?".format(e))
        return []
    titles = tuple(t.replace(' ', '_').lower() for t in data[0].split(','))
    history = []
    for row in data[1:-1]:
        history_row = {k: v for k, v in start_dic.items()}
        iter_list = row.split(',')
        for element, title in zip(iter_list, titles):
            converted = self.convert_yahoo_element(element)
            history_row[title] = converted
        history.append(history_row)
    return history
def train(self): LOGGER.info("Starting to train...") train_data, train_targets, test_data, test_targets = self.fh.fetch_feature_data( ) tickers = self.fh.fetch_feature_tickers() print("Shapes", train_data.shape, train_targets.shape, test_data.shape, test_targets.shape) features_names = self.fh.fetch_feature_names() self.kbest.fit(train_data, train_targets) feature_scores = self.kbest.scores_ combined = [ tuple((s, f)) for s, f in zip(feature_scores, features_names) if not np.isnan(s) ] best_features = sorted(combined, key=lambda x: -x[0]) scores, names = zip(*best_features) self.graph(scores, names) print('Best Features', scores[:5]) print('Worst Features', best_features[-5:])
def binary_train(self): LOGGER.info("Starting to train...") train_data, train_targets, test_data, test_targets = self.fh.fetch_binary_feature_data( ) train_tickers, test_tickers = self.fh.fetch_feature_tickers() scaler = StandardScaler() train_data = scaler.fit_transform(train_data) test_data = scaler.fit_transform(test_data) print("Shapes", train_data.shape, train_targets.shape, test_data.shape, test_targets.shape) self.mlpc.fit(train_data, train_targets) predictions = self.mlpc.predict(test_data) acc_score = accuracy_score(test_targets, predictions) roc_score = roc_auc_score(test_targets, predictions) print('???', accuracy_score(test_targets, [True] * len(test_targets))) print("Accuracy Score", acc_score, 'ROC Score', roc_score) print("Average True Return", sum(train_targets) / len(train_targets))
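# A possible refinement (a sketch, not part of the original code): ROC AUC is
# more informative when computed from probability scores rather than hard
# True/False predictions, since it is threshold-independent. MLPClassifier
# exposes predict_proba, so the evaluation above could alternatively use:
#
#     probabilities = self.mlpc.predict_proba(test_data)[:, 1]
#     roc_score = roc_auc_score(test_targets, probabilities)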
def get_historic_events(self):
    """
    Gets all the historical events from yahoo, updating only the new
    entries based on the date of the last fetch.
    """
    exchange = "TSX"
    listings = self.session.query(
        Listings.ticker,
        Listings.dateoflisting).filter(Listings.exchange == exchange)
    total_listings = listings.count()
    for counter, (ticker, listdate) in enumerate(listings):
        lastdate, = self.session.query(func.max(
            EventHistory.updatedate)).filter(
                EventHistory.exchange == exchange,
                EventHistory.ticker == ticker).first()
        startdate = listdate if lastdate is None else lastdate + timedelta(days=1)
        rows = []
        if startdate < self.today:
            yahoo_ticker = ticker + ".TO"
            dividend_dict = self.ticker_history(startdate, self.today,
                                                yahoo_ticker, info='dividend')
            split_dict = self.ticker_history(startdate, self.today,
                                             yahoo_ticker, info='split')
            for row in dividend_dict:
                rows.append([
                    exchange, ticker, row["date"], "DIVIDEND",
                    row["dividends"], self.today
                ])
            for row in split_dict:
                rows.append([
                    exchange, ticker, row["date"], "SPLIT",
                    row["stock_splits"], self.today
                ])
        if rows:
            LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                counter + 1, total_listings, ticker, startdate, self.today))
            stmt = insert(EventHistory).values(rows).on_conflict_do_nothing(
                constraint='event_history_pkey')
            self.session.execute(stmt)
            self.session.commit()
        else:
            LOGGER.info("{}/{} Skipping ticker {}".format(
                counter + 1, total_listings, ticker))
def fetch_all(self, exchange):
    extension = '.TO'
    # Use .is_(None) so SQLAlchemy emits "IS NULL"; a plain `is None` check
    # would be evaluated in Python and never match NULL rows.
    tickers = tuple(
        x.ticker + extension
        for x in self.session.query(Listings.ticker).filter(
            Listings.exchange == exchange,
            or_(Listings.onyahoo == True, Listings.onyahoo.is_(None))))
    ticker_groups = self.chunks(tickers, 200)
    LOGGER.info("Fetching/Updating {} urls.".format(len(ticker_groups)))
    for ticker_group in ticker_groups:
        url = self.create_url(ticker_group)
        self.handle_url(ticker_group, url, exchange)
        sleep(1)  # limit requests to 1/s
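# `self.chunks` is referenced above but not shown in this section. A minimal
# sketch of what such a helper might look like (an assumption about its
# behaviour, written as a plain function rather than the original method):
def chunks(seq, size):
    """Split `seq` into consecutive slices of at most `size` elements."""
    return [seq[i:i + size] for i in range(0, len(seq), size)]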
def binary_train(self): LOGGER.info("Starting to train...") train_data, train_targets, test_data, test_targets = self.fh.fetch_binary_feature_data() train_tickers, test_tickers = self.fh.fetch_feature_tickers() print("Shapes", train_data.shape, train_targets.shape, test_data.shape, test_targets.shape) self.lc.fit(train_data, train_targets) predictions = self.lc.predict(test_data) acc_score = accuracy_score(test_targets, predictions) roc_score = roc_auc_score(test_targets, predictions) print("Accuracy Score", acc_score, 'ROC Score', roc_score) print("Average True Return", sum(train_targets) / len(train_targets), sum(test_targets) / len(test_targets))
def get_historic_prices(self):
    """
    Gets all the historical prices from yahoo, updating only the new
    entries based on the date of the last fetch.
    """
    exchange = "TSX"
    listings = list(
        self.session.query(Listings.ticker, Listings.dateoflisting).filter(
            Listings.exchange == exchange))
    total_listings = len(listings)
    for counter, (ticker, listdate) in enumerate(listings):
        lastdate, = self.session.query(func.max(PriceHistory.date)).filter(
            PriceHistory.exchange == exchange,
            PriceHistory.ticker == ticker).first()
        startdate = listdate if lastdate is None else lastdate + timedelta(days=1)
        his_dict = []
        if startdate < self.today:
            yahoo_ticker = ticker + ".TO"
            start_dic = {"exchange": exchange, "ticker": ticker}
            his_dict = self.ticker_history(startdate, self.today,
                                           yahoo_ticker, info="quote",
                                           start_dic=start_dic)
        if his_dict:
            LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                counter, total_listings, ticker, startdate, self.today))
            for d in his_dict:
                stmt = insert(PriceHistory).values(d).on_conflict_do_update(
                    constraint='price_history_pkey', set_=d)
                self.session.execute(stmt)
                self.session.commit()
        else:
            LOGGER.info("{}/{} Skipping ticker {}".format(
                counter, total_listings, ticker))
def train(self): LOGGER.info("Starting to train...") train_data, train_targets, test_data, test_targets = self.fh.fetch_feature_data( ) tickers = self.fh.fetch_feature_tickers() print("Shapes", train_data.shape, train_targets.shape, test_data.shape, test_targets.shape) self.svm.fit(train_data, train_targets) predictions = self.svm.predict(test_data) mean_err = mean_absolute_error(test_targets, predictions) mean_s_err = mean_squared_error(test_targets, predictions) print("Got Mean error", mean_err, "Squared Err", mean_s_err) print("Average Expected Return", sum(predictions) / len(predictions)) print("Average True Return", sum(train_targets) / len(train_targets))
def go(self, urls):
    LOGGER.info("Preparing processes...")
    manager = Manager()
    urlQ = manager.Queue()
    for url in urls:
        urlQ.put(url)
    procs = [
        Process(target=get_html,
                args=(urlQ, self.callback, self.xpath_hooks),
                daemon=True) for i in range(self.nproc)
    ]
    LOGGER.info("Processes created. Fetching n' parsing!")
    for proc in procs:
        proc.start()
    for proc in procs:
        proc.join()
def convert_yahoo_element(self, element):
    converted = None
    try:
        converted = float(element)
    except ValueError:
        try:
            converted = datetime.strptime(element, "%Y-%m-%d")
        except ValueError:
            if element == 'null':
                converted = None
            elif '/' in element:
                try:
                    a, b = element.split('/')
                    converted = float(a) / float(b)
                except ValueError:
                    LOGGER.info("Unable to convert {}".format(element))
            else:
                LOGGER.info("Unable to convert {}".format(element))
    return converted
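# Illustrative conversions for convert_yahoo_element, derived from the
# branches above (a sketch, not exhaustive):
#   '1.25'       -> 1.25                    (plain float)
#   '2017-03-01' -> datetime(2017, 3, 1)    (ISO date)
#   'null'       -> None                    (Yahoo's missing-value marker)
#   '2/1'        -> 2.0                     (ratio, a divided by b)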
def fetch_all(self, exchange):
    q = self.session.query(Listings).filter(
        Listings.exchange == exchange,
        or_(Listings.onyahoo == True, Listings.onyahoo.is_(None)))
    extension = '.TO'
    urls = [
        "https://ca.finance.yahoo.com/quote/{}{}/key-statistics".format(
            l.ticker, extension) for l in q
    ]
    xpath_hooks = [
        "//section[@data-test='qsp-statistics']",
        "//section[@data-test='lookup-page']"
    ]
    LOGGER.info("Fetching/Updating {} urls.".format(len(urls)))
    jsps = JSPageScraper(self.dic_parse, xpath_hooks, "key_statistics")
    jsps.go(urls)
def get_key_stats(self, ticker, db_exchange="TSX"): """ This function get key statistics from Morning Star. """ url = ("http://financials.morningstar.com/ajax/exportKR2CSV.html?t={}&" "culture=en-CA®ion=CAN&order=asc&r={}").format( ticker, randint(1, 500000)) req = urllib.request.Request(url, headers=self.headers) resp = urllib.request.urlopen(req) csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8')) on_morningstar = csv_r and resp.headers['content-length'] != '0' if on_morningstar: LOGGER.info("Getting key statistics for {}... ".format(ticker)) else: LOGGER.info("Skipping", ticker) return 1 col_names, vals = self.parse_csv( csv_r, 10, self.special_key_titles, self.column_key_map, extra_cols=["ticker", "exchange", "update_date"], extra_vals=[ticker, db_exchange, self.today]) self.db.insert_into("ms_key_statistics", col_names, vals, unique_conflict=True) LOGGER.info("Done") return 0
def get_historic_prices(self):
    """
    Gets all the historical prices from yahoo, updating only the new
    entries based on the date of the last fetch.
    """
    exchange = "TSX"
    listings = self.db.select("ticker, dateoflisting", "listings",
                              where="exchange = %s", vals=[exchange])
    dict_fields = ["Adj Close", "High", "Close", "Open", "Low", "Date"]
    fields = ["exchange", "ticker"] + [x.lower() for x in dict_fields]
    total_listings = len(listings)
    for counter, (ticker, listdate) in enumerate(listings):
        lastdate = self.db.select("MAX(date)", "price_history", fetch="one",
                                  where="exchange = %s AND ticker = %s",
                                  vals=[exchange, ticker], unroll=True)
        startdate = listdate if lastdate is None else lastdate + timedelta(days=1)
        his_dict = []
        if startdate < self.today:
            yahoo_ticker = ticker + ".TO"
            his_dict = self.ticker_price_history(startdate, self.today,
                                                 yahoo_ticker)
        if his_dict:
            LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                counter, total_listings, ticker, startdate, self.today))
            rows = [[exchange, ticker] + [row[k] for k in dict_fields]
                    for row in his_dict]
            self.db.insert_into("price_history", fields, rows)
        else:
            LOGGER.info("{}/{} Skipping ticker {}".format(
                counter, total_listings, ticker))
def get_financial(self, ticker, period_name, exchange="XTSE"): """ This function get yearly and quartly information from Morning Star. period_name: "quarter" or "annual" exchanges: XTSE (TSX), """ # this converts the morning star exchange name to our database name if exchange in self.exchange_map: db_exchange = self.exchange_map[exchange] else: raise ValueError("Exchange unsupported {}".format(exchange)) period = 3 if period_name == "quarter" else 12 url = ( "http://financials.morningstar.com/ajax/ReportProcess4CSV.html?&t=" "{}:{}®ion=can&culture=en-US&cur=&reportType=is&period={}&" "dataType=A&order=desc&columnYear=5&curYearPart=1st5year&" "rounding=1&view=raw&r={}&denominatorView=raw&number=1").format( exchange, ticker, period, randint(1, 500000)) req = urllib.request.Request(url, headers=self.headers) resp = urllib.request.urlopen(req) csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8')) on_morningstar = csv_r and resp.headers['content-length'] != '0' if on_morningstar: LOGGER.info("Getting {} financial data for {}... ".format( period_name, ticker)) else: LOGGER.info("Skipping {}".format(ticker)) return 1 num_cols = 6 if period_name == "quarter" else 5 # skip last column if not quarter view (removes TTM) return_dics = self.parse_csv(csv_r, num_cols, self.special_financials_titles, self.column_financials_map, start_dic={ "ticker": ticker, "exchange": db_exchange, "period": period, "update_date": self.today }) for d in return_dics: stmt = insert(MorningStarFinancials).values( d).on_conflict_do_update(constraint='fiscal_year_unique', set_=d) self.session.execute(stmt) self.session.commit() LOGGER.info("Done") return 0
def screen_and_save_feature_data(self):
    train_ticker_names, test_ticker_names = self.fetch_feature_tickers()
    train_data, train_targets, test_data, test_targets = self.fetch_feature_data()
    # finds tickers with < 10M value
    tickers = set(find_small_cap_tickers(self.sess))
    # Drop rows with extreme returns (> 10) or small-cap tickers
    train_rm_indexes = []
    for i, (ticker, target) in enumerate(zip(train_ticker_names, train_targets)):
        if target > 10 or ticker in tickers:
            train_rm_indexes.append(i)
    test_rm_indexes = []
    for i, (ticker, target) in enumerate(zip(test_ticker_names, test_targets)):
        if target > 10 or ticker in tickers:
            test_rm_indexes.append(i)
    train_ticker_names = np.delete(train_ticker_names, train_rm_indexes, axis=0)
    train_data = np.delete(train_data, train_rm_indexes, axis=0)
    train_targets = np.delete(train_targets, train_rm_indexes, axis=0)
    test_ticker_names = np.delete(test_ticker_names, test_rm_indexes, axis=0)
    test_data = np.delete(test_data, test_rm_indexes, axis=0)
    test_targets = np.delete(test_targets, test_rm_indexes, axis=0)
    LOGGER.info("Saving file at: {}".format(self.file_path))
    np.savez(self.file_path,
             train_data=train_data,
             train_targets=train_targets,
             train_ticker_names=train_ticker_names,
             test_data=test_data,
             test_targets=test_targets,
             test_ticker_names=test_ticker_names)
def dic_parse(self, session, url, html):
    def innerHtml(ele):
        return ele.decode_contents(formatter="html")

    soup = BeautifulSoup(html, "lxml")
    ticker = self.url_ticker_pat.search(url).group(1)
    exchange = "TSX"
    on_yahoo = soup.find('section', attrs={'data-test': 'lookup-page'}) is None
    session.query(Listings).filter(Listings.exchange == exchange,
                                   Listings.ticker == ticker).update(
                                       {Listings.onyahoo: on_yahoo})
    if not on_yahoo:  # if quote not found, exit
        LOGGER.error("Failed to find quote for {} skipping".format(url))
        return
    div_test = soup.find('section', attrs={'data-test': 'qsp-statistics'})
    if div_test is None:
        LOGGER.error("Unknown error for {} skipping".format(url))
        return
    db_dic = {}
    for table in div_test.find_all('table'):
        for row in table.find_all('tr'):
            td_list = row.find_all('td')
            title = innerHtml(td_list[0].find('span'))
            val = (innerHtml(td_list[1]) if td_list[1].find('span') is None
                   else innerHtml(td_list[1].find('span')))
            if title in self.y_to_db_map:
                db_dic[self.y_to_db_map[title]] = self.parse_numeric(val)
    if db_dic:
        db_dic["ticker"] = ticker
        db_dic["exchange"] = exchange
        exists = session.query(KeyStatistics).filter_by(
            **db_dic).scalar() is not None
        if exists:
            LOGGER.info("Skipping {} due to prior existence".format(url))
        else:
            db_dic["update_date"] = self.today
            stmt = insert(KeyStatistics).values(db_dic).on_conflict_do_nothing(
                constraint='key_statistics_pkey')
            session.execute(stmt)
            session.commit()
            LOGGER.info("Done parsing {}".format(url))
    else:
        LOGGER.info("Skipping {}".format(url))
def dic_parse(self, db, url, html):
    def innerHtml(ele):
        return ele.decode_contents(formatter="html")

    soup = BeautifulSoup(html, "lxml")
    ticker = self.url_ticker_pat.search(url).group(1)
    exchange = "TSX"
    on_yahoo = soup.find('div', attrs={'data-test': 'unknown-quote'}) is None
    db.update("listings", ["onyahoo"], [on_yahoo],
              "exchange=%s AND ticker=%s", [exchange, ticker])
    if not on_yahoo:  # if quote not found, exit
        LOGGER.error("Failed to find quote for {} skipping".format(url))
        return
    div_test = soup.find('div', attrs={'data-test': 'qsp-statistics'})
    if div_test is None:
        LOGGER.error("Unknown error for {} skipping".format(url))
        return
    db_dic = {}
    for table in div_test.find_all('table'):
        for row in table.find_all('tr'):
            td_list = row.find_all('td')
            title = innerHtml(td_list[0].find('span'))
            val = (innerHtml(td_list[1]) if td_list[1].find('span') is None
                   else innerHtml(td_list[1].find('span')))
            if title in self.y_to_db_map:
                db_dic[self.y_to_db_map[title]] = self.parse_numeric(val)
    if db_dic:
        db_dic["ticker"] = ticker
        db_dic["exchange"] = exchange
        col_names, vals = list(db_dic.keys()), list(db_dic.values())
        where = db.create_conditional_string(col_names)
        if db.exists("key_statistics", where, vals):
            LOGGER.info("Skipping {} due to prior existence".format(url))
        else:
            col_names.append("update_date")
            vals.append(self.today)
            db.insert_into("key_statistics", col_names, vals, multiple=False)
            LOGGER.info("Done parsing {}".format(url))
    else:
        LOGGER.info("Skipping {}".format(url))
def handle_url(self, tickers, url, exchange):
    """
    Fetches the url and inserts the data into the appropriate cols in the DB.
    """
    LOGGER.info("Starting to add url: {} ...".format(url))
    req = urllib.request.Request(url, headers=self.headers)
    resp = urllib.request.urlopen(req)
    csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))
    for row, ticker in zip(csv_r, tickers):
        assert len(row) == len(self.url_flags)
        db_dic = {db_col: self.handle_csv_string(cell)
                  for cell, db_col in zip(row, self.db_entries)}
        onyahoo = any(v is not None for v in db_dic.values())
        self.session.query(Listings).filter(
            Listings.exchange == exchange,
            Listings.ticker == ticker).update({Listings.onyahoo: onyahoo})
        if not onyahoo:  # not found, skip
            LOGGER.error("Failed to find quote for {} skipping".format(ticker))
            continue
        db_dic["ticker"] = ticker
        db_dic["exchange"] = exchange
        exists = self.session.query(YahooKeyStatistics).filter_by(
            **db_dic).scalar() is not None
        if exists:
            LOGGER.info("Skipping {} due to prior existence".format(ticker))
            continue
        db_dic["update_date"] = self.today
        # Annoyingly enough, sqlalchemy doesn't allow PostgreSQL bulk inserts
        # when checking constraints, RIP performance
        stmt = insert(YahooKeyStatistics).values(db_dic).on_conflict_do_nothing(
            constraint='yahoo_key_statistics_pkey')
        self.session.execute(stmt)
        self.session.commit()
    LOGGER.info("Done url.")
def get_key_stats(self, ticker, db_exchange="TSX"): """ This function get key statistics from Morning Star. """ url = ("http://financials.morningstar.com/ajax/exportKR2CSV.html?t={}&" "culture=en-CA®ion=CAN&order=asc&r={}").format( ticker, randint(1, 500000)) req = urllib.request.Request(url, headers=self.headers) resp = urllib.request.urlopen(req) csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8')) on_morningstar = csv_r and resp.headers['content-length'] != '0' if on_morningstar: LOGGER.info("Getting key statistics for {}... ".format(ticker)) else: LOGGER.info("Skipping {}".format(ticker)) return 1 return_dics = self.parse_csv(csv_r, 10, self.special_key_titles, self.column_key_map, start_dic={ "ticker": ticker, "exchange": db_exchange, "update_date": self.today }) for d in return_dics: stmt = insert(MorningStarKeyStatistics).values( d).on_conflict_do_update(constraint='ms_key_statistics_pkey', set_=d) self.session.execute(stmt) self.session.commit() LOGGER.info("Done") return 0
def get_quotes(self):
    """
    Gets the tickers and related listing information from the TSX website's
    spreadsheet (hardcoded location) and inserts it into the database.
    """
    recent_date = self.db.select("MAX(updatedate)", "listings", fetch="one",
                                 unroll=True)
    if self.url.startswith("http"):
        req = create_url_request(self.url)
        self.url = urllib.request.urlopen(req)
    sheet = pd.read_excel(self.url, skiprows=5, header=1,
                          keep_default_na=False)
    sheet.fillna('', inplace=True)
    sheet.rename(columns=self.cleanse_str, inplace=True)
    file_date = self.find_date_in_list(list(sheet.columns.values))
    if recent_date is None or (file_date > recent_date):
        xlsx_dict = sheet.to_dict(orient="records")
        recent_date = file_date
        if self.cache_path:
            self.write_cache(recent_date, sheet)
    else:
        LOGGER.info("Already up to date")
        return
    row_names = [
        "updatedate", "ticker", "exchange", "name", "sector", "osshares",
        "dateoflisting", "listingtype", "volume", "value",
    ]
    all_excel_names = tuple(xlsx_dict[0].keys())
    base_wanted_excel_names = [
        "Root Ticker", "Exchange", "Name", "Sector", "O/S",
        "Date of TSX Listing", "Listing Type", "Volume YTD", "Value (C$)",
    ]
    types = [
        "str", "str", "str", "str", "int", "date", "str", "int", "int",
    ]
    # Match the Excel column headers against their expected prefixes
    wanted_excel_names = []
    for bxn in base_wanted_excel_names:
        for xn in all_excel_names:
            if xn.startswith(bxn):
                wanted_excel_names.append(xn)
                break
    num_cols = len(wanted_excel_names)
    table_name = "listings"
    values = []
    for row in xlsx_dict:
        value_list = [recent_date]
        for i in range(num_cols):
            excel_name = wanted_excel_names[i]
            val = row[excel_name]
            if types[i] == "date":
                val = datetime.strptime(str(val), "%Y%m%d")  # assume YYYYMMDD
            value_list.append(val)
        values.append(value_list)
    self.db.insert_into(table_name, row_names, values)
def generate_and_save_feature_data(self, independent=True):
    rc = ReturnCalculator()
    ticker_names = sorted(get_ms_ticker_names(self.sess, "TSX"))
    num_tickers = len(ticker_names)
    train_data, train_targets = [], []
    train_ticker_names = []
    test_data, test_targets = [], []
    test_ticker_names = []
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for i, t in enumerate(ticker_names, 1):
        LOGGER.info("[{:d}/{:d}] Working on {}...".format(i, num_tickers, t))
        dates = self.ff.ms_key_stats_date(self.sess, t)
        if len(dates) < 1:
            continue
        date_gap = dates[1] - dates[0] if len(dates) > 2 else timedelta(days=365)
        last_date = dates[-1]
        rows = self.ff.ms_key_stats_data(self.sess, t)
        if not independent:
            # Window sliding for time series: pair each row with the previous one
            empty_row = (None, ) * len(rows[0])
            new_rows = []
            for j in range(len(rows)):
                first_part = rows[j - 1] if j > 0 else empty_row
                second_part = rows[j]
                new_rows.append(first_part + second_part)
            rows = new_rows
        # Add the start date to the list of dates
        return_dates = [dates[0] - date_gap] + dates
        returns = rc.calculate_return_between_dates(t, return_dates)
        for row, date, ret in zip(rows, dates, returns):
            if ret is None:  # if the return dates are out of range
                continue
            if date == last_date:
                test_data.append(row)
                test_targets.append(ret)
                test_ticker_names.append(t)
            else:
                train_data.append(row)
                train_targets.append(ret)
                train_ticker_names.append(t)
    # Convert the python lists to numpy arrays and fill missing values
    train_data = np.array(train_data, dtype=np.float)
    imp = imp.fit(train_data)
    train_ticker_names = np.array(train_ticker_names, dtype=np.str)
    train_data = imp.transform(train_data)
    train_targets = np.array(train_targets, dtype=np.float)
    test_ticker_names = np.array(test_ticker_names, dtype=np.str)
    test_data = imp.transform(np.array(test_data, dtype=np.float))
    test_targets = np.array(test_targets, dtype=np.float)
    if not os.path.exists(self.dir_path):
        os.makedirs(self.dir_path)
    LOGGER.info("Saving file at: {}".format(self.file_path))
    np.savez(self.file_path,
             train_data=train_data,
             train_targets=train_targets,
             train_ticker_names=train_ticker_names,
             test_data=test_data,
             test_targets=test_targets,
             test_ticker_names=test_ticker_names)
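# Note: sklearn.preprocessing.Imputer (used above) was removed in
# scikit-learn 0.22. On newer versions, an equivalent setup (a sketch,
# assuming the same mean-imputation strategy) would be:
#
#     from sklearn.impute import SimpleImputer
#     imp = SimpleImputer(missing_values=np.nan, strategy='mean')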
def get_quotes(self):
    """
    Gets the tickers and related listing information from the TSX website's
    spreadsheet (hardcoded location) and inserts it into the database.
    """
    recent_date, = self.session.query(func.max(Listings.updatedate)).first()
    if self.url.startswith("http"):
        req = create_url_request(self.url)
        self.url = urllib.request.urlopen(req)
    sheet = pd.read_excel(self.url, skiprows=5, header=1,
                          keep_default_na=False)
    sheet.fillna('', inplace=True)
    sheet.rename(columns=self.cleanse_str, inplace=True)
    file_date = self.find_date_in_list(list(sheet.columns.values))
    if recent_date is None or (file_date > recent_date):
        xlsx_dict = sheet.to_dict(orient="records")
        recent_date = file_date
    else:
        LOGGER.info("Already up to date")
        return
    row_names = [
        "ticker", "exchange", "name", "sector", "osshares", "dateoflisting",
        "listingtype", "volume", "value",
    ]
    all_excel_names = tuple(xlsx_dict[0].keys())
    base_wanted_excel_names = [
        "Root Ticker", "Exchange", "Name", "Sector", "O/S",
        "Date of TSX Listing", "Listing Type", "Volume YTD", "Value (C$)",
    ]
    wanted_excel_names = []
    for bxn in base_wanted_excel_names:
        for xn in all_excel_names:
            if xn.startswith(bxn):
                wanted_excel_names.append(xn)
                break
    assert (len(base_wanted_excel_names) == len(wanted_excel_names) ==
            len(row_names))
    value_dics = []
    for row in xlsx_dict:
        value_dic = {"updatedate": recent_date}
        for excel_name, row_name in zip(wanted_excel_names, row_names):
            val = row[excel_name]
            if row_name == "dateoflisting":
                val = datetime.strptime(str(val), "%Y%m%d")  # assume YYYYMMDD
            if val == '':
                val = None
            value_dic[row_name] = val
        value_dics.append(value_dic)
    self.session.execute(insert(Listings).values(value_dics))
    self.session.commit()
def write_cache(self, date, sheet):
    os.makedirs(self.cache_path, exist_ok=True)
    json_name = date.strftime('TSX-%Y-%m-%d.json')
    full_path = os.path.join(self.cache_path, json_name)
    sheet.to_json(full_path, orient="records")
    LOGGER.info("Wrote file to {}".format(full_path))