Пример #1
0
class Features():
    """Feature extraction over the ms_financials table."""

    def __init__(self, normalize=True):
        """Cache the feature column names, excluding identifier columns."""
        self.db = Database()
        self.ms_financials_cols = self.db.get_col_names("ms_financials")
        # Bookkeeping columns that must never be used as model features.
        self.not_feature_cols = {'ticker', 'exchange', 'update_date', 'fiscal_year', 'period'}
        self.ms_financials_cols = [
            col for col in self.ms_financials_cols
            if col not in self.not_feature_cols
        ]
        self.ms_financials_cols_str = ','.join(self.ms_financials_cols)

    def fetch(self, ticker, lower_date, upper_date, period='3', where=""):
        """Return feature rows for *ticker* between the two fiscal years,
        ordered oldest-first."""
        cond = ("ticker = %s AND period = %s AND fiscal_year >= %s "
                "AND fiscal_year <= %s ORDER BY fiscal_year ASC")
        return self.db.select(self.ms_financials_cols_str,
                              "ms_financials",
                              where=cond,
                              vals=[ticker, period, lower_date, upper_date])
Пример #2
0
class Features():
    """Thin wrapper around the ms_financials table."""

    def __init__(self, normalize=True):
        self.db = Database()
        # Column names of ms_financials, used to label fetched rows.
        self.ms_financials_cols = self.db.get_col_names("ms_financials")

    def fetch(self, ticker):
        """Fetch a sample row from ms_financials and return it.

        Bug fix: the original fetched the rows into a local variable and
        silently discarded them; the result is now returned to the caller.
        """
        self.db.cursor.execute("SELECT * from ms_financials LIMIT 1")
        res = self.db.cursor.fetchall()
        return res
Пример #3
0
class ReturnCalcuator():
    """Computes simple price returns from the price_history table."""

    def __init__(self):
        self.db = Database()

    def next_business_day(self, ticker, date, before=True, fields="date"):
        """Return the trading row closest to *date* (on/before or on/after,
        depending on *before*) for *ticker*."""
        if before:
            comparator = "<="
        else:
            comparator = ">="
        clause = "ticker = %s AND date {} %s ORDER BY date DESC LIMIT 1".format(comparator)
        return self.db.select(fields,
                              "price_history",
                              where=clause,
                              vals=[ticker, date],
                              fetch='one')

    def calculate_return(self, ticker, start_date, end_date):
        """Return (end close / start close) - 1 over the snapped date range,
        or None when there is no usable price data."""
        first_traded = self.db.select("MIN(date)",
                                      "price_history",
                                      where="ticker = %s",
                                      vals=[ticker],
                                      fetch='one',
                                      unroll=True)
        last_traded = self.db.select("MAX(date)",
                                     "price_history",
                                     where="ticker = %s",
                                     vals=[ticker],
                                     fetch='one',
                                     unroll=True)

        # No price data at all, or the window starts before the listing.
        if first_traded is None or last_traded is None or start_date < first_traded:
            return None

        # Snap each endpoint to an actual trading day.
        start_date, start_close = self.next_business_day(
            ticker,
            start_date,
            fields='date, "adj close"',
            before=(start_date >= first_traded))
        end_date, end_close = self.next_business_day(
            ticker,
            end_date,
            fields='date, "adj close"',
            before=(end_date > last_traded))

        return end_close / start_close - 1
Пример #4
0
def get_html(urlQ, callback, xpath_hooks):
    """
    Consume URLs from the URL queue (urlQ), render each page in a headless
    webkit session, and call *callback* with the page source.

    xpath_hooks is a list of XPath expressions used to decide when a page
    has finished loading, see the docs for more details
    (e.g. ["//div[@data-test='whatever']"]).
    """
    svr = webkit_server.Server()
    svrconn = webkit_server.ServerConnection(server=svr)
    driver = dryscrape.driver.webkit.Driver(connection=svrconn)

    sess = dryscrape.Session(driver=driver)
    sess.set_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
    )
    sess.set_attribute("auto_load_images", False)  # images are never needed

    def valid_page_func():
        # The page counts as loaded once any hook element is present.
        return any(sess.at_xpath(xpath) for xpath in xpath_hooks)

    db = Database()

    while not urlQ.empty():
        url = urlQ.get()

        sess.visit(url)
        try:
            sess.wait_for(valid_page_func, interval=1, timeout=15)
        except dryscrape.mixins.WaitTimeoutError:
            # Bug fix: logging takes a %-style format string; the original
            # passed the URL as a stray positional arg, which produces a
            # logging formatting error instead of the message.
            LOGGER.error("Timeout so skipping %s", url)
            continue

        response = sess.body()
        callback(db, url, response)
        sess.reset()  # clear session state between pages

    svr.kill()
    db.destroy()
Пример #5
0
    def __init__(self):
        """Set up database access, the feature/return helpers and model state."""
        BaseBot.__init__(self)

        # Data access and feature-engineering helpers.
        self.db = Database()
        self.ff = Features()
        self.rc = ReturnCalcuator()

        # Model used for prediction.
        self.lr = LinearRegression()

        # Fraction of tickers used for training; the rest form the goal set.
        self.training_perc = 0.8
        self.training_tickers = []
        self.training_target = []

        self.goal_tickers = []
        self.goal_target = []
Пример #6
0
 def __init__(self):
     """Build the Yahoo-label -> DB-column mapping and the parsing helpers."""
     # Maps Yahoo Finance statistic labels to key_statistics column names.
     self.y_to_db_map = {
         'Forward P/E': 'forward_pe',
         'Return on Equity': 'ro_equity',
         'Current Ratio': 'current_ratio',
         'Total Debt': 'total_debt',
         'Forward Annual Dividend Rate': 'forward_annual_dividend_rate',
         'Last Split Date': 'last_split_date',
         'Market Cap (intraday)': 'market_cap',
         'EBITDA': 'ebitda',
         'Shares Short': 'shares_short',
         '50-Day Moving Average': 'fifty_day_moving_avg',
         '52 Week High': 'fifty_two_week_high',
         'Quarterly Earnings Growth': 'q_earnings_growth',
         'Forward Annual Dividend Yield': 'forward_annual_dividend_yield',
         'Beta': 'beta',
         'Payout Ratio': 'payout_ratio',
         'Avg Vol (3 month)': 'avg_vol_3_month',
         'Enterprise Value': 'enterprise_value',
         '5 Year Average Dividend Yield': 'five_year_avg_dividend_yield',
         'Enterprise Value/Revenue': 'enterprise_value_revenue',
         'Trailing P/E': 'trailing_pe',
         'Total Cash': 'total_cash',
         'Operating Cash Flow': 'operating_cash_flow',
         'Price/Book': 'price_book',
         'Fiscal Year Ends': 'fiscal_year_ends',
         'Total Debt/Equity': 'total_debt_equity',
         'Dividend Date': 'dividend_date',
         'Most Recent Quarter': 'most_recent_q',
         'Operating Margin': 'operating_margin',
         'Ex-Dividend Date': 'exdividend_date',
         '% Held by Institutions': 'perc_held_by_institutions',
         'Trailing Annual Dividend Yield': 'trailing_annual_dividend_yield',
         '200-Day Moving Average': 'two_hundred_day_moving_avg',
         '52 Week Low': 'fifty_two_week_low',
         'Avg Vol (10 day)': 'avg_vol_10_day',
         'Last Split Factor (new per old)': 'last_split_factor',
         '% Held by Insiders': 'perc_held_by_insiders',
         'Revenue Per Share': 'revenue_per_share',
         'Short Ratio': 'short_ratio',
         'Shares Short (prior month)': 'shares_short_prior_month',
         'Short % of Float': 'short_perc_float',
         'Profit Margin': 'profit_margin',
         'Return on Assets': 'ro_assets',
         'Price/Sales': 'price_sales',
         'Gross Profit': 'gross_profit',
         'Book Value Per Share': 'book_value_per_share',
         'Levered Free Cash Flow': 'levered_free_cash_flow',
         'Trailing Annual Dividend Rate': 'trailing_annual_dividend_rate',
         'Diluted EPS': 'diluted_eps',
         'PEG Ratio (5 yr expected)': 'peg_ratio_5yr',
         'Shares Outstanding': 'shares_outstanding',
         'Revenue': 'revenue',
         'Float': 'float',
         'Net Income Avi to Common': 'net_income_avi_common',
         'Enterprise Value/EBITDA': 'enterprise_value_ebitda',
         '52-Week Change': 'fifty_two_week_change',
         'Quarterly Revenue Growth': 'q_revenue_growth',
         'Total Cash Per Share': 'total_cash_ps'
     }
     # Suffix multipliers for condensed numbers such as "1.5M".
     self.convert_dict = {'K': 10**3, 'M': 10**6, 'B': 10**9, 'T': 10**12}
     # Bug fix: raw strings — the original non-raw patterns contain invalid
     # escape sequences ("\d", "\(", "\.") which raise DeprecationWarning /
     # SyntaxWarning on modern Python. The compiled patterns are unchanged.
     self.condensed_pat = re.compile(r"([+-]?\d+\.?\d*)([kmbtKMBT])")
     self.float_pat = re.compile(r"[+-]?\d+\.\d+")
     self.parenthese_pat = re.compile(r" *\(([^)]*)\)")
     self.date_line_pat = re.compile(r"\(as of (\d+.*\d+)\)")
     self.url_ticker_pat = re.compile(r".*/quote/(.*)\.(.*)/key-statistics")
     # Period keywords that may trail a statistic label (set literal instead
     # of the redundant set({...}) wrapper).
     self.keywords = {"mrq", "ttm", "yoy", "lfy", "fye"}
     self.today = datetime.today().date()
     # Fall back to Dec 31 of the current year when no fiscal year end is given.
     self.default_fye = datetime(self.today.year, 12, 31)
     self.db = Database()
Пример #7
0
class YahooScraper():
    """Scrapes key statistics for listings from Yahoo Finance and stores
    them in the key_statistics table."""

    def __init__(self):
        # Maps Yahoo Finance statistic labels to key_statistics column names.
        self.y_to_db_map = {
            'Forward P/E': 'forward_pe',
            'Return on Equity': 'ro_equity',
            'Current Ratio': 'current_ratio',
            'Total Debt': 'total_debt',
            'Forward Annual Dividend Rate': 'forward_annual_dividend_rate',
            'Last Split Date': 'last_split_date',
            'Market Cap (intraday)': 'market_cap',
            'EBITDA': 'ebitda',
            'Shares Short': 'shares_short',
            '50-Day Moving Average': 'fifty_day_moving_avg',
            '52 Week High': 'fifty_two_week_high',
            'Quarterly Earnings Growth': 'q_earnings_growth',
            'Forward Annual Dividend Yield': 'forward_annual_dividend_yield',
            'Beta': 'beta',
            'Payout Ratio': 'payout_ratio',
            'Avg Vol (3 month)': 'avg_vol_3_month',
            'Enterprise Value': 'enterprise_value',
            '5 Year Average Dividend Yield': 'five_year_avg_dividend_yield',
            'Enterprise Value/Revenue': 'enterprise_value_revenue',
            'Trailing P/E': 'trailing_pe',
            'Total Cash': 'total_cash',
            'Operating Cash Flow': 'operating_cash_flow',
            'Price/Book': 'price_book',
            'Fiscal Year Ends': 'fiscal_year_ends',
            'Total Debt/Equity': 'total_debt_equity',
            'Dividend Date': 'dividend_date',
            'Most Recent Quarter': 'most_recent_q',
            'Operating Margin': 'operating_margin',
            'Ex-Dividend Date': 'exdividend_date',
            '% Held by Institutions': 'perc_held_by_institutions',
            'Trailing Annual Dividend Yield': 'trailing_annual_dividend_yield',
            '200-Day Moving Average': 'two_hundred_day_moving_avg',
            '52 Week Low': 'fifty_two_week_low',
            'Avg Vol (10 day)': 'avg_vol_10_day',
            'Last Split Factor (new per old)': 'last_split_factor',
            '% Held by Insiders': 'perc_held_by_insiders',
            'Revenue Per Share': 'revenue_per_share',
            'Short Ratio': 'short_ratio',
            'Shares Short (prior month)': 'shares_short_prior_month',
            'Short % of Float': 'short_perc_float',
            'Profit Margin': 'profit_margin',
            'Return on Assets': 'ro_assets',
            'Price/Sales': 'price_sales',
            'Gross Profit': 'gross_profit',
            'Book Value Per Share': 'book_value_per_share',
            'Levered Free Cash Flow': 'levered_free_cash_flow',
            'Trailing Annual Dividend Rate': 'trailing_annual_dividend_rate',
            'Diluted EPS': 'diluted_eps',
            'PEG Ratio (5 yr expected)': 'peg_ratio_5yr',
            'Shares Outstanding': 'shares_outstanding',
            'Revenue': 'revenue',
            'Float': 'float',
            'Net Income Avi to Common': 'net_income_avi_common',
            'Enterprise Value/EBITDA': 'enterprise_value_ebitda',
            '52-Week Change': 'fifty_two_week_change',
            'Quarterly Revenue Growth': 'q_revenue_growth',
            'Total Cash Per Share': 'total_cash_ps'
        }
        # Suffix multipliers for condensed numbers such as "1.5M".
        self.convert_dict = {'K': 10**3, 'M': 10**6, 'B': 10**9, 'T': 10**12}
        # Bug fix: raw strings — the original non-raw patterns contain invalid
        # escape sequences ("\d", "\(", "\.") which raise DeprecationWarning /
        # SyntaxWarning on modern Python. The compiled patterns are unchanged.
        self.condensed_pat = re.compile(r"([+-]?\d+\.?\d*)([kmbtKMBT])")
        self.float_pat = re.compile(r"[+-]?\d+\.\d+")
        self.parenthese_pat = re.compile(r" *\(([^)]*)\)")
        self.date_line_pat = re.compile(r"\(as of (\d+.*\d+)\)")
        self.url_ticker_pat = re.compile(r".*/quote/(.*)\.(.*)/key-statistics")
        # Period keywords that may trail a statistic label.
        self.keywords = {"mrq", "ttm", "yoy", "lfy", "fye"}
        self.today = datetime.today().date()
        # Fall back to Dec 31 of the current year when no fiscal year end is given.
        self.default_fye = datetime(self.today.year, 12, 31)
        self.db = Database()

    def s2n(self, string):
        """Convert a condensed number string like "1.5M" to an int."""
        reg = self.condensed_pat.search(string)
        return int(
            float(reg.group(1)) * self.convert_dict[reg.group(2).upper()])

    def s2p(self, string):
        """Convert a percentage string like "12.5%" to a float."""
        return float(string.strip('%'))

    def s2r(self, string):
        """Convert a ratio string like "3:2" to a float."""
        split = string.split(':')
        return float(split[0]) / float(split[1])

    def parse_numeric(self, string):
        """Parse a Yahoo statistic value into a number or an ISO date string.

        Returns None when the value cannot be interpreted.
        """
        try:
            if '%' in string:
                return self.s2p(string)
            elif self.condensed_pat.match(string) is not None:
                return self.s2n(string)
            elif self.float_pat.match(string) is not None:
                return float(string)
            elif string.isdigit():
                return int(string)
            elif ':' in string:
                return self.s2r(string)
            else:
                # Last resort: try to interpret the value as a date.
                return dp.parse(string).date().isoformat()
        except ValueError:
            return None

    def cleanse_str(self, string):
        """Drop thousands separators and parenthesised qualifiers, then strip
        any surrounding colon."""
        return self.parenthese_pat.sub('', string.replace(',', '')).strip(':')

    def dic_parse(self, db, url, html):
        """Parse a rendered key-statistics page and insert its values
        into the key_statistics table."""
        def innerHtml(ele):
            return ele.decode_contents(formatter="html")

        soup = BeautifulSoup(html, "lxml")
        ticker = self.url_ticker_pat.search(url).group(1)
        exchange = "TSX"

        # Record whether Yahoo knows this ticker at all.
        on_yahoo = soup.find('div', attrs={'data-test': 'unknown-quote'
                                           }) is None
        db.update("listings", ["onyahoo"], [on_yahoo],
                  "exchange=%s AND ticker=%s", [exchange, ticker])

        if not on_yahoo:  # if quote not found, exit
            # Bug fix: logging takes a %-style format string, not
            # print-style positional arguments.
            LOGGER.error("Failed to find quote for %s, skipping", url)
            return

        div_test = soup.find('div', attrs={'data-test': 'qsp-statistics'})
        if div_test is None:
            LOGGER.error("Unknown error for %s, skipping", url)
            return

        db_dic = {}
        for table in div_test.find_all('table'):
            for row in table.find_all('tr'):
                td_list = row.find_all('td')
                title = innerHtml(td_list[0].find('span'))
                # Values are sometimes wrapped in a span, sometimes not.
                val = innerHtml(td_list[1]) if td_list[1].find(
                    'span') is None else innerHtml(td_list[1].find('span'))
                if title in self.y_to_db_map:
                    db_dic[self.y_to_db_map[title]] = self.parse_numeric(val)

        if db_dic:
            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange
            col_names, vals = list(db_dic.keys()), list(db_dic.values())
            where = db.create_conditional_string(col_names)
            if db.exists("key_statistics", where, vals):
                LOGGER.info("Skipping {} due to prior existence".format(url))
            else:
                col_names.append("update_date")
                vals.append(self.today)
                db.insert_into("key_statistics",
                               col_names,
                               vals,
                               multiple=False)
                LOGGER.info("Done parsing {}".format(url))
        else:
            LOGGER.info("Skipping {}".format(url))

    def fetch_all(self, exchange):
        """Scrape key statistics for every listing on *exchange* that is
        not already known to be missing from Yahoo."""
        listings = self.db.select(
            "ticker",
            "listings",
            where="exchange = %s AND (onyahoo=TRUE OR onyahoo IS NULL)",
            vals=[exchange],
            unroll=True)
        extension = '.TO'
        urls = [
            "https://ca.finance.yahoo.com/quote/{}{}/key-statistics".format(
                ticker, extension) for ticker in listings
        ]

        # Either hook element signals that the page finished rendering.
        xpath_hooks = [
            "//div[@data-test='qsp-statistics']",
            "//div[@data-test='unknown-quote']"
        ]
        jsps = JSPageScraper(self.dic_parse, xpath_hooks, "key_statistics")
        jsps.go(urls)

    def clean_exit(self):
        """Release the database connection."""
        self.db.destroy()
Пример #8
0
 def __init__(self):
     """Open a handle to the application database."""
     self.db = Database()
Пример #9
0
 def __init__(self, cache=True, url="http://www.tsx.com/resource/en/571"):
     """Record today's date, open the DB and resolve the listings cache path."""
     self.today = datetime.today().date()
     self.db = Database()
     self.url = url
     # An empty cache path disables on-disk caching of the listings sheet.
     if cache:
         self.cache_path = os.path.join(os.getcwd(), "cache", "TSX", "listings")
     else:
         self.cache_path = ""
Пример #10
0
class ListManager():
    """Maintains the listings, price_history and event_history tables for the
    TSX, using the exchange's published listings spreadsheet and Yahoo data."""

    def __init__(self, cache=True, url="http://www.tsx.com/resource/en/571"):
        # Today's date, used as the update/end timestamp throughout.
        self.today = datetime.today().date()
        self.db = Database()
        self.url = url
        # An empty cache path disables on-disk caching of the listings sheet.
        self.cache_path = os.path.join(os.getcwd(), "cache", "TSX",
                                       "listings") if cache else ""

    def write_cache(self, date, sheet):
        """Write the listings *sheet* to the cache directory as a JSON file
        named after *date*."""
        os.makedirs(self.cache_path, exist_ok=True)
        json_name = date.strftime('TSX-%Y-%m-%d.json')
        full_path = os.path.join(self.cache_path, json_name)
        sheet.to_json(full_path, orient="records")
        LOGGER.info("Wrote file to {}".format(full_path))

    def get_quotes(self):
        """
        This function gets the tickers and various other random information
        from the TSX website from a hardcoded file and inserts it into the database
        """
        # Date of the most recent listings import already in the DB.
        recent_date = self.db.select("MAX(updatedate)",
                                     "listings",
                                     fetch="one",
                                     unroll=True)

        # When the source is a URL, replace it with an open response object
        # that pandas can read from directly.
        if self.url.startswith("http"):
            req = create_url_request(self.url)
            self.url = urllib.request.urlopen(req)

        # The TSX sheet has junk rows before the real header row.
        sheet = pd.read_excel(self.url,
                              skiprows=5,
                              header=1,
                              keep_default_na=False)
        sheet.fillna('', inplace=True)
        sheet.rename(columns=self.cleanse_str, inplace=True)

        # The sheet's publication date is embedded in one of the column names.
        file_date = self.find_date_in_list(list(sheet.columns.values))

        if recent_date is None or (file_date > recent_date):
            xlsx_dict = sheet.to_dict(orient="records")
            recent_date = file_date
            if self.cache_path:
                self.write_cache(recent_date, sheet)
        else:
            LOGGER.info("Already up to date")
            return

        # Destination columns in the listings table, in insert order.
        row_names = [
            "updatedate",
            "ticker",
            "exchange",
            "name",
            "sector",
            "osshares",
            "dateoflisting",
            "listingtype",
            "volume",
            "value",
        ]

        all_excel_names = tuple(xlsx_dict[0].keys())
        # Prefixes of the spreadsheet columns we want, in row_names order
        # (minus the leading updatedate).
        base_wanted_excel_names = [
            "Root Ticker",
            "Exchange",
            "Name",
            "Sector",
            "O/S",
            "Date of TSX Listing",
            "Listing Type",
            "Volume YTD",
            "Value (C$)",
        ]
        # Per-column type tags, aligned by index with base_wanted_excel_names.
        # NOTE(review): if a base name fails to match any sheet column below,
        # wanted_excel_names becomes shorter and types[i] silently misaligns
        # with it — confirm the sheet always contains every base column.
        types = [
            "str",
            "str",
            "str",
            "str",
            "int",
            "date",
            "str",
            "int",
            "int",
        ]

        # Resolve each wanted prefix to the first matching sheet column.
        wanted_excel_names = []
        for bxn in base_wanted_excel_names:
            for xn in all_excel_names:
                if xn.startswith(bxn):
                    wanted_excel_names.append(xn)
                    break

        num_rows = len(wanted_excel_names)
        table_name = "listings"
        values = []
        for row in xlsx_dict:
            # Every row is stamped with the sheet's publication date.
            value_list = [recent_date]
            for i in range(num_rows):
                excel_name = wanted_excel_names[i]
                val = row[excel_name]
                if types[i] == "date":
                    val = datetime.strptime(str(val),
                                            "%Y%m%d")  # assume YYYYMMDD
                value_list.append(val)
            values.append(value_list)

        self.db.insert_into(table_name, row_names, values)

    def get_historic_events(self):
        """
        Gets all the historical events from yahoo, updating only the new entries
        based on the date of the last fetch.
        """
        exchange = "TSX"
        listings = self.db.select("ticker, dateoflisting",
                                  "listings",
                                  where="exchange = %s",
                                  vals=[exchange])
        # Keys to pull from each fetched event record, and the DB columns
        # the assembled rows map to.
        dict_fields = ["index", "action", "value"]
        fields = ["exchange", "ticker", "date", "action", "value"]

        total_listings = len(listings)
        for counter, (ticker, listdate) in enumerate(listings):
            # Resume from the day after the last stored event, or from the
            # listing date when the ticker has no events yet.
            lastdate = self.db.select("MAX(date)",
                                      "event_history",
                                      fetch="one",
                                      where="exchange = %s AND ticker = %s",
                                      vals=[exchange, ticker],
                                      unroll=True)
            startdate = listdate if lastdate is None else lastdate + timedelta(
                days=1)

            event_dict = []
            if startdate < self.today:
                yahoo_ticker = ticker + ".TO"
                event_dict = self.ticker_event_history(startdate, self.today,
                                                       yahoo_ticker)

            if event_dict:
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter, total_listings, ticker, startdate, self.today))
                rows = [[exchange, ticker] + [row[k] for k in dict_fields]
                        for row in event_dict]
                self.db.insert_into("event_history", fields, rows)
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter, total_listings, ticker))

    def get_historic_prices(self):
        """
        Gets all the historical prices from yahoo, updating only the new entries
        based on the date of the last fetch.
        """

        exchange = "TSX"
        listings = self.db.select("ticker, dateoflisting",
                                  "listings",
                                  where="exchange = %s",
                                  vals=[exchange])
        # Keys to pull from each fetched price record; DB columns are the
        # lower-cased versions prefixed with exchange/ticker.
        dict_fields = ["Adj Close", "High", "Close", "Open", "Low", "Date"]
        fields = ["exchange", "ticker"] + [x.lower() for x in dict_fields]

        total_listings = len(listings)
        for counter, (ticker, listdate) in enumerate(listings):
            # Resume from the day after the last stored price, or from the
            # listing date when the ticker has no prices yet.
            lastdate = self.db.select("MAX(date)",
                                      "price_history",
                                      fetch="one",
                                      where="exchange = %s AND ticker = %s",
                                      vals=[exchange, ticker],
                                      unroll=True)
            startdate = listdate if lastdate is None else lastdate + timedelta(
                days=1)

            his_dict = []
            if startdate < self.today:
                yahoo_ticker = ticker + ".TO"
                his_dict = self.ticker_price_history(startdate, self.today,
                                                     yahoo_ticker)

            if his_dict:
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter, total_listings, ticker, startdate, self.today))
                rows = [[exchange, ticker] + [row[k] for k in dict_fields]
                        for row in his_dict]
                self.db.insert_into("price_history", fields, rows)
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter, total_listings, ticker))

    def cleanse_str(self, raw_str):
        """Collapse newlines and doubled spaces in a spreadsheet column name."""
        return raw_str.replace('\n', ' ').replace("  ", ' ')

    def find_date_in_list(self, strings):
        """
        Returns the first date that occurs in a list of string
        or the current date if none are detected.
        """
        cur_date = self.today  # default = cur. date
        for s in strings:
            try:
                temp_date = dparser.parse(s, fuzzy=True).date()
            except ValueError:
                continue

            # Only accept a parsed date that differs from today's default.
            if cur_date != temp_date:
                cur_date = temp_date
                break
        return cur_date

    def ticker_price_history(self, start, end, ticker):
        """
        Gets and returns the historic prices for a given ticker for between
        the time period provided. Inclusive.
        """
        try:
            dr = DataReader(ticker, 'yahoo', start, end).reset_index()
        except OSError:
            return []  # if the page cannot be reached for some reason

        return dr.to_dict(orient="records")

    def ticker_event_history(self, start, end, ticker):
        """Return historic corporate actions for *ticker* between *start* and
        *end*, or an empty list when the source is unreachable."""
        try:
            dr = DataReader(ticker, 'yahoo-actions', start, end).reset_index()
        except OSError:
            return []  # if the page cannot be reached for some reason

        return dr.to_dict(orient="records")

    def clean_exit(self):
        """Release the database connection."""
        self.db.destroy()
Пример #11
0
 def __init__(self, normalize=True):
     """Cache the feature column names of ms_financials, excluding identifiers."""
     self.db = Database()
     self.ms_financials_cols = self.db.get_col_names("ms_financials")
     # Bookkeeping columns that must never be used as model features.
     self.not_feature_cols = {'ticker', 'exchange', 'update_date', 'fiscal_year', 'period'}
     self.ms_financials_cols = [
         col for col in self.ms_financials_cols
         if col not in self.not_feature_cols
     ]
     self.ms_financials_cols_str = ','.join(self.ms_financials_cols)
Пример #12
0
    def __init__(self):
        """Set up DB access, HTTP headers and the label -> column mappings
        used when parsing MorningStar pages."""
        self.db = Database()
        self.today = datetime.today().date()
        self.ttm_string = self.most_recent_quarter()
        self.headers = {
            'User-Agent':
            "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
        }
        # MorningStar exchange codes -> local exchange names.
        self.exchange_map = {
            "XTSE": "TSX",
        }

        # Columns whose values are year/month dates rather than numbers.
        self.year_month_cols = {"fiscal_year", "margin_date",
                                "profitability_date"}

        # (row-label prefix, key_statistics column) pairs; plain tuple
        # literals instead of the redundant tuple((...)) wrappers.
        self.column_key_map = (
            ("revenue", "revenue"),
            ("gross margin", "gross_margin"),
            ("operating income", "operating_income"),
            ("operating margin", "operating_margin"),
            ("net income", "net_income"),
            ("earnings per share", "eps"),
            ("dividends", "dividends"),
            ("payout ratio", "payout_ratio"),
            ("shares", "num_shares"),
            ("book value per", "book_value_ps"),
            ("operating cash flow", "operating_cash_flow"),
            ("cap spending", "cap_spending"),
            ("cf free cash flow growth", "free_cash_flow_growth_yoy"),
            ("cf free cash flow/sales", "free_cash_flow_sales"),
            ("cf free cash flow/net", "free_cash_flow_net_income"),
            ("free cash flow per share", "free_cash_flow_ps"),
            ("free cash flow", "free_cash_flow"),
            ("working capital", "working_captial"),
            ("pro margins %", "margin_date"),
            ("pro revenue", "revenue_per_sales"),
            ("pro cogs", "revenue_per_cogs"),
            ("pro gross margin", "sales_gross_margin"),
            ("pro sg&a", "margin_sga"),
            ("pro r&d", "margin_rd"),
            ("pro other", "margin_other"),
            ("pro operating margin", "margin_operating"),
            ("pro net int inc", "margin_net_income"),
            ("pro ebt margin", "margin_ebt"),
            ("pro profitability", "profitability_date"),
            ("pro tax rate", "tax_rate"),
            ("pro net margin", "net_margin_perc"),
            ("pro asset turnover", "asset_turnover"),
            ("pro return on assets", "ro_assets"),
            ("pro financial lever", "financial_leverage"),
            ("pro return on equity", "ro_equity"),
            ("pro return on invested capital", "ro_invested_captial"),
            ("pro interest coverage", "interest_coverage"),
            ("r% year over year", "revenue_perc_yoy"),
            ("r% 3-year", "revenue_perc_3y"),
            ("r% 5-year", "revenue_perc_5y"),
            ("r% 10-year", "revenue_perc_10y"),
            ("oi% year over year", "operating_income_yoy"),
            ("oi% 3-year", "operating_income_3y"),
            ("oi% 5-year", "operating_income_5y"),
            ("oi% 10-year", "operating_income_10y"),
            ("ni% year over year", "net_income_yoy"),
            ("ni% 3-year", "net_income_3y"),
            ("ni% 5-year", "net_income_5y"),
            ("ni% 10-year", "net_income_10y"),
            ("eps% year over year", "eps_yoy"),
            ("eps% 3-year", "eps_3y"),
            ("eps% 5-year", "eps_5y"),
            ("eps% 10-year", "eps_10y"),
            ("cf operating cash flow", "cash_flow_operating_growth_yoy"),
            ("cf cap ex", "cap_expense_perc_sales"),
            ("fh cash & short", "cash_short_term"),
            ("fh accounts receivable", "accounts_receivable"),
            ("fh inventory", "inventory"),
            ("fh other current assets", "other_cur_assets"),
            ("fh total current assets", "total_cur_assets"),
            ("fh net pp&e", "net_ppe"),
            ("fh intangibles", "intangibles"),
            ("fh other long-term assets", "other_long_term_assets"),
            ("fh accounts payable", "accounts_payable"),
            ("fh short-term debt", "short_term_debt"),
            ("fh taxes payable", "taxes_payable"),
            ("fh accured liabilities", "accured_liabilities"),
            ("fh other short-term liabilities", "short_term_liabilities"),
            ("fh long-term debt", "long_term_debt"),
            ("fh total liabilities & equity", "total_liabilities_equity"),
            ("fh total liabilities", "total_liabilities"),
            ("fh total stockholder", "total_stockholder"),
            ("fh current ratio", "current_ratio"),
            ("fh quick ratio", "quick_ratio"),
            ("fh debt/equity", "debt_equity"),
            ("er receivables turnover", "receivables_turnover"),
            ("er inventory turnover", "inventory_turnover"),
            ("er fixed assets turnover", "fixed_assets_turnover"),
        )
        # (row-label prefix, ms_financials column) pairs.
        self.column_financials_map = (
            ("fiscal year", "fiscal_year"),
            ("revenue", "revenue"),
            ("cost of revenue", "revenue_cost"),
            ("gross profit", "gross_profit"),
            ("sales, general and administrative", "sales_expense"),
            ("other operating", "operating_expense"),
            ("other assets", "other_assets"),
            ("operating income", "operating_income"),
            ("interest expense", "intrest_expense"),
            ("total operating expense", "total_costs"),
            ("total costs and expenses", "total_costs"),
            ("preferred dividend", "preferred_dividend"),
            ("income before", "income_before_taxes"),
            ("provision for", "provision_taxes"),
            ("net income from continuing op", "net_income_continuing_ops"),
            ("net income from discontinuing ops",
             "net_income_discontinuing_ops"),
            ("net income available to common shareholders",
             "net_income_common"),
            ("net income", "net_income"),
            ("eps basic", "eps_basic"),
            ("eps diluted", "eps_diluted"),
            ("waso basic", "waso_basic"),
            ("waso diluted", "waso_diluted"),
            ("ebitda", "ebitda"),
        )

        # Section titles whose rows get the given prefix before lookup.
        self.special_key_titles = (
            ("key ratios -> profitability", "pro "),
            ("key ratios -> growth", "gro "),
            ("key ratios -> cash flow", "cf "),
            ("key ratios -> financial health", "fh "),
            ("key ratios -> efficiency ratios", "er "),
            ("revenue %", "r% "),
            ("operating income %", "oi% "),
            ("net income %", "ni% "),
            ("eps %", "eps% "),
        )
        self.special_financials_titles = (
            ("earnings per share", "eps "),
            ("weighted average shares outstanding", "waso "),
        )

        # Deletes '"' and ',' characters via str.translate.
        self.translation_table = dict.fromkeys(map(ord, '",'), None)
Пример #13
0
class MorningStarScaper():
    """Scrape key statistics and financial statements from Morning Star's
    CSV export endpoints and store them in the local database.

    NOTE(review): the class name looks like a typo for "MorningStarScraper";
    kept as-is so external callers keep working.
    """

    def __init__(self):
        self.db = Database()
        self.today = datetime.today().date()
        # ISO date string for the first day of the current quarter; used in
        # parse_csv() to replace the "TTM" column label with a concrete date.
        self.ttm_string = self.most_recent_quarter()
        self.headers = {
            'User-Agent':
            "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
        }
        # Morning Star exchange code -> local database exchange name.
        self.exchange_map = {
            "XTSE": "TSX",
        }

        # Database columns whose CSV cells are "YYYY-MM" dates (or "TTM"),
        # not plain numbers.
        self.year_month_cols = set(
            {"fiscal_year", "margin_date", "profitability_date"})

        # (CSV row-title prefix, database column) pairs for key statistics.
        # Matching is prefix-based and order-sensitive: more specific titles
        # must precede their own prefixes (e.g. "free cash flow per share"
        # before "free cash flow").
        self.column_key_map = tuple((
            ("revenue", "revenue"),
            ("gross margin", "gross_margin"),
            ("operating income", "operating_income"),
            ("operating margin", "operating_margin"),
            ("net income", "net_income"),
            ("earnings per share", "eps"),
            ("dividends", "dividends"),
            ("payout ratio", "payout_ratio"),
            ("shares", "num_shares"),
            ("book value per", "book_value_ps"),
            ("operating cash flow", "operating_cash_flow"),
            ("cap spending", "cap_spending"),
            ("cf free cash flow growth", "free_cash_flow_growth_yoy"),
            ("cf free cash flow/sales", "free_cash_flow_sales"),
            ("cf free cash flow/net", "free_cash_flow_net_income"),
            ("free cash flow per share", "free_cash_flow_ps"),
            ("free cash flow", "free_cash_flow"),
            ("working capital", "working_captial"),
            ("pro margins %", "margin_date"),
            ("pro revenue", "revenue_per_sales"),
            ("pro cogs", "revenue_per_cogs"),
            ("pro gross margin", "sales_gross_margin"),
            ("pro sg&a", "margin_sga"),
            ("pro r&d", "margin_rd"),
            ("pro other", "margin_other"),
            ("pro operating margin", "margin_operating"),
            ("pro net int inc", "margin_net_income"),
            ("pro ebt margin", "margin_ebt"),
            ("pro profitability", "profitability_date"),
            ("pro tax rate", "tax_rate"),
            ("pro net margin", "net_margin_perc"),
            ("pro asset turnover", "asset_turnover"),
            ("pro return on assets", "ro_assets"),
            ("pro financial lever", "financial_leverage"),
            ("pro return on equity", "ro_equity"),
            ("pro return on invested capital", "ro_invested_captial"),
            ("pro interest coverage", "interest_coverage"),
            ("r% year over year", "revenue_perc_yoy"),
            ("r% 3-year", "revenue_perc_3y"),
            ("r% 5-year", "revenue_perc_5y"),
            ("r% 10-year", "revenue_perc_10y"),
            ("oi% year over year", "operating_income_yoy"),
            ("oi% 3-year", "operating_income_3y"),
            ("oi% 5-year", "operating_income_5y"),
            ("oi% 10-year", "operating_income_10y"),
            ("ni% year over year", "net_income_yoy"),
            ("ni% 3-year", "net_income_3y"),
            ("ni% 5-year", "net_income_5y"),
            ("ni% 10-year", "net_income_10y"),
            ("eps% year over year", "eps_yoy"),
            ("eps% 3-year", "eps_3y"),
            ("eps% 5-year", "eps_5y"),
            ("eps% 10-year", "eps_10y"),
            ("cf operating cash flow", "cash_flow_operating_growth_yoy"),
            ("cf cap ex", "cap_expense_perc_sales"),
            ("fh cash & short", "cash_short_term"),
            ("fh accounts receivable", "accounts_receivable"),
            ("fh inventory", "inventory"),
            ("fh other current assets", "other_cur_assets"),
            ("fh total current assets", "total_cur_assets"),
            ("fh net pp&e", "net_ppe"),
            ("fh intangibles", "intangibles"),
            ("fh other long-term assets", "other_long_term_assets"),
            ("fh accounts payable", "accounts_payable"),
            ("fh short-term debt", "short_term_debt"),
            ("fh taxes payable", "taxes_payable"),
            ("fh accured liabilities", "accured_liabilities"),
            ("fh other short-term liabilities", "short_term_liabilities"),
            ("fh long-term debt", "long_term_debt"),
            ("fh total liabilities & equity", "total_liabilities_equity"),
            ("fh total liabilities", "total_liabilities"),
            ("fh total stockholder", "total_stockholder"),
            ("fh current ratio", "current_ratio"),
            ("fh quick ratio", "quick_ratio"),
            ("fh debt/equity", "debt_equity"),
            ("er receivables turnover", "receivables_turnover"),
            ("er inventory turnover", "inventory_turnover"),
            ("er fixed assets turnover", "fixed_assets_turnover"),
        ))
        # (CSV row-title prefix, database column) pairs for the income
        # statement export; same prefix-matching rules as column_key_map.
        self.column_financials_map = tuple((
            ("fiscal year", "fiscal_year"),
            ("revenue", "revenue"),
            ("cost of revenue", "revenue_cost"),
            ("gross profit", "gross_profit"),
            ("sales, general and administrative", "sales_expense"),
            ("other operating", "operating_expense"),
            ("other assets", "other_assets"),
            ("operating income", "operating_income"),
            ("interest expense", "intrest_expense"),
            ("total operating expense", "total_costs"),
            ("total costs and expenses", "total_costs"),
            ("preferred dividend", "preferred_dividend"),
            ("income before", "income_before_taxes"),
            ("provision for", "provision_taxes"),
            ("net income from continuing op", "net_income_continuing_ops"),
            ("net income from discontinuing ops",
             "net_income_discontinuing_ops"),
            ("net income available to common shareholders",
             "net_income_common"),
            ("net income", "net_income"),
            ("eps basic", "eps_basic"),
            ("eps diluted", "eps_diluted"),
            ("waso basic", "waso_basic"),
            ("waso diluted", "waso_diluted"),
            ("ebitda", "ebitda"),
        ))

        # (section title, prefix) pairs: a one-cell CSV row that matches a
        # section title sets the prefix that is prepended to the titles of
        # the rows that follow, disambiguating repeated row names.
        self.special_key_titles = tuple((
            ("key ratios -> profitability", "pro "),
            ("key ratios -> growth", "gro "),
            ("key ratios -> cash flow", "cf "),
            ("key ratios -> financial health", "fh "),
            ("key ratios -> efficiency ratios", "er "),
            ("revenue %", "r% "),
            ("operating income %", "oi% "),
            ("net income %", "ni% "),
            ("eps %", "eps% "),
        ))
        self.special_financials_titles = tuple((
            ("earnings per share", "eps "),
            ("weighted average shares outstanding", "waso "),
        ))

        # str.translate table that strips '"' and ',' from numeric cells.
        self.translation_table = dict.fromkeys(map(ord, '",'), None)

    def most_recent_quarter(self):
        """Return the ISO date string of the first day of today's quarter."""
        quarter = (self.today.month - 1) // 3
        month = quarter * 3 + 1
        return datetime(year=self.today.year, month=month, day=1).date().isoformat()

    def find_column(self, col, mapper, subtitle=''):
        """Map a CSV row title to a database column name.

        A match on the section-qualified title (``subtitle + col``) wins
        immediately; otherwise the LAST bare-title prefix match is returned,
        or '' when nothing matches.
        """
        col = col.lower().replace('"', '')
        qualified = subtitle + col
        fallback = ''

        for prefix, db_col in mapper:
            if qualified.startswith(prefix):
                return db_col
            elif col.startswith(prefix):
                fallback = db_col

        return fallback

    def convert_numerical(self, n):
        """Return ``n`` as int, else float, else the original string."""
        try:
            return int(n)
        except ValueError:
            try:
                return float(n)
            except ValueError:
                return n

    def get_title_multiplier(self, title):
        """Return the scale factor implied by a title suffix (Ths/Mil/Bil).

        Defaults to 1 when the title carries no recognized suffix.
        """
        for suffix, factor in (("Ths", 10**3), ("Mil", 10**6), ("Bil", 10**9)):
            if title.endswith(suffix):
                return factor

        return 1

    def parse_csv(self,
                  csv_r,
                  num_cols,
                  special_titles,
                  column_map,
                  extra_cols=None,
                  extra_vals=None):
        """Parse a Morning Star CSV export into (column names, value rows).

        csv_r: a csv.reader over the export; the first row is discarded.
        num_cols: number of data columns (periods) per row.
        special_titles / column_map: see find_column().
        extra_cols / extra_vals: constant columns (e.g. ticker, exchange)
            prepended to every output row.

        Returns (col_names, vals) where vals has one list per data column.
        """
        # Fixed mutable-default-argument bug: the original used ``=[]`` and
        # then appended into the caller's/default list via aliasing.
        extra_cols = [] if extra_cols is None else extra_cols
        extra_vals = [] if extra_vals is None else extra_vals

        subhead = ''
        next(csv_r)  # skip header

        col_names = list(extra_cols)
        # One accumulator per data column; don't use [[]] * num_cols, which
        # would alias a single list.
        vals = [list(extra_vals) for _ in range(num_cols)]
        for cols in csv_r:
            if len(cols) == 0:
                continue
            if len(cols) == 1:
                # A one-cell row starts a new report section.
                subhead = self.find_column(cols[0], special_titles)
                continue

            db_col = self.find_column(cols[0], column_map, subtitle=subhead)
            if not db_col:
                continue  # unmapped row title — skip it

            multi = self.get_title_multiplier(cols[0])
            col_names.append(db_col)
            for i in range(num_cols):
                cell = cols[i + 1].translate(self.translation_table)
                val = self.convert_numerical(cell) * multi if cell else None
                if db_col in self.year_month_cols:
                    # NOTE(review): an empty date cell leaves val as None,
                    # which strptime would reject — assumed not to occur in
                    # Morning Star exports.
                    val = self.ttm_string if val == 'TTM' else datetime.strptime(
                        val, '%Y-%m')

                vals[i].append(val)

        return col_names, vals

    def get_key_stats(self, ticker, db_exchange="TSX"):
        """Fetch key statistics for ``ticker`` from Morning Star and insert
        them into ms_key_statistics.

        Returns 0 on success, 1 when the ticker is not on Morning Star.
        """
        url = ("http://financials.morningstar.com/ajax/exportKR2CSV.html?t={}&"
               "culture=en-CA&region=CAN&order=asc&r={}").format(
                   ticker, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)
        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))

        # An empty body means Morning Star has no data for this ticker.
        on_morningstar = csv_r and resp.headers['content-length'] != '0'
        if on_morningstar:
            # Lazy %-style args are the logging convention (and the original
            # str.format call is equivalent here).
            LOGGER.info("Getting key statistics for %s... ", ticker)
        else:
            # Fixed: the original passed ticker as a stray positional arg
            # with no %s placeholder in the format string.
            LOGGER.info("Skipping %s", ticker)
            return 1

        col_names, vals = self.parse_csv(
            csv_r,
            10,  # key-statistics export always has 10 period columns
            self.special_key_titles,
            self.column_key_map,
            extra_cols=["ticker", "exchange", "update_date"],
            extra_vals=[ticker, db_exchange, self.today])

        self.db.insert_into("ms_key_statistics",
                            col_names,
                            vals,
                            unique_conflict=True)
        LOGGER.info("Done")
        return 0

    def get_financial(self, ticker, period_name, exchange="XTSE"):
        """Fetch yearly or quarterly income-statement data from Morning Star
        and insert it into ms_financials.

        period_name: "quarter" or "annual"
        exchange: Morning Star exchange code; currently only XTSE (TSX).
        Returns 0 on success, 1 when the ticker is not on Morning Star.
        Raises ValueError for an unsupported exchange.
        """
        # Convert the Morning Star exchange name to our database name.
        if exchange in self.exchange_map:
            db_exchange = self.exchange_map[exchange]
        else:
            raise ValueError("Exchange unsupported {}".format(exchange))

        # Morning Star encodes the period as months: 3 = quarterly, 12 = annual.
        period = 3 if period_name == "quarter" else 12

        url = (
            "http://financials.morningstar.com/ajax/ReportProcess4CSV.html?&t="
            "{}:{}&region=can&culture=en-US&cur=&reportType=is&period={}&"
            "dataType=A&order=desc&columnYear=5&curYearPart=1st5year&"
            "rounding=1&view=raw&r={}&denominatorView=raw&number=1").format(
                exchange, ticker, period, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)

        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))

        on_morningstar = csv_r and resp.headers['content-length'] != '0'

        if on_morningstar:
            # Fixed: the original passed end='' (a print() kwarg) to
            # LOGGER.info, which raises TypeError on the logging API.
            LOGGER.info("Getting %s financial data for %s... ", period_name,
                        ticker)
        else:
            # Fixed: stray positional arg with no %s placeholder.
            LOGGER.info("Skipping %s", ticker)
            return 1

        # Quarter view has 6 columns (incl. TTM); annual view drops TTM.
        num_cols = 6 if period_name == "quarter" else 5
        col_names, vals = self.parse_csv(
            csv_r,
            num_cols,
            self.special_financials_titles,
            self.column_financials_map,
            extra_cols=["ticker", "exchange", "period", "update_date"],
            extra_vals=[ticker, db_exchange, period, self.today])

        self.db.insert_into("ms_financials",
                            col_names,
                            vals,
                            unique_conflict=True)
        LOGGER.info("Done")
        return 0

    def fetch_all(self, db_exchange):
        """Scrape stats and financials for every listing on ``db_exchange``
        and record whether each ticker exists on Morning Star (onms flag).
        """
        listings = self.db.select(
            "ticker",
            "listings",
            where="exchange = %s AND (onms=TRUE OR onms IS NULL)",
            vals=[db_exchange],
            unroll=True)
        for ticker in listings:
            # Fixed: the original called methods on an undefined global
            # ``mss`` instead of ``self``, and had a get_financia[l] typo.
            found1 = self.get_key_stats(ticker)
            found2 = self.get_financial(ticker, "quarter")
            found3 = self.get_financial(ticker, "annual")
            # The getters return 1 on "not found", so the ticker is on
            # Morning Star if any of the three calls succeeded (returned 0).
            on_morningstar = not (found1 and found2 and found3)
            self.db.update("listings", ["onms"], [on_morningstar],
                           "exchange=%s AND ticker=%s", [db_exchange, ticker])

    def clean_exit(self):
        """Release the database connection."""
        self.db.destroy()
Пример #14
0
 def __init__(self, normalize=True):
     self.db = Database()
     self.ms_financials_cols = self.db.get_col_names("ms_financials")