Example No. 1
def get_html(urlQ, callback, xpath_hooks):
    """
    This function takes a URL from the URL queue (urlQ) and
    calls a callback that will handle the page source.

    xpath_hooks is a list of XPath expressions used to decide when the page
    has finished loading, e.g. ["//div[@data-test='whatever']"] (see the
    docs for more details).
    """
    svr = webkit_server.Server()
    svrconn = webkit_server.ServerConnection(server=svr)
    driver = dryscrape.driver.webkit.Driver(connection=svrconn)

    sess = dryscrape.Session(driver=driver)
    sess.set_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
    )
    sess.set_attribute("auto_load_images", False)

    valid_page_func = lambda: any(
        sess.at_xpath(xpath) for xpath in xpath_hooks)
    session = Session()  # database session, separate from the dryscrape session

    while not urlQ.empty():
        url = urlQ.get()

        try:
            sess.visit(url)
        except webkit_server.InvalidResponseError:
            LOGGER.error(
                "Got invalid response from something? Skipping {}".format(url))
            continue

        try:
            sess.wait_for(valid_page_func, interval=1, timeout=15)
        except dryscrape.mixins.WaitTimeoutError:
            LOGGER.error("Timeout so skipping {}".format(url))
            continue

        response = sess.body()
        callback(session, url, response)
        sess.reset()

    svr.kill()
    session.close()
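
A minimal usage sketch for get_html (hypothetical URL, XPath hook, and callback; assumes the same imports as the snippet above):

from queue import Queue

def print_length(db_session, url, html):
    # Hypothetical callback: receives the DB session, the fetched URL,
    # and the raw page source.
    print(url, len(html))

urlQ = Queue()
urlQ.put("https://example.com/some-page")
get_html(urlQ, print_length, ["//div[@id='content']"])
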
Example No. 2
    def __init__(self):
        # This is a reverse engineering of the Yahoo Finance REST API
        # Information off: http://www.jarloo.com/yahoo_finance/
        self.y_to_db_map = {'n': 'name', 'y': 'dividend_yield', 'd': 'dividend_ps',
                            'r': 'pe', 'r1': 'dividend_pay_date', 'q': 'ex_dividend_date',
                            'o': 'open', 'c1': 'change', 'p2': 'perc_change', 'd1': 'last_trade_date',
                            'd2': 'trade_date', 'c3': 'commission', 'g': 'day_low', 'h': 'day_high',
                            'p': 'previous_close', 't8': 'year_target', 'm5': 'change_mv_avg_200',
                            'm6': 'perc_change_mv_avg_200', 'm7': 'change_mv_avg_50', 'm8': 'perc_change_mv_avg_50',
                            'm3': 'mv_avg_50', 'm4': 'mv_avg_200', 'w1': 'day_value_change',
                            'g1': 'holding_gain_perc', 'g3': 'annualized_gain', 'g4': 'holdings_gain',
                            'k': 'high_52_week', 'j': 'low_52_week', 'j5': 'change_52_week_low',
                            'k4': 'change_52_week_high', 'j6': 'perc_change_52_week_low',
                            'k5': 'perc_change_52_week_high', 'j1': 'market_cap',
                            'f6': 'float_shares', 'x': 'stock_exchange', 's1': 'shares_owned',
                            'j2': 'shares_outstanding', 'n4': 'notes', 'i': 'more_info',
                            'v': 'volume', 'a2': 'avg_daily_volume', 'e': 'eps', 'e7': 'eps_year_estimate',
                            'e8': 'eps_next_year_estimate', 'e9': 'eps_next_q_estimate', 'b4': 'book',
                            'j4': 'ebitda', 'p5': 'price_sale', 'p6': 'price_book', 'r5': 'peg',
                            'r6': 'price_eps_estimate_year', 'r7': 'price_eps_estimate_next_year', 's7': 'short_ratio',
                            's6': 'revenue', 'v1': 'holdings_val', 'l2': 'high_limit', 'l3': 'low_limit',
                            'a': 'ask', 'b': 'bid'}
        self.convert_dict = {'K': 10 ** 3, 'M': 10 ** 6, 'B': 10 ** 9, 'T': 10 ** 12}
        self.condensed_pat = re.compile(r"([+-]?\d*\.?\d+)([kmbtKMBT])")
        self.url_flags = tuple(self.y_to_db_map.keys())
        self.url_str_flags = "".join(self.url_flags)
        self.db_entries = tuple(self.y_to_db_map.values())
        self.float_pat = re.compile(r"[+-]?(\d*[\.])?\d+$")
        self.today = datetime.today().date()
        self.base_url = "http://finance.yahoo.com/d/quotes.csv"

        # Sometimes websites are friendlier to iOS devices :)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
        }
        self.session = Session()
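
To illustrate how the flag map above drives a request: the chosen keys are concatenated into the f query parameter, and each CSV response row carries one column per flag, in order (the quotes.csv endpoint has since been retired, so this is illustrative only):

flags = "nop"  # 'n' = name, 'o' = open, 'p' = previous_close
url = "http://finance.yahoo.com/d/quotes.csv?s=AAPL&f=" + flags
# each response row: <name>,<open>,<previous close>
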
Example No. 3
    def __init__(self):
        self.session = Session()
        self.today = datetime.today().date()
        self.ttm_string = self.most_recent_quarter()
        self.headers = {
            'User-Agent':
            "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
        }
        self.exchange_map = {
            "XTSE": "TSX",
        }

        self.year_month_cols = {
            "fiscal_year", "margin_date", "profitability_date"}

        self.column_key_map = tuple((
            ("revenue", "revenue"),
            ("gross margin", "gross_margin"),
            ("operating income", "operating_income"),
            ("operating margin", "operating_margin"),
            ("net income", "net_income"),
            ("earnings per share", "eps"),
            ("dividends", "dividends"),
            ("payout ratio", "payout_ratio"),
            ("shares", "num_shares"),
            ("book value per", "book_value_ps"),
            ("operating cash flow", "operating_cash_flow"),
            ("cap spending", "cap_spending"),
            ("cf free cash flow growth", "free_cash_flow_growth_yoy"),
            ("cf free cash flow/sales", "free_cash_flow_sales"),
            ("cf free cash flow/net", "free_cash_flow_net_income"),
            ("free cash flow per share", "free_cash_flow_ps"),
            ("free cash flow", "free_cash_flow"),
            ("working capital", "working_captial"),
            ("pro margins %", "margin_date"),
            ("pro revenue", "revenue_per_sales"),
            ("pro cogs", "revenue_per_cogs"),
            ("pro gross margin", "sales_gross_margin"),
            ("pro sg&a", "margin_sga"),
            ("pro r&d", "margin_rd"),
            ("pro other", "margin_other"),
            ("pro operating margin", "margin_operating"),
            ("pro net int inc", "margin_net_income"),
            ("pro ebt margin", "margin_ebt"),
            ("pro profitability", "profitability_date"),
            ("pro tax rate", "tax_rate"),
            ("pro net margin", "net_margin_perc"),
            ("pro asset turnover", "asset_turnover"),
            ("pro return on assets", "ro_assets"),
            ("pro financial lever", "financial_leverage"),
            ("pro return on equity", "ro_equity"),
            ("pro return on invested capital", "ro_invested_captial"),
            ("pro interest coverage", "interest_coverage"),
            ("r% year over year", "revenue_perc_yoy"),
            ("r% 3-year", "revenue_perc_3y"),
            ("r% 5-year", "revenue_perc_5y"),
            ("r% 10-year", "revenue_perc_10y"),
            ("oi% year over year", "operating_income_yoy"),
            ("oi% 3-year", "operating_income_3y"),
            ("oi% 5-year", "operating_income_5y"),
            ("oi% 10-year", "operating_income_10y"),
            ("ni% year over year", "net_income_yoy"),
            ("ni% 3-year", "net_income_3y"),
            ("ni% 5-year", "net_income_5y"),
            ("ni% 10-year", "net_income_10y"),
            ("eps% year over year", "eps_yoy"),
            ("eps% 3-year", "eps_3y"),
            ("eps% 5-year", "eps_5y"),
            ("eps% 10-year", "eps_10y"),
            ("cf operating cash flow", "cash_flow_operating_growth_yoy"),
            ("cf cap ex", "cap_expense_perc_sales"),
            ("fh cash & short", "cash_short_term"),
            ("fh accounts receivable", "accounts_receivable"),
            ("fh inventory", "inventory"),
            ("fh other current assets", "other_cur_assets"),
            ("fh total current assets", "total_cur_assets"),
            ("fh net pp&e", "net_ppe"),
            ("fh intangibles", "intangibles"),
            ("fh other long-term assets", "other_long_term_assets"),
            ("fh accounts payable", "accounts_payable"),
            ("fh short-term debt", "short_term_debt"),
            ("fh taxes payable", "taxes_payable"),
            ("fh accured liabilities", "accured_liabilities"),
            ("fh other short-term liabilities", "short_term_liabilities"),
            ("fh long-term debt", "long_term_debt"),
            ("fh total liabilities & equity", "total_liabilities_equity"),
            ("fh total liabilities", "total_liabilities"),
            ("fh total stockholder", "total_stockholder"),
            ("fh current ratio", "current_ratio"),
            ("fh quick ratio", "quick_ratio"),
            ("fh debt/equity", "debt_equity"),
            ("er receivables turnover", "receivables_turnover"),
            ("er inventory turnover", "inventory_turnover"),
            ("er fixed assets turnover", "fixed_assets_turnover"),
        ))
        self.column_financials_map = tuple((
            ("fiscal year", "fiscal_year"),
            ("revenue", "revenue"),
            ("cost of revenue", "revenue_cost"),
            ("gross profit", "gross_profit"),
            ("sales, general and administrative", "sales_expense"),
            ("other operating", "operating_expense"),
            ("other assets", "other_assets"),
            ("operating income", "operating_income"),
            ("interest expense", "intrest_expense"),
            ("total operating expense", "total_costs"),
            ("total costs and expenses", "total_costs"),
            ("preferred dividend", "preferred_dividend"),
            ("income before", "income_before_taxes"),
            ("provision for", "provision_taxes"),
            ("net income from continuing op", "net_income_continuing_ops"),
            ("net income from discontinuing ops",
             "net_income_discontinuing_ops"),
            ("net income available to common shareholders",
             "net_income_common"),
            ("net income", "net_income"),
            ("eps basic", "eps_basic"),
            ("eps diluted", "eps_diluted"),
            ("waso basic", "waso_basic"),
            ("waso diluted", "waso_diluted"),
            ("ebitda", "ebitda"),
        ))

        self.special_key_titles = tuple((
            ("key ratios -> profitability", "pro "),
            ("key ratios -> growth", "gro "),
            ("key ratios -> cash flow", "cf "),
            ("key ratios -> financial health", "fh "),
            ("key ratios -> efficiency ratios", "er "),
            ("revenue %", "r% "),
            ("operating income %", "oi% "),
            ("net income %", "ni% "),
            ("eps %", "eps% "),
        ))
        self.special_financials_titles = tuple((
            ("earnings per share", "eps "),
            ("weighted average shares outstanding", "waso "),
        ))

        self.translation_table = dict.fromkeys(map(ord, '",'), None)
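
To illustrate the prefix scheme above (a sketch; scraper stands for a hypothetical instance of this class): a one-column section row such as "Key Ratios -> Cash Flow" sets the subtitle prefix "cf ", so a later row titled "Free Cash Flow/Sales %" is matched against column_key_map with that prefix attached:

subtitle = "cf "  # set when the section row "Key Ratios -> Cash Flow" appears
row_title = "free cash flow/sales %"
matches = [db_col for key, db_col in scraper.column_key_map
           if (subtitle + row_title).startswith(key)]
# matches == ["free_cash_flow_sales"]
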
Example No. 4
class MorningStarScaper():
    def __init__(self):
        self.session = Session()
        self.today = datetime.today().date()
        self.ttm_string = self.most_recent_quarter()
        self.headers = {
            'User-Agent':
            "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
        }
        self.exchange_map = {
            "XTSE": "TSX",
        }

        self.year_month_cols = {
            "fiscal_year", "margin_date", "profitability_date"}

        self.column_key_map = tuple((
            ("revenue", "revenue"),
            ("gross margin", "gross_margin"),
            ("operating income", "operating_income"),
            ("operating margin", "operating_margin"),
            ("net income", "net_income"),
            ("earnings per share", "eps"),
            ("dividends", "dividends"),
            ("payout ratio", "payout_ratio"),
            ("shares", "num_shares"),
            ("book value per", "book_value_ps"),
            ("operating cash flow", "operating_cash_flow"),
            ("cap spending", "cap_spending"),
            ("cf free cash flow growth", "free_cash_flow_growth_yoy"),
            ("cf free cash flow/sales", "free_cash_flow_sales"),
            ("cf free cash flow/net", "free_cash_flow_net_income"),
            ("free cash flow per share", "free_cash_flow_ps"),
            ("free cash flow", "free_cash_flow"),
            ("working capital", "working_captial"),
            ("pro margins %", "margin_date"),
            ("pro revenue", "revenue_per_sales"),
            ("pro cogs", "revenue_per_cogs"),
            ("pro gross margin", "sales_gross_margin"),
            ("pro sg&a", "margin_sga"),
            ("pro r&d", "margin_rd"),
            ("pro other", "margin_other"),
            ("pro operating margin", "margin_operating"),
            ("pro net int inc", "margin_net_income"),
            ("pro ebt margin", "margin_ebt"),
            ("pro profitability", "profitability_date"),
            ("pro tax rate", "tax_rate"),
            ("pro net margin", "net_margin_perc"),
            ("pro asset turnover", "asset_turnover"),
            ("pro return on assets", "ro_assets"),
            ("pro financial lever", "financial_leverage"),
            ("pro return on equity", "ro_equity"),
            ("pro return on invested capital", "ro_invested_captial"),
            ("pro interest coverage", "interest_coverage"),
            ("r% year over year", "revenue_perc_yoy"),
            ("r% 3-year", "revenue_perc_3y"),
            ("r% 5-year", "revenue_perc_5y"),
            ("r% 10-year", "revenue_perc_10y"),
            ("oi% year over year", "operating_income_yoy"),
            ("oi% 3-year", "operating_income_3y"),
            ("oi% 5-year", "operating_income_5y"),
            ("oi% 10-year", "operating_income_10y"),
            ("ni% year over year", "net_income_yoy"),
            ("ni% 3-year", "net_income_3y"),
            ("ni% 5-year", "net_income_5y"),
            ("ni% 10-year", "net_income_10y"),
            ("eps% year over year", "eps_yoy"),
            ("eps% 3-year", "eps_3y"),
            ("eps% 5-year", "eps_5y"),
            ("eps% 10-year", "eps_10y"),
            ("cf operating cash flow", "cash_flow_operating_growth_yoy"),
            ("cf cap ex", "cap_expense_perc_sales"),
            ("fh cash & short", "cash_short_term"),
            ("fh accounts receivable", "accounts_receivable"),
            ("fh inventory", "inventory"),
            ("fh other current assets", "other_cur_assets"),
            ("fh total current assets", "total_cur_assets"),
            ("fh net pp&e", "net_ppe"),
            ("fh intangibles", "intangibles"),
            ("fh other long-term assets", "other_long_term_assets"),
            ("fh accounts payable", "accounts_payable"),
            ("fh short-term debt", "short_term_debt"),
            ("fh taxes payable", "taxes_payable"),
            ("fh accured liabilities", "accured_liabilities"),
            ("fh other short-term liabilities", "short_term_liabilities"),
            ("fh long-term debt", "long_term_debt"),
            ("fh total liabilities & equity", "total_liabilities_equity"),
            ("fh total liabilities", "total_liabilities"),
            ("fh total stockholder", "total_stockholder"),
            ("fh current ratio", "current_ratio"),
            ("fh quick ratio", "quick_ratio"),
            ("fh debt/equity", "debt_equity"),
            ("er receivables turnover", "receivables_turnover"),
            ("er inventory turnover", "inventory_turnover"),
            ("er fixed assets turnover", "fixed_assets_turnover"),
        ))
        self.column_financials_map = tuple((
            ("fiscal year", "fiscal_year"),
            ("revenue", "revenue"),
            ("cost of revenue", "revenue_cost"),
            ("gross profit", "gross_profit"),
            ("sales, general and administrative", "sales_expense"),
            ("other operating", "operating_expense"),
            ("other assets", "other_assets"),
            ("operating income", "operating_income"),
            ("interest expense", "intrest_expense"),
            ("total operating expense", "total_costs"),
            ("total costs and expenses", "total_costs"),
            ("preferred dividend", "preferred_dividend"),
            ("income before", "income_before_taxes"),
            ("provision for", "provision_taxes"),
            ("net income from continuing op", "net_income_continuing_ops"),
            ("net income from discontinuing ops",
             "net_income_discontinuing_ops"),
            ("net income available to common shareholders",
             "net_income_common"),
            ("net income", "net_income"),
            ("eps basic", "eps_basic"),
            ("eps diluted", "eps_diluted"),
            ("waso basic", "waso_basic"),
            ("waso diluted", "waso_diluted"),
            ("ebitda", "ebitda"),
        ))

        self.special_key_titles = tuple((
            ("key ratios -> profitability", "pro "),
            ("key ratios -> growth", "gro "),
            ("key ratios -> cash flow", "cf "),
            ("key ratios -> financial health", "fh "),
            ("key ratios -> efficiency ratios", "er "),
            ("revenue %", "r% "),
            ("operating income %", "oi% "),
            ("net income %", "ni% "),
            ("eps %", "eps% "),
        ))
        self.special_financials_titles = tuple((
            ("earnings per share", "eps "),
            ("weighted average shares outstanding", "waso "),
        ))

        self.translation_table = dict.fromkeys(map(ord, '",'), None)

    def most_recent_quarter(self):
        quarter = (self.today.month - 1) // 3
        year = self.today.year
        month = quarter * 3 + 1  # first month of the current quarter
        return datetime(year=year, month=month, day=1).date().isoformat()

    def find_column(self, col, mapper, subtitle=''):
        col = col.lower().replace('"', '')
        wst = subtitle + col
        alt = ''

        for k, v in mapper:
            if wst.startswith(k):
                return v
            elif col.startswith(k):
                alt = v

        return alt

    def convert_numerical(self, n):
        try:
            return int(n)
        except ValueError:
            try:
                return float(n)
            except ValueError:
                return n

    def get_title_multiplier(self, title):
        multipliers = ["Ths", "Mil", "Bil"]
        factors = [10**3, 10**6, 10**9]
        for i, multi in enumerate(multipliers):
            if title.endswith(multi):
                return factors[i]

        return 1

    def parse_csv(self,
                  csv_r,
                  num_cols,
                  special_titles,
                  column_map,
                  start_dic={}):
        subhead = ''
        next(csv_r)  # skip headers

        return_dics = []

        for cols in csv_r:
            row_cols = len(cols)
            if row_cols == 0:
                continue
            elif row_cols == 1:
                subhead = self.find_column(cols[0], special_titles)
            else:
                db_col = self.find_column(cols[0],
                                          column_map,
                                          subtitle=subhead)
                if db_col:
                    multi = self.get_title_multiplier(cols[0])

                    if len(return_dics) == 0:
                        return_dics = [
                            start_dic.copy() for _ in range(num_cols)
                        ]

                    for i in range(num_cols):
                        cell = cols[i + 1].translate(self.translation_table)
                        val = self.convert_numerical(
                            cell) * multi if cell else None
                        if db_col in self.year_month_cols and val is not None:
                            # blank cells stay None; 'TTM' maps to the most
                            # recent quarter, otherwise parse "YYYY-MM"
                            val = (self.ttm_string if val == 'TTM' else
                                   datetime.strptime(val, '%Y-%m'))
                        return_dics[i][db_col] = val
        return return_dics

    def get_key_stats(self, ticker, db_exchange="TSX"):
        """
        Fetches key statistics from Morning Star and upserts
        them into the database.
        """
        url = ("http://financials.morningstar.com/ajax/exportKR2CSV.html?t={}&"
               "culture=en-CA&region=CAN&order=asc&r={}").format(
                   ticker, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)
        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))

        on_morningstar = csv_r and resp.headers['content-length'] != '0'
        if on_morningstar:
            LOGGER.info("Getting key statistics for {}... ".format(ticker))
        else:
            LOGGER.info("Skipping {}".format(ticker))
            return 1

        return_dics = self.parse_csv(csv_r,
                                     10,
                                     self.special_key_titles,
                                     self.column_key_map,
                                     start_dic={
                                         "ticker": ticker,
                                         "exchange": db_exchange,
                                         "update_date": self.today
                                     })

        for d in return_dics:
            stmt = insert(MorningStarKeyStatistics).values(
                d).on_conflict_do_update(constraint='ms_key_statistics_pkey',
                                         set_=d)
            self.session.execute(stmt)

        self.session.commit()

        LOGGER.info("Done")
        return 0

    def get_financial(self, ticker, period_name, exchange="XTSE"):
        """
        Fetches yearly or quarterly financial data from
        Morning Star.

        period_name: "quarter" or "annual"
        exchange: XTSE (TSX)
        """

        # this converts the morning star exchange name to our database name
        if exchange in self.exchange_map:
            db_exchange = self.exchange_map[exchange]
        else:
            raise ValueError("Exchange unsupported {}".format(exchange))

        period = 3 if period_name == "quarter" else 12

        url = (
            "http://financials.morningstar.com/ajax/ReportProcess4CSV.html?&t="
            "{}:{}&region=can&culture=en-US&cur=&reportType=is&period={}&"
            "dataType=A&order=desc&columnYear=5&curYearPart=1st5year&"
            "rounding=1&view=raw&r={}&denominatorView=raw&number=1").format(
                exchange, ticker, period, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)

        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))

        on_morningstar = csv_r and resp.headers['content-length'] != '0'

        if on_morningstar:
            LOGGER.info("Getting {} financial data for {}... ".format(
                period_name, ticker))
        else:
            LOGGER.info("Skipping {}".format(ticker))
            return 1

        num_cols = 6 if period_name == "quarter" else 5  # skip last column if not quarter view (removes TTM)
        return_dics = self.parse_csv(csv_r,
                                     num_cols,
                                     self.special_financials_titles,
                                     self.column_financials_map,
                                     start_dic={
                                         "ticker": ticker,
                                         "exchange": db_exchange,
                                         "period": period,
                                         "update_date": self.today
                                     })

        for d in return_dics:
            stmt = insert(MorningStarFinancials).values(
                d).on_conflict_do_update(constraint='fiscal_year_unique',
                                         set_=d)
            self.session.execute(stmt)

        self.session.commit()

        LOGGER.info("Done")
        return 0

    def fetch_all(self, db_exchange):
        q = self.session.query(Listings).filter(
            Listings.exchange == db_exchange,
            or_(Listings.onms == True, Listings.onms.is_(None)))

        for listing in q:
            ticker = listing.ticker
            found1 = self.get_key_stats(ticker)
            found2 = self.get_financial(ticker, "quarter")
            found3 = self.get_financial(ticker, "annual")
            # listed on Morningstar if any of the three fetches succeeded
            # (each returns 0 on success, 1 on a skip)
            on_morningstar = not (found1 and found2 and found3)
            self.session.query(Listings).filter(
                Listings.exchange == db_exchange,
                Listings.ticker == ticker).update(
                    {Listings.onms: on_morningstar})

    def clean_exit(self):
        self.session.close()
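
A minimal usage sketch (hypothetical ticker; assumes the DB models, LOGGER, and imports used above are in place):

mss = MorningStarScaper()
mss.get_key_stats("BMO")             # key-ratio statistics
mss.get_financial("BMO", "quarter")  # quarterly income statement
mss.get_financial("BMO", "annual")   # annual income statement
mss.clean_exit()
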
Example No. 5
        imp = imp.fit(train_data)

        train_ticker_names = np.array(train_ticker_names, dtype=str)
        train_data = imp.transform(train_data)
        train_targets = np.array(train_targets, dtype=float)
        test_ticker_names = np.array(test_ticker_names, dtype=str)
        test_data = imp.transform(np.array(test_data, dtype=float))
        test_targets = np.array(test_targets, dtype=float)

        if not os.path.exists(self.dir_path):
            os.makedirs(self.dir_path)

        LOGGER.info("Saving file at: {}".format(self.file_path))

        np.savez(self.file_path,
                 train_data=train_data,
                 train_targets=train_targets,
                 train_ticker_names=train_ticker_names,
                 test_data=test_data,
                 test_targets=test_targets,
                 test_ticker_names=test_ticker_names)


if __name__ == "__main__":
    from sa.database import Session

    sess = Session()
    fc = FeatureHelper(sess)
    fc.generate_and_save_feature_data(independent=False)
    fc.screen_and_save_feature_data()
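
The saved arrays can be read back with np.load (hypothetical .npz path; the keys match the keyword arguments passed to np.savez above):

import numpy as np

data = np.load("features.npz")
train_data, train_targets = data["train_data"], data["train_targets"]
test_data, test_targets = data["test_data"], data["test_targets"]
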
Example No. 6
    def __init__(self):
        self.sess = Session()
Example No. 7
    def __init__(self, url="http://www.tsx.com/resource/en/571"):
        self.today = datetime.today().date()
        self.session = Session()
        self.url = url
Example No. 8
class ListManager():
    def __init__(self, url="http://www.tsx.com/resource/en/571"):
        self.today = datetime.today().date()
        self.session = Session()
        self.url = url

    def get_quotes(self):
        """
        Gets the tickers and various other listing information from the TSX
        website (a hardcoded spreadsheet URL) and inserts it into the database.
        """
        recent_date, = self.session.query(func.max(
            Listings.updatedate)).first()

        if self.url.startswith("http"):
            req = create_url_request(self.url)
            self.url = urllib.request.urlopen(req)

        sheet = pd.read_excel(self.url,
                              skiprows=5,
                              header=1,
                              keep_default_na=False)
        sheet.fillna('', inplace=True)
        sheet.rename(columns=self.cleanse_str, inplace=True)

        file_date = self.find_date_in_list(list(sheet.columns.values))

        if recent_date is None or (file_date > recent_date):
            xlsx_dict = sheet.to_dict(orient="records")
            recent_date = file_date
        else:
            LOGGER.info("Already up to date")
            return

        row_names = [
            "ticker",
            "exchange",
            "name",
            "sector",
            "osshares",
            "dateoflisting",
            "listingtype",
            "volume",
            "value",
        ]

        all_excel_names = tuple(xlsx_dict[0].keys())
        base_wanted_excel_names = [
            "Root Ticker",
            "Exchange",
            "Name",
            "Sector",
            "O/S",
            "Date of TSX Listing",
            "Listing Type",
            "Volume YTD",
            "Value (C$)",
        ]
        wanted_excel_names = []
        for bxn in base_wanted_excel_names:
            for xn in all_excel_names:
                if xn.startswith(bxn):
                    wanted_excel_names.append(xn)
                    break

        assert (len(base_wanted_excel_names) == len(wanted_excel_names) ==
                len(row_names))

        value_dics = []
        for row in xlsx_dict:
            value_dic = {"updatedate": recent_date}
            for excel_name, row_name in zip(wanted_excel_names, row_names):
                val = row[excel_name]
                if row_name == "dateoflisting":
                    val = datetime.strptime(str(val),
                                            "%Y%m%d")  # assume YYYYMMDD
                if val == '':
                    val = None
                value_dic[row_name] = val
            value_dics.append(value_dic)

        self.session.execute(insert(Listings).values(value_dics))
        self.session.commit()

    def get_historic_events(self):
        """
        Gets all the historical events from yahoo, updating only the new entries
        based on the date of the last fetch.
        """
        exchange = "TSX"
        listings = self.session.query(
            Listings.ticker,
            Listings.dateoflisting).filter(Listings.exchange == exchange)

        dict_fields = ["index", "action", "value"]
        fields = ["exchange", "ticker", "date", "action", "value"]
        total_listings = listings.count()

        for counter, (ticker, listdate) in enumerate(listings):
            lastdate, = self.session.query(func.max(
                EventHistory.updatedate)).filter(
                    EventHistory.exchange == exchange,
                    EventHistory.ticker == ticker).first()

            startdate = listdate if lastdate is None else lastdate + timedelta(
                days=1)

            rows = []
            if startdate < self.today:
                yahoo_ticker = ticker + ".TO"

                dividend_dict = self.ticker_history(startdate,
                                                    self.today,
                                                    yahoo_ticker,
                                                    info='dividend')
                split_dict = self.ticker_history(startdate,
                                                 self.today,
                                                 yahoo_ticker,
                                                 info='split')
                rows = []
                for row in dividend_dict:
                    rows.append([
                        exchange, ticker, row["date"], "DIVIDEND",
                        row["dividends"], self.today
                    ])
                for row in split_dict:
                    rows.append([
                        exchange, ticker, row["date"], "SPLIT",
                        row["stock_splits"], self.today
                    ])

            if rows:
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter + 1, total_listings, ticker, startdate,
                    self.today))
                stmt = insert(EventHistory).values(
                    rows).on_conflict_do_nothing(
                        constraint='event_history_pkey')
                self.session.execute(stmt)
                self.session.commit()
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter + 1, total_listings, ticker))

    def get_historic_prices(self):
        """
        Gets all the historical prices from yahoo, updating only the new entries
        based on the date of the last fetch.
        """

        exchange = "TSX"

        listings = list(
            self.session.query(
                Listings.ticker,
                Listings.dateoflisting).filter(Listings.exchange == exchange))
        total_listings = len(listings)

        for counter, (ticker, listdate) in enumerate(listings, 1):
            lastdate, = self.session.query(func.max(PriceHistory.date)).filter(
                PriceHistory.exchange == exchange,
                PriceHistory.ticker == ticker).first()

            startdate = listdate if lastdate is None else lastdate + timedelta(
                days=1)

            his_dict = []
            if startdate < self.today:
                yahoo_ticker = ticker + ".TO"
                start_dic = {"exchange": exchange, "ticker": ticker}
                his_dict = self.ticker_history(startdate,
                                               self.today,
                                               yahoo_ticker,
                                               info="quote",
                                               start_dic=start_dic)

            if his_dict:
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter, total_listings, ticker, startdate, self.today))

                for d in his_dict:
                    stmt = insert(PriceHistory).values(
                        d).on_conflict_do_update(
                            constraint='price_history_pkey', set_=d)
                    self.session.execute(stmt)

                self.session.commit()
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter, total_listings, ticker))

    def cleanse_str(self, raw_str):
        return raw_str.replace('\n', ' ').replace("  ", ' ')

    def find_date_in_list(self, strings):
        """
        Returns the first date that occurs in a list of strings,
        or the current date if none is detected.
        """
        cur_date = self.today  # default = cur. date
        for s in strings:
            try:
                temp_date = dparser.parse(s, fuzzy=True).date()
            except ValueError:
                continue

            if cur_date != temp_date:
                cur_date = temp_date
                break
        return cur_date

    def convert_yahoo_element(self, element):
        converted = None
        try:
            converted = float(element)
        except ValueError:
            try:
                converted = datetime.strptime(element, "%Y-%m-%d")
            except ValueError:
                if element == 'null':
                    converted = None
                elif '/' in element:
                    try:
                        a, b = element.split('/')
                        converted = float(a) / float(b)
                    except ValueError:
                        LOGGER.info("Unable to convert {}".format(element))
                else:
                    LOGGER.info("Unable to convert {}".format(element))

        return converted

    def ticker_history(self, start, end, ticker, info='quote', start_dic={}):
        """
        Gets and returns the historic prices for a given ticker within the
        time period provided, inclusive.
        """

        start_str = start.strftime('%Y%m%d')
        end_str = end.strftime('%Y%m%d')

        # info = 'quote', 'dividend', 'split'
        try:
            data = yqd.load_yahoo_quote(ticker, start_str, end_str, info=info)
        except (HTTPError, URLError, gaierror) as e:
            LOGGER.info("Yahoo request failed ({}). Blocked?".format(e))
            return []

        titles = tuple(t.replace(' ', '_').lower() for t in data[0].split(','))

        history = []
        for row in data[1:-1]:
            history_row = {k: v for k, v in start_dic.items()}
            iter_list = row.split(',')

            for element, title in zip(iter_list, titles):
                converted = self.convert_yahoo_element(element)
                history_row[title] = converted
            history.append(history_row)
        return history

    def clean_exit(self):
        self.session.close()
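
A minimal usage sketch, assuming the database schema and the TSX spreadsheet URL used above:

lm = ListManager()
lm.get_quotes()           # refresh the listings table from the TSX sheet
lm.get_historic_events()  # incremental dividend/split history via Yahoo
lm.get_historic_prices()  # incremental price history via Yahoo
lm.clean_exit()
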
Example No. 9
 def __init__(self):
     self.y_to_db_map = {
         'Forward P/E': 'forward_pe',
         'Return on Equity': 'ro_equity',
         'Current Ratio': 'current_ratio',
         'Total Debt': 'total_debt',
         'Forward Annual Dividend Rate': 'forward_annual_dividend_rate',
         'Last Split Date': 'last_split_date',
         'Market Cap (intraday)': 'market_cap',
         'EBITDA': 'ebitda',
         'Shares Short': 'shares_short',
         '50-Day Moving Average': 'fifty_day_moving_avg',
         '52 Week High': 'fifty_two_week_high',
         'Quarterly Earnings Growth': 'q_earnings_growth',
         'Forward Annual Dividend Yield': 'forward_annual_dividend_yield',
         'Beta': 'beta',
         'Payout Ratio': 'payout_ratio',
         'Avg Vol (3 month)': 'avg_vol_3_month',
         'Enterprise Value': 'enterprise_value',
         '5 Year Average Dividend Yield': 'five_year_avg_dividend_yield',
         'Enterprise Value/Revenue': 'enterprise_value_revenue',
         'Trailing P/E': 'trailing_pe',
         'Total Cash': 'total_cash',
         'Operating Cash Flow': 'operating_cash_flow',
         'Price/Book': 'price_book',
         'Fiscal Year Ends': 'fiscal_year_ends',
         'Total Debt/Equity': 'total_debt_equity',
         'Dividend Date': 'dividend_date',
         'Most Recent Quarter': 'most_recent_q',
         'Operating Margin': 'operating_margin',
         'Ex-Dividend Date': 'exdividend_date',
         '% Held by Institutions': 'perc_held_by_institutions',
         'Trailing Annual Dividend Yield': 'trailing_annual_dividend_yield',
         '200-Day Moving Average': 'two_hundred_day_moving_avg',
         '52 Week Low': 'fifty_two_week_low',
         'Avg Vol (10 day)': 'avg_vol_10_day',
         'Last Split Factor (new per old)': 'last_split_factor',
         '% Held by Insiders': 'perc_held_by_insiders',
         'Revenue Per Share': 'revenue_per_share',
         'Short Ratio': 'short_ratio',
         'Shares Short (prior month)': 'shares_short_prior_month',
         'Short % of Float': 'short_perc_float',
         'Profit Margin': 'profit_margin',
         'Return on Assets': 'ro_assets',
         'Price/Sales': 'price_sales',
         'Gross Profit': 'gross_profit',
         'Book Value Per Share': 'book_value_per_share',
         'Levered Free Cash Flow': 'levered_free_cash_flow',
         'Trailing Annual Dividend Rate': 'trailing_annual_dividend_rate',
         'Diluted EPS': 'diluted_eps',
         'PEG Ratio (5 yr expected)': 'peg_ratio_5yr',
         'Shares Outstanding': 'shares_outstanding',
         'Revenue': 'revenue',
         'Float': 'float',
         'Net Income Avi to Common': 'net_income_avi_common',
         'Enterprise Value/EBITDA': 'enterprise_value_ebitda',
         '52-Week Change': 'fifty_two_week_change',
         'Quarterly Revenue Growth': 'q_revenue_growth',
         'Total Cash Per Share': 'total_cash_ps'
     }
     self.convert_dict = {'K': 10**3, 'M': 10**6, 'B': 10**9, 'T': 10**12}
     self.condensed_pat = re.compile(r"([+-]?\d*[\.]?\d+)([kmbtKMBT])$")
     self.float_pat = re.compile(r"[+-]?\d*[\.]\d+$")
     self.parenthese_pat = re.compile(r" *\(([^)]*)\)")
     self.date_line_pat = re.compile(r"\(as of (\d+.*\d+)\)")
     self.url_ticker_pat = re.compile(r".*/quote/(.*)\.(.*)/key-statistics")
     self.keywords = {"mrq", "ttm", "yoy", "lfy", "fye"}
     self.today = datetime.today().date()
     self.default_fye = datetime(self.today.year, 12, 31)
     self.session = Session()
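
To illustrate condensed_pat and convert_dict above: the pattern captures a signed decimal and a magnitude suffix, which is then scaled to a plain integer (a self-contained sketch):

import re

condensed_pat = re.compile(r"([+-]?\d*[\.]?\d+)([kmbtKMBT])$")
convert_dict = {'K': 10**3, 'M': 10**6, 'B': 10**9, 'T': 10**12}

m = condensed_pat.search("1.25B")
value = int(float(m.group(1)) * convert_dict[m.group(2).upper()])
# value == 1250000000
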
Example No. 10
class YahooScraper():
    def __init__(self):
        self.y_to_db_map = {
            'Forward P/E': 'forward_pe',
            'Return on Equity': 'ro_equity',
            'Current Ratio': 'current_ratio',
            'Total Debt': 'total_debt',
            'Forward Annual Dividend Rate': 'forward_annual_dividend_rate',
            'Last Split Date': 'last_split_date',
            'Market Cap (intraday)': 'market_cap',
            'EBITDA': 'ebitda',
            'Shares Short': 'shares_short',
            '50-Day Moving Average': 'fifty_day_moving_avg',
            '52 Week High': 'fifty_two_week_high',
            'Quarterly Earnings Growth': 'q_earnings_growth',
            'Forward Annual Dividend Yield': 'forward_annual_dividend_yield',
            'Beta': 'beta',
            'Payout Ratio': 'payout_ratio',
            'Avg Vol (3 month)': 'avg_vol_3_month',
            'Enterprise Value': 'enterprise_value',
            '5 Year Average Dividend Yield': 'five_year_avg_dividend_yield',
            'Enterprise Value/Revenue': 'enterprise_value_revenue',
            'Trailing P/E': 'trailing_pe',
            'Total Cash': 'total_cash',
            'Operating Cash Flow': 'operating_cash_flow',
            'Price/Book': 'price_book',
            'Fiscal Year Ends': 'fiscal_year_ends',
            'Total Debt/Equity': 'total_debt_equity',
            'Dividend Date': 'dividend_date',
            'Most Recent Quarter': 'most_recent_q',
            'Operating Margin': 'operating_margin',
            'Ex-Dividend Date': 'exdividend_date',
            '% Held by Institutions': 'perc_held_by_institutions',
            'Trailing Annual Dividend Yield': 'trailing_annual_dividend_yield',
            '200-Day Moving Average': 'two_hundred_day_moving_avg',
            '52 Week Low': 'fifty_two_week_low',
            'Avg Vol (10 day)': 'avg_vol_10_day',
            'Last Split Factor (new per old)': 'last_split_factor',
            '% Held by Insiders': 'perc_held_by_insiders',
            'Revenue Per Share': 'revenue_per_share',
            'Short Ratio': 'short_ratio',
            'Shares Short (prior month)': 'shares_short_prior_month',
            'Short % of Float': 'short_perc_float',
            'Profit Margin': 'profit_margin',
            'Return on Assets': 'ro_assets',
            'Price/Sales': 'price_sales',
            'Gross Profit': 'gross_profit',
            'Book Value Per Share': 'book_value_per_share',
            'Levered Free Cash Flow': 'levered_free_cash_flow',
            'Trailing Annual Dividend Rate': 'trailing_annual_dividend_rate',
            'Diluted EPS': 'diluted_eps',
            'PEG Ratio (5 yr expected)': 'peg_ratio_5yr',
            'Shares Outstanding': 'shares_outstanding',
            'Revenue': 'revenue',
            'Float': 'float',
            'Net Income Avi to Common': 'net_income_avi_common',
            'Enterprise Value/EBITDA': 'enterprise_value_ebitda',
            '52-Week Change': 'fifty_two_week_change',
            'Quarterly Revenue Growth': 'q_revenue_growth',
            'Total Cash Per Share': 'total_cash_ps'
        }
        self.convert_dict = {'K': 10**3, 'M': 10**6, 'B': 10**9, 'T': 10**12}
        self.condensed_pat = re.compile(r"([+-]?\d*[\.]?\d+)([kmbtKMBT])$")
        self.float_pat = re.compile(r"[+-]?\d*[\.]\d+$")
        self.parenthese_pat = re.compile(r" *\(([^)]*)\)")
        self.date_line_pat = re.compile(r"\(as of (\d+.*\d+)\)")
        self.url_ticker_pat = re.compile(r".*/quote/(.*)\.(.*)/key-statistics")
        self.keywords = {"mrq", "ttm", "yoy", "lfy", "fye"}
        self.today = datetime.today().date()
        self.default_fye = datetime(self.today.year, 12, 31)
        self.session = Session()

    def s2n(self, string):
        reg = self.condensed_pat.search(string)
        return int(
            float(reg.group(1)) * self.convert_dict[reg.group(2).upper()])

    @staticmethod
    def s2p(string):
        return float(string.strip('%'))

    @staticmethod
    def s2r(string):
        split = string.split(':')
        return float(split[0]) / float(split[1])

    def parse_numeric(self, string):
        try:
            if '%' in string:
                return self.s2p(string)
            elif self.condensed_pat.match(string) is not None:
                return self.s2n(string)
            elif self.float_pat.match(string) is not None:
                return float(string)
            elif string.isdigit():
                return int(string)
            elif ':' in string:
                return self.s2r(string)
            else:
                return dp.parse(string).date().isoformat()
        except ValueError:
            return None

    def cleanse_str(self, string):
        return self.parenthese_pat.sub('', string.replace(',', '')).strip(':')

    def dic_parse(self, session, url, html):
        def innerHtml(ele):
            return ele.decode_contents(formatter="html")

        soup = BeautifulSoup(html, "lxml")
        ticker = self.url_ticker_pat.search(url).group(1)
        exchange = "TSX"

        on_yahoo = soup.find(
            'section', attrs={'data-test': 'lookup-page'}) is None
        session.query(Listings).filter(Listings.exchange == exchange,
                                       Listings.ticker == ticker).update(
                                           {Listings.onyahoo: on_yahoo})

        if not on_yahoo:  # if quote not found, exit
            LOGGER.error("Failed to find quote for {} skipping".format(url))
            return

        div_test = soup.find('section', attrs={'data-test': 'qsp-statistics'})
        if div_test is None:
            LOGGER.error("Unknown error for {} skipping".format(url))
            return

        db_dic = {}
        for table in div_test.find_all('table'):
            for row in table.find_all('tr'):
                td_list = row.find_all('td')
                title = innerHtml(td_list[0].find('span'))
                val = innerHtml(td_list[1]) if td_list[1].find(
                    'span') is None else innerHtml(td_list[1].find('span'))
                if title in self.y_to_db_map:
                    db_dic[self.y_to_db_map[title]] = self.parse_numeric(val)

        if db_dic:
            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange
            exists = session.query(KeyStatistics).filter_by(
                **db_dic).scalar() is not None

            if exists:
                LOGGER.info("Skipping {} due to prior existence".format(url))
            else:
                db_dic["update_date"] = self.today

                stmt = insert(KeyStatistics).values(
                    db_dic).on_conflict_do_nothing(
                        constraint='key_statistics_pkey')
                session.execute(stmt)
                session.commit()

                LOGGER.info("Done parsing {}".format(url))
        else:
            LOGGER.info("Skipping {}".format(url))

    def fetch_all(self, exchange):
        q = self.session.query(Listings).filter(
            Listings.exchange == exchange,
            or_(Listings.onyahoo == True, Listings.onyahoo.is_(None)))

        extension = '.TO'
        urls = [
            "https://ca.finance.yahoo.com/quote/{}{}/key-statistics".format(
                l.ticker, extension) for l in q
        ]

        xpath_hooks = [
            "//section[@data-test='qsp-statistics']",
            "//section[@data-test='lookup-page']"
        ]

        LOGGER.info("Fetching/Updating {} urls.".format(len(urls)))

        jsps = JSPageScraper(self.dic_parse, xpath_hooks, "key_statistics")
        jsps.go(urls)

    def clean_exit(self):
        self.session.close()
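
A minimal usage sketch (assumes the JSPageScraper helper and DB models used above):

ys = YahooScraper()
ys.fetch_all("TSX")  # scrape key statistics for every TSX listing
ys.clean_exit()
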
Example No. 11
class YahooApiScraper():
    def __init__(self):
        # This is a reverse engineering of the Yahoo Finance REST API
        # Information off: http://www.jarloo.com/yahoo_finance/
        self.y_to_db_map = {'n': 'name', 'y': 'dividend_yield', 'd': 'dividend_ps',
                            'r': 'pe', 'r1': 'dividend_pay_date', 'q': 'ex_dividend_date',
                            'o': 'open', 'c1': 'change', 'p2': 'perc_change', 'd1': 'last_trade_date',
                            'd2': 'trade_date', 'c3': 'commission', 'g': 'day_low', 'h': 'day_high',
                            'p': 'previous_close', 't8': 'year_target', 'm5': 'change_mv_avg_200',
                            'm6': 'perc_change_mv_avg_200', 'm7': 'change_mv_avg_50', 'm8': 'perc_change_mv_avg_50',
                            'm3': 'mv_avg_50', 'm4': 'mv_avg_200', 'w1': 'day_value_change',
                            'g1': 'holding_gain_perc', 'g3': 'annualized_gain', 'g4': 'holdings_gain',
                            'k': 'high_52_week', 'j': 'low_52_week', 'j5': 'change_52_week_low',
                            'k4': 'change_52_week_high', 'j6': 'perc_change_52_week_low',
                            'k5': 'perc_change_52_week_high', 'j1': 'market_cap',
                            'f6': 'float_shares', 'x': 'stock_exchange', 's1': 'shares_owned',
                            'j2': 'shares_outstanding', 'n4': 'notes', 'i': 'more_info',
                            'v': 'volume', 'a2': 'avg_daily_volume', 'e': 'eps', 'e7': 'eps_year_estimate',
                            'e8': 'eps_next_year_estimate', 'e9': 'eps_next_q_estimate', 'b4': 'book',
                            'j4': 'ebitda', 'p5': 'price_sale', 'p6': 'price_book', 'r5': 'peg',
                            'r6': 'price_eps_estimate_year', 'r7': 'price_eps_estimate_next_year', 's7': 'short_ratio',
                            's6': 'revenue', 'v1': 'holdings_val', 'l2': 'high_limit', 'l3': 'low_limit',
                            'a': 'ask', 'b': 'bid'}
        self.convert_dict = {'K': 10 ** 3, 'M': 10 ** 6, 'B': 10 ** 9, 'T': 10 ** 12}
        self.condensed_pat = re.compile(r"([+-]?\d*\.?\d+)([kmbtKMBT])")
        self.url_flags = tuple(self.y_to_db_map.keys())
        self.url_str_flags = "".join(self.url_flags)
        self.db_entries = tuple(self.y_to_db_map.values())
        self.float_pat = re.compile(r"[+-]?(\d*[\.])?\d+$")
        self.today = datetime.today().date()
        self.base_url = "http://finance.yahoo.com/d/quotes.csv"

        # Sometimes websites are friendlier to iOS devices :)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
        }
        self.session = Session()

    def handle_csv_string(self, string):
        if string == 'N/A':
            return None
        elif '%' in string:
            return float(string.strip('%'))
        elif string.isdigit():
            return int(string)
        elif self.condensed_pat.match(string) is not None:
            reg = self.condensed_pat.search(string)
            return int(float(reg.group(1)) * self.convert_dict[reg.group(2).upper()])
        elif self.float_pat.match(string) is not None:
            return float(string)
        else:
            try:
                return dp.parse(string).date().isoformat()
            except ValueError:
                return string

    def chunks(self, l, n):
        return [l[i:i + n] for i in range(0, len(l), n)]

    def create_url(self, tickers):
        ticker_str = "+".join(tickers)
        url = "{}?s={}&f={}".format(self.base_url, ticker_str, self.url_str_flags)
        return url

    def handle_url(self, tickers, url, exchange):
        """
        Fetches the url and inserts the data into the appropriate cols in the DB.
        """
        LOGGER.info("Starting to add url: {} ...".format(url))

        req = urllib.request.Request(url, headers=self.headers)
        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))

        db_list = []
        for row, ticker in zip(csv_r, tickers):
            assert(len(row) == len(self.url_flags))

            db_dic = {db_col: self.handle_csv_string(cell) for cell, db_col in zip(row, self.db_entries)}

            onyahoo = any(v is not None for v in db_dic.values())

            self.session.query(Listings).filter(Listings.exchange == exchange,
                                                Listings.ticker == ticker
            ).update({Listings.onyahoo: onyahoo})

            if not onyahoo: # not found, skip
                LOGGER.error("Failed to find quote for {} skipping".format(ticker))
                continue

            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange

            exists = self.session.query(YahooKeyStatistics).filter_by(**db_dic).scalar() is not None
            if exists:
                LOGGER.info("Skipping {} due to prior existence".format(ticker))
                continue

            db_dic["update_date"] = self.today

            # Annoyingly enough, sqlalchemy doesn't allow PostgreSQL bulk
            # inserts when checking constraints, so rows go in one at a
            # time (RIP performance).
            stmt = insert(YahooKeyStatistics).values(db_dic).on_conflict_do_nothing(
                constraint='yahoo_key_statistics_pkey')
            self.session.execute(stmt)
        self.session.commit()

        LOGGER.info("Done url.")

    def fetch_all(self, exchange):
        extension = '.TO'
        tickers = tuple(
            x.ticker + extension
            for x in self.session.query(Listings.ticker).filter(
                Listings.exchange == exchange,
                or_(Listings.onyahoo == True, Listings.onyahoo.is_(None))))

        ticker_groups = self.chunks(tickers, 200)

        LOGGER.info("Fetching/Updating {} urls.".format(len(ticker_groups)))

        for ticker_group in ticker_groups:
            url = self.create_url(ticker_group)
            self.handle_url(ticker_group, url, exchange)
            sleep(1) # limit requests to 1/s

    def clean_exit(self):
        self.session.close()
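
A minimal usage sketch (again, the quotes.csv endpoint has since been retired, so this is illustrative only):

yas = YahooApiScraper()
yas.fetch_all("TSX")  # fetch key statistics in batches of 200 tickers
yas.clean_exit()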