def crawl_ticker_symbols(driver: WebDriver = None, url: str = None, soup=None) -> List[Tuple[str, str]]:
    """Extract all ticker symbols referenced by a WSJ article.

    NOTE(review): another function later in this file defines the same name
    for Bloomberg articles; the later definition shadows this one if both
    live in the same module — confirm they belong to separate modules.

    Args:
        driver: WebDriver used by `api.get` to fetch pages.
        url: Article URL; only fetched when `soup` is not supplied.
        soup: Pre-parsed article page; skips the initial fetch when given.

    Returns:
        List of (exchange, symbol) tuples, deduplicated by quote-page URL.
    """
    logger.info("Extracting ticker symbols in article scope")
    if soup is None:
        html = api.get(driver, url, headers=make_headers('wsj'), wait_for=2)
        soup = BeautifulSoup(html)
    # Chiclet anchors are the inline stock references embedded in the article.
    stocks = soup.find_all(
        'a',
        class_=lambda v: v and v.startswith("media-object-chiclet"),
        recursive=True)
    logger.debug(f"'{len(stocks)}' stocks")
    # Deduplicate quote-page URLs; `base` is the module-level site root.
    quote_urls = list({urljoin(base, s['href']) for s in stocks})
    # BUG FIX: the fetch comprehension previously reused the name `url`,
    # clobbering the `url` parameter; use a distinct loop variable.
    pages = [api.get(driver, quote_url, headers=make_headers(source='wsj'))
             for quote_url in quote_urls]
    sections = [BeautifulSoup(html).find('div', class_='cr_quotesHeader')
                for html in pages]
    symbols = [
        (
            # Header text looks like "(U.S.: NYSE)" -> capture "NYSE".
            re.search(r'.+:\s?([A-Z0-9a-z]+)\)?',
                      section.find('span', class_='exchangeName',
                                   recursive=True).get_text()).group(1),
            section.find('span', class_='tickerName',
                         recursive=True).get_text().strip(),
        )
        for section in sections
    ]
    logger.info(f"Symbols found: '{[(s[0] + str('::') + s[1]) for s in symbols]}'")
    return symbols
def crawl_ticker_symbols(driver: WebDriver = None, url: str = None, soup=None) -> List[Tuple[str, str]]:
    """Extract all ticker symbols referenced by a Bloomberg article.

    NOTE(review): an earlier function in this file defines the same name for
    WSJ articles; if both live in the same module this definition shadows it —
    confirm they belong to separate modules.

    Args:
        driver: WebDriver used by `api.get` to fetch pages.
        url: Article URL; only fetched when `soup` is not supplied.
        soup: Pre-parsed article page; skips the initial fetch when given.

    Returns:
        List of (exchange, symbol) tuples, deduplicated by quote-page URL.
    """
    logger.info("Extracting ticker symbols in article scope")
    if soup is None:
        html = api.get(driver, url, headers=make_headers('bloomberg'), wait_for=2)
        soup = BeautifulSoup(html)
    section = soup.find('section', class_=lambda v: v and 'main-column' in v,
                        recursive=True)
    references = section.find_all('a', recursive=True)
    # BUG FIX: anchors without an href made `'/quote' in ref.get('href')`
    # raise TypeError; default to '' so they are simply filtered out.
    stocks = [ref for ref in references if '/quote' in ref.get('href', '')]
    # Drop links whose last path segment starts with a digit (article IDs,
    # not tickers) and deduplicate the remaining quote-page URLs.
    quote_urls = list({
        urljoin(base, s['href'])
        for s in stocks
        if not s['href'].split('/')[-1][0].isdigit()
    })
    logger.debug(f"'{len(quote_urls)}' stocks")
    # BUG FIX: removed leftover `print(p)` debug loop that dumped each full
    # HTML page to stdout; also renamed the loop variable so it no longer
    # shadows the `url` parameter.
    pages = [api.get(driver, quote_url) for quote_url in quote_urls]
    titles = [
        BeautifulSoup(html).find('meta', property='og:title')['content']
        for html in pages
    ]
    # og:title looks like "TICKER:Exchange Stock Quote ...".
    matches = [re.search(r'([A-Z0-9a-z]+):(.+) Stock Quote', t) for t in titles]
    symbols = [(m.group(2), m.group(1)) for m in matches]
    # BUG FIX: an unmapped exchange name used to raise KeyError; fall back to
    # the raw name so unknown exchanges degrade gracefully instead of crashing.
    translate = {'New York': 'NYSE', 'NASDAQ GS': 'NASDAQ'}
    symbols = [(translate.get(exchange, exchange), ticker)
               for exchange, ticker in symbols]
    logger.info(
        f"Symbols found: '{[(s[0] + str('::') + s[1]) for s in symbols]}'")
    return symbols
def parse_article(driver: WebDriver, url: str):
    """Parse an article into a structured dict that preserves its hierarchy.

    Steps:
      * Grab the HTML page.
      * Crawl its contents.
      * Tag the text with the different hierarchical components.
      * Parse the resulting output into a graph.
      * Enrich with metadata: author information (name, url, ...),
        document title, publishing timestamp, other metadata.

    Args:
        driver: WebDriver used by `api.get` to fetch pages.
        url: Article URL.

    Returns:
        Dict produced by `parse_iterable(...).to_dict()`, augmented with
        'url', 'title', 'author', 'timestamp', 'symbols' and 'meta' keys.
    """
    logger.info(f"Parsing article '{url}'")
    html = api.get(driver, url, make_headers(source='seekingalpha'), wait_for=2)
    soup = BeautifulSoup(html)
    logger.debug(f"Soup length '{len(soup)}'")
    hierarchy = ['Article', 'Section', 'Paragraph']
    descriptor = {'components': hierarchy, 'patterns': hierarchy}
    text = crawl_article(soup)
    logger.info(f"Text crawled. Number of lines '{len(text)}'")
    logger.info("Creating a graph")
    doc = parse_iterable(text, descriptor)
    doc = doc.to_dict()
    doc['url'] = url
    doc['title'] = crawl_title(soup)
    doc['author'] = crawl_author(driver, soup)
    doc['timestamp'] = crawl_timestamp(soup)
    # BUG FIX: `crawl_ticker_symbols(soup)` passed the soup positionally,
    # binding it to the `driver` parameter (signature is driver, url, soup),
    # so the crawl never received the parsed page. Pass it by keyword.
    doc['symbols'] = crawl_ticker_symbols(soup=soup)
    # TODO: meta, e.g likes and comments are still with problems
    doc['meta'] = crawl_metadata(soup)
    return doc
def crawl_author(driver: WebDriver, soup):
    """Return the article author's name, profile URL, follower count and
    article count, scraped from the article page and the author's own
    profile page."""
    logger.debug("Extracting author information")
    # The author card sits under: media hidden-print > info > top.
    card = (soup.find('div', class_='media hidden-print')
                .find('div', class_='info')
                .find('div', class_='top'))
    author_url = card.find('a')['href']
    logger.debug(f"Author URL: '{author_url}'")
    author_name = card.find('span', class_='name').get_text()
    logger.debug(f"Author Name: '{author_name}'")
    # Follow the link to the author's profile page for the counters.
    logger.debug(f"Getting author specific page '{author_url}'")
    profile = BeautifulSoup(
        api.get(driver, author_url, make_headers('seekingalpha'), wait_for=2))
    followers = (profile
                 .find('li', class_=['followers', 'followers tab '])
                 .find('i')
                 .get_text())
    logger.debug(f"Number of followers '{followers}'")
    articles = (profile
                .find('li', class_='articles')
                .find('i', class_='profile-top-nav-count')
                .get_text())
    logger.debug(f"Number of articles '{articles}'")
    result = {
        'name': author_name,
        'url': author_url,
        'followers': followers,
        'articles': articles,
    }
    logger.debug(f"Author Information: '{result}'")
    return result