Example #1
File: core.py Project: raufer/fortune
def crawl_ticker_symbols(driver: WebDriver = None, url: str = None, soup=None) -> List[Tuple[str, str]]:
    """
    Extracts all of the ticker symbols that are involved
    Returns a list of strings [(exchange, symbol)]
    """
    logger.info(f"Extracting ticker symbols in article scope")

    if soup is None:
        html = api.get(driver, url, headers=make_headers('wsj'), wait_for=2)
        soup = BeautifulSoup(html)

    stocks = soup.find_all('a', class_=lambda v: v and v.startswith("media-object-chiclet"), recursive=True)
    logger.debug(f"'{len(stocks)}' stocks")

    urls = list(set([urljoin(base, s['href']) for s in stocks]))
    pages = [api.get(driver, url, headers=make_headers(source='wsj')) for url in urls]

    sections = [BeautifulSoup(html).find('div', class_='cr_quotesHeader') for html in pages]

    symbols = [
        (
            re.search(r'.+:\s?([A-Z0-9a-z]+)\)?', section.find('span', class_='exchangeName', recursive=True).get_text()).group(1),
            section.find('span', class_='tickerName', recursive=True).get_text().strip()
        )
        for section in sections
    ]

    logger.info(f"Symbols found: '{[(s[0] + str('::') + s[1]) for s in symbols]}'")

    return symbols
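
The (exchange, symbol) pair for each stock is pulled out of the quote-header markup of the linked quote page. A minimal, standalone sketch of that extraction, assuming a simplified version of the WSJ cr_quotesHeader markup (the sample HTML below is illustrative, not copied from a real page):

import re
from bs4 import BeautifulSoup

# Assumed, simplified quote-header markup
sample = """
<div class="cr_quotesHeader">
    <span class="exchangeName">(U.S.: NYSE)</span>
    <span class="tickerName"> IBM </span>
</div>
"""

section = BeautifulSoup(sample, 'html.parser').find('div', class_='cr_quotesHeader')

# Same regex as above: capture the token after the colon, dropping the optional ')'
exchange = re.search(r'.+:\s?([A-Z0-9a-z]+)\)?',
                     section.find('span', class_='exchangeName').get_text()).group(1)
ticker = section.find('span', class_='tickerName').get_text().strip()

print((exchange, ticker))  # ('NYSE', 'IBM')
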
Example #2
File: core.py Project: raufer/fortune
def crawl_ticker_symbols(driver: WebDriver = None,
                         url: str = None,
                         soup=None) -> List[Tuple[str, str]]:
    """
    Extracts all of the ticker symbols that are involved
    Returns a list of strings [(exchange, symbol)]
    """
    logger.info(f"Extracting ticker symbols in article scope")

    if soup is None:
        html = api.get(driver,
                       url,
                       headers=make_headers('bloomberg'),
                       wait_for=2)
        soup = BeautifulSoup(html)

    section = soup.find('section',
                        class_=lambda v: v and 'main-column' in v,
                        recursive=True)

    references = section.find_all('a', recursive=True)
    # anchors without an href would make the membership test fail, so default to ''
    stocks = [ref for ref in references if '/quote' in (ref.get('href') or '')]
    stocks = list(
        set([
            urljoin(base, s['href']) for s in stocks
            if not s['href'].split('/')[-1][0].isdigit()
        ]))

    logger.debug(f"'{len(stocks)}' stocks")

    pages = [api.get(driver, url) for url in stocks]

    for p in pages:
        print(p)

    sections = [BeautifulSoup(result) for result in pages]
    sections = [
        section.find('meta', property='og:title')['content']
        for section in sections
    ]

    matches = [
        re.search(r'([A-Z0-9a-z]+):(.+) Stock Quote', s) for s in sections
    ]

    symbols = [(match.group(2), match.group(1)) for match in matches]

    translate = {'New York': 'NYSE', 'NASDAQ GS': 'NASDAQ'}

    symbols = [(translate[exchange], ticker) for exchange, ticker in symbols]

    logger.info(
        f"Symbols found: '{[(s[0] + '::' + s[1]) for s in symbols]}'")

    return symbols
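
Here the (exchange, symbol) pair comes from the og:title meta tag of each quote page, followed by a small exchange-name translation. A minimal sketch of that step, assuming a Bloomberg-style "TICKER:Exchange Stock Quote" title (the sample tag below is illustrative, not copied from a real page):

import re
from bs4 import BeautifulSoup

# Assumed og:title format
sample = '<meta property="og:title" content="IBM:New York Stock Quote - International Business Machines Corp">'

title = BeautifulSoup(sample, 'html.parser').find('meta', property='og:title')['content']
match = re.search(r'([A-Z0-9a-z]+):(.+) Stock Quote', title)

# Same normalisation as above: map Bloomberg exchange names to the usual abbreviations
translate = {'New York': 'NYSE', 'NASDAQ GS': 'NASDAQ'}
exchange, ticker = translate[match.group(2)], match.group(1)

print((exchange, ticker))  # ('NYSE', 'IBM')
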
Example #3
File: parsing.py Project: raufer/fortune
def parse_article(driver: WebDriver, url: str):
    """
    Given an article, parse its content into a
    representation that preserves the structure

    * Grab the HTML page
    * Crawl its contents
    * Tag the text with the different hierarchical components
    * Parse the resulting output into a graph
    * Enrich with metadata:
        - author information: name, url, etc;
        - document title;
        - publishing timestamp;
        - other metadata;
    """
    logger.info(f"Parsing article '{url}'")

    html = api.get(driver, url, make_headers(source='seekingalpha'), wait_for=2)
    soup = BeautifulSoup(html)
    logger.debug(f"Soup length '{len(soup)}'")

    hierarchy = ['Article', 'Section', 'Paragraph']

    descriptor = {
        'components': hierarchy,
        'patterns': hierarchy
    }

    text = crawl_article(soup)
    logger.info(f"Text crawled. Number of lines '{len(text)}'")

    logger.info(f"Creating a graph")
    doc = parse_iterable(text, descriptor)
    doc = doc.to_dict()

    doc['url'] = url
    doc['title'] = crawl_title(soup)
    doc['author'] = crawl_author(driver, soup)
    doc['timestamp'] = crawl_timestamp(soup)
    doc['symbols'] = crawl_ticker_symbols(soup)

    # TODO: meta (e.g. likes and comments) still has problems
    doc['meta'] = crawl_metadata(soup)

    return doc
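
A hedged usage sketch for parse_article: it expects a live Selenium WebDriver plus an article URL and returns a plain dict carrying the document graph and the metadata fields set above. The Chrome driver setup and the URL below are assumptions for illustration, not code from the project:

from selenium import webdriver

driver = webdriver.Chrome()
try:
    doc = parse_article(driver, 'https://seekingalpha.com/article/...')
    print(doc['title'])
    print(doc['author']['name'], doc['author']['followers'])
    print(doc['symbols'])
finally:
    driver.quit()
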
Example #4
def crawl_author(driver: WebDriver, soup):
    """
    Extracts the author's name, profile URL, follower count and
    article count from a SeekingAlpha article page.
    """
    logger.debug(f"Extracting author information")

    author_tag = soup.find('div', class_='media hidden-print').find(
        'div', class_='info').find('div', class_='top')

    author_url = author_tag.find('a')['href']
    logger.debug(f"Author URL: '{author_url}'")

    author_name = author_tag.find('span', class_='name').get_text()
    logger.debug(f"Author Name: '{author_name}'")

    logger.debug(f"Getting author specific page '{author_url}'")

    html = api.get(driver,
                   author_url,
                   make_headers('seekingalpha'),
                   wait_for=2)
    soup = BeautifulSoup(html)

    followers = soup.find('li',
                          class_=['followers',
                                  'followers tab ']).find('i').get_text()
    logger.debug(f"Number of followers '{followers}'")

    articles = soup.find('li', class_='articles').find(
        'i', class_='profile-top-nav-count').get_text()
    logger.debug(f"Number of articles '{articles}'")

    result = {
        'name': author_name,
        'url': author_url,
        'followers': followers,
        'articles': articles
    }

    logger.debug(f"Author Information: '{result}'")
    return result
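
The follower and article counts above rely on specific class names in the author's profile page. A standalone sketch of that lookup, run against a simplified, assumed version of the SeekingAlpha profile markup:

from bs4 import BeautifulSoup

# Assumed, simplified profile markup
sample = """
<ul>
    <li class="followers tab "><i>12,345</i></li>
    <li class="articles"><i class="profile-top-nav-count">67</i></li>
</ul>
"""

soup = BeautifulSoup(sample, 'html.parser')

followers = soup.find('li', class_=['followers', 'followers tab ']).find('i').get_text()
articles = soup.find('li', class_='articles').find('i', class_='profile-top-nav-count').get_text()

print(followers, articles)  # 12,345 67
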