Example #1
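These snippets assume a shared set of module-level imports that the excerpts do not show. A minimal sketch of that import block, inferred from the names used below (helpers such as get_ticker_objects_from_description, NewsArticle, and the various check_* functions are defined elsewhere in the project):

import csv
import datetime
import re
from urllib.error import HTTPError
from urllib.request import Request, urlopen

import feedparser
from bs4 import BeautifulSoup
from pytz import timezone
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait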
def ping_gnw_rss_news_feed(url):
    """
    Pings a Globe Newswire RSS feed and returns all articles present
    :param url: url of RSS feed
    :return: list of article objects
    """
    feed = feedparser.parse(url)
    output = []
    for entry in feed.entries:
        ticker_object_list = None
        for tag in entry.tags:
            # Call once per tag and keep the last non-empty match
            tickers = get_ticker_objects_from_description(tag.term)
            if tickers:
                ticker_object_list = tickers

        if ticker_object_list and entry.language == 'en':
            date_time_utc = entry.published
            date_time_utc_object = datetime.datetime.strptime(date_time_utc, '%a, %d %b %Y %H:%M %Z') \
                .replace(tzinfo=timezone('UTC'))
            date_time_eastern_object = date_time_utc_object.astimezone(
                timezone('US/Eastern'))

            description_html = entry.description
            description = re.sub('<[^<]+?>', '', description_html)
            title = entry.title
            link = entry.link.split('?')[0]
            news_article = NewsArticle(date_time_eastern_object, title,
                                       ticker_object_list, description, link,
                                       'Globe Newswire')

            output.append(news_article)

    return output
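A minimal usage sketch, assuming NewsArticle exposes its constructor arguments as attributes (the feed URL is a placeholder, not a real endpoint):

articles = ping_gnw_rss_news_feed('https://www.globenewswire.com/rss/...')
for article in articles:
    print(article.title, article.link)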
Example #2
def ping_bus_wire_rss_news_feed(url):
    """
    Pull all articles from the RSS feed that have a stock ticker in the article
    :param url: url of RSS Feed
    :return: List of article objects
    """
    feed = feedparser.parse(url)
    output = []
    for entry in feed.entries:
        description = entry.description
        ticker_object_list = get_ticker_objects_from_description(description)

        if ticker_object_list:
            # Feed timestamps end in 'UT'; append 'C' so %Z can parse it as 'UTC'
            date_time_utc = entry.published + 'C'
            date_time_utc_object = datetime.datetime.strptime(date_time_utc, '%a, %d %b %Y %H:%M:%S %Z') \
                .replace(tzinfo=timezone('UTC'))
            date_time_eastern_object = date_time_utc_object.astimezone(
                timezone('US/Eastern'))

            title = entry.title
            link = entry.link.split('?')[0]

            if is_english_story(link):
                news_article = NewsArticle(date_time_eastern_object, title,
                                           ticker_object_list, description,
                                           link, 'Business Wire')
                output.append(news_article)

    return output
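The + 'C' above is worth a note: it suggests the feed's timestamps end in 'UT', which strptime's %Z directive does not recognize; appending 'C' yields 'UTC', one of the few zone names %Z accepts. A standalone sketch of the difference (the sample timestamp is invented):

import datetime

stamp = 'Tue, 02 Jan 2024 14:30:00 UT'  # as it would arrive from the feed
fmt = '%a, %d %b %Y %H:%M:%S %Z'

# datetime.datetime.strptime(stamp, fmt) raises ValueError: %Z rejects 'UT'
parsed = datetime.datetime.strptime(stamp + 'C', fmt)  # 'UTC' parses
print(parsed)  # still naive; the code above attaches tzinfo via .replace()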
Example #3
def find_story_from_ticker_date(ticker,
                                date_begin_string,
                                browser,
                                exchange='',
                                date_end_string=''):
    """
    Searches GlobeNewswire by ticker and date.
    :param ticker: Company ticker
    :param date_begin_string: Begin date for search. If there is no end date, this is also the end date.
    :param browser: Browser object
    :param exchange: Exchange on which the ticker is traded
    :param date_end_string: (optional) End date of search
    :return: List of article objects
    """
    # Turn date_string into object for comparison
    date_start_object = normalize_date_return_object(date_begin_string)
    date_start_str = date_start_object.strftime('%Y-%m-%d')

    if date_end_string != '':
        date_end_object = normalize_date_return_object(date_end_string)
        date_end_str = date_end_object.strftime('%Y-%m-%d')
    else:
        date_end_str = date_start_str

    url_page = 1
    all_stories = []
    next_page_button = True
    # While there is a "Next Page" button on the page, keep paginating
    while next_page_button:
        keyword = ticker
        if exchange:
            keyword = keyword + "," + exchange

        url = ('https://www.globenewswire.com/search/lang/en/exchange/nyse,nasdaq'
               '/date/[' + date_start_str + '%2520TO%2520' + date_end_str +
               ']/keyword/' + keyword + '?page=' + str(url_page))

        search_page_details = get_stories_from_search_page(url, browser)
        if search_page_details is None:
            # The page reported no stories: nothing to paginate through
            break
        # Only look for the "Next Page" button once the results page has loaded
        next_page_button = check_for_next_page_button(browser)

        for story in search_page_details:
            # TODO - add the ability to also ensure the exchange is the same
            tickers_in_story = get_ticker_objects_from_description(
                story.description)
            tickers_only = [i.ticker for i in tickers_in_story]
            if ticker in tickers_only:
                all_stories.append(story)

        url_page += 1
    return all_stories
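A usage sketch, assuming a Selenium WebDriver and a date format that normalize_date_return_object accepts (ticker, exchange, and dates are illustrative):

from selenium import webdriver

browser = webdriver.Chrome()  # assumes chromedriver is on PATH
stories = find_story_from_ticker_date('ACME', '2024-01-02', browser,
                                      exchange='NYSE',
                                      date_end_string='2024-01-05')
browser.quit()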
Example #4
def get_stories_from_search_page(url, browser):
    """
    Retrieve all articles that exist on a search page
    :param url: URL of search
    :param browser: Browser object
    :return: List of article objects
    """
    browser.get(url)
    timeout = 20

    try:
        if check_for_no_stories(browser):
            return None
        # Wait until the bottom image element loads before reading in data.
        WebDriverWait(browser, timeout). \
            until(EC.visibility_of_element_located((By.XPATH, '//*[@id="bw-group-all"]/div/div/div[3]/'
                                                              'section/ul/li[last()]/p')))
        # Retrieve date, title, description, and url from each story
        date_elems = browser.find_elements_by_xpath(
            '//*[@id="bw-group-all"]/div/div/div[3]/section/'
            'ul/li[*]/div[1]/time')
        title_elems = browser.find_elements_by_xpath(
            '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/h3/a')
        heading_elems = browser.find_elements_by_xpath(
            '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/p')
        url_elems = browser.find_elements_by_xpath(
            '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/h3/a')

        # Take text from each object and put in lists
        date_text = [elem.text for elem in date_elems]
        title_text = [elem.text for elem in title_elems]
        heading_text = [elem.text for elem in heading_elems]
        urls = [elem.get_attribute('href') for elem in url_elems]

        output = []
        for i, n in enumerate(urls):
            ticker_object_list = get_ticker_objects_from_description(
                heading_text[i])
            if is_english_story(urls[i]) and ticker_object_list:
                date = normalize_date_return_object(date_text[i])
                article_object = NewsArticle(date, title_text[i],
                                             ticker_object_list,
                                             heading_text[i], urls[i],
                                             'Business Wire')
                output.append(article_object)

        return output
    except TimeoutException:
        return []
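The index-based loop over the four parallel lists works, but zip makes the pairing explicit and drops the unused enumerate binding. An equivalent rewrite of the loop, under the same assumptions as above:

output = []
for date_s, title_s, heading_s, href in zip(date_text, title_text,
                                            heading_text, urls):
    ticker_object_list = get_ticker_objects_from_description(heading_s)
    if is_english_story(href) and ticker_object_list:
        date = normalize_date_return_object(date_s)
        output.append(NewsArticle(date, title_s, ticker_object_list,
                                  heading_s, href, 'Business Wire'))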
Example #5
def pull_article_gnw(url):
    """
    Pull the article text from a GlobeNewswire url
    :param url: GlobeNewswire URL
    :return: Article text
    """
    try:
        page = urlopen(url)
        soup = BeautifulSoup(page, 'html.parser')

        all_page_text = soup.find('div', id='main-body-container')

        # Sometimes title is h2, sometimes it is h1
        title = soup.find('h2')
        if title is None:
            title = soup.find('h1')
        title = title.text

        date_time_str = soup.find('time')['datetime']
        date_time_utc_object = datetime.datetime.strptime(date_time_str, '%Y-%m-%dT%H:%M:%SZ').\
            replace(tzinfo=timezone('UTC'))
        date_time_eastern_object = date_time_utc_object.astimezone(
            timezone('US/Eastern'))
        p_elems_all = all_page_text.findAll('p')
        split_index = len(p_elems_all)

        for i, p in enumerate(p_elems_all):
            # Find <p> element that starts with "About" to split
            if re.match(r'^\s*about', p.text.lower()):
                split_index = i
                break

        p_elems_article = p_elems_all[:split_index]
        description = p_elems_article[0].text
        article_text = ' '.join([p.text for p in p_elems_article])
        tickers = get_ticker_objects_from_description(article_text)

        return {
            'article_object':
            NewsArticle(date_time_eastern_object, title, tickers, description,
                        url, 'Globe Newswire'),
            'article_text':
            article_text
        }
    except (AttributeError, HTTPError, TimeoutError) as e:
        print(e)
        return None
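The "About"-section split is easy to verify in isolation. A self-contained sketch with sample HTML (the markup is invented for the demo, not copied from GlobeNewswire):

from bs4 import BeautifulSoup
import re

sample_html = '''
<div id="main-body-container">
  <p>Acme Corp (NYSE: ACME) today announced record results.</p>
  <p>Revenue grew 40% year over year.</p>
  <p>About Acme Corp</p>
  <p>Acme Corp is a fictional company used for this sketch.</p>
</div>
'''

soup = BeautifulSoup(sample_html, 'html.parser')
p_elems = soup.find('div', id='main-body-container').find_all('p')

split_index = len(p_elems)
for i, p in enumerate(p_elems):
    if re.match(r'^\s*about', p.text.lower()):
        split_index = i
        break

print(' '.join(p.text for p in p_elems[:split_index]))  # "About ..." boilerplate dropped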
Example #6
def get_stories_from_search_page(url, browser):
    """
    Returns all stories from the current search page
    :param url: url of search results
    :param browser: Browser parameter
    :return: List of article objects
    """
    browser.get(url)
    timeout = 20

    try:
        if check_for_no_stories(browser):
            return None

        # Wait until the bottom image element loads before reading in data.
        WebDriverWait(browser, timeout). \
            until(EC.visibility_of_element_located((By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/div/div[2]/span')))
        # Retrieve date, title, description, and url from each story
        date_elems = browser.find_elements_by_xpath(
            '/html/body/div[1]/div[2]/div/div[*]/div/div[2]/div/span[1]')
        title_elems = browser.find_elements_by_xpath(
            '/html/body/div[1]/div[2]/div/div[*]/div/div[2]/a')
        heading_elems = browser.find_elements_by_xpath(
            '/html/body/div[1]/div[2]/div/div[*]/div/div[2]/span')
        url_elems = browser.find_elements_by_xpath(
            '/html/body/div[1]/div[2]/div/div[*]/div/div[2]/a')
        # Take text from each object and put in lists
        date_text = [elem.text for elem in date_elems]
        title_text = [elem.text for elem in title_elems]
        heading_text = [elem.text for elem in heading_elems]
        urls = [elem.get_attribute('href') for elem in url_elems]

        output = []
        for i, n in enumerate(urls):
            ticker_object_list = get_ticker_objects_from_description(
                heading_text[i])
            if ticker_object_list:
                date = normalize_date_return_object(date_text[i])
                article_object = NewsArticle(date, title_text[i],
                                             ticker_object_list,
                                             heading_text[i], urls[i],
                                             'Globe Newswire')
                output.append(article_object)

        return output
    except TimeoutException:
        return []
Example #7
def pull_article_bw(url):
    """
    Pull the article text from a Business Wire url
    :param url: Business Wire URL
    :return: Article text
    """
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        page = urlopen(req).read()
        soup = BeautifulSoup(page, 'html.parser')
        title = soup.find('h1').text
        date_time_str = soup.find('time')['datetime']
        date_time_utc_object = datetime.datetime.strptime(date_time_str, '%Y-%m-%dT%H:%M:%SZ').\
            replace(tzinfo=timezone('UTC'))
        date_time_eastern_object = date_time_utc_object.astimezone(
            timezone('US/Eastern'))
        all_page_text = soup.find('div', itemprop='articleBody')

        p_elems_all = all_page_text.findAll('p')
        split_index = len(p_elems_all)

        for i, p in enumerate(p_elems_all):
            # Find <p> element that starts with "About" to split
            if re.match(r'^\s*about', p.text.lower()):
                split_index = i
                break

        p_elems_article = p_elems_all[:split_index]
        description = p_elems_article[0].text
        article_text = ' '.join([p.text for p in p_elems_article])
        tickers = get_ticker_objects_from_description(article_text)
        return {
            'article_object':
            NewsArticle(date_time_eastern_object, title, tickers, description,
                        url, 'Business Wire'),
            'article_text':
            article_text
        }
    except (AttributeError, HTTPError, TimeoutError):
        return None
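Both pull_article_* functions share the same UTC-to-Eastern conversion; a standalone sketch of just that step with pytz:

import datetime
from pytz import timezone

date_time_str = '2024-01-02T14:30:00Z'  # sample ISO-8601 stamp in UTC
utc_dt = datetime.datetime.strptime(date_time_str, '%Y-%m-%dT%H:%M:%SZ') \
    .replace(tzinfo=timezone('UTC'))
eastern_dt = utc_dt.astimezone(timezone('US/Eastern'))
print(eastern_dt)  # 2024-01-02 09:30:00-05:00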
Example #8
def filter_gnw_news_feed_with_nonsequential_keywords(url, keywords):
    """
    Filter articles using non-sequential keywords
    # NOTE: THIS IS UNTESTED
    :param url: url of RSS feed
    :param keywords: list of keywords
    :return: list of article objects
    """
    news_stories = ping_gnw_rss_news_feed(url)
    output = []
    for entry in news_stories:
        # Entries are NewsArticle objects, so read the attribute rather
        # than subscripting
        description = entry.description
        ticker_object = get_ticker_objects_from_description(description)
        if ticker_object:
            # Keep the story only when every keyword appears somewhere in
            # the description, in any order
            if all(keyword in description for keyword in keywords):
                output.append(entry)

    return output
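"Non-sequential" here means the keywords may appear anywhere in the description, in any order, but all of them must be present. A quick illustration with made-up strings:

description = 'Acme Corp receives FDA approval for its new treatment'
print(all(k in description for k in ['approval', 'FDA']))  # True: order is irrelevant
print(all(k in description for k in ['FDA', 'denial']))    # False: one keyword is missing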
Example #9
def pull_daily_change_for_all_gnw_articles(csv_input, csv_output):
    """
    Pull daily stock change for all articles referenced in a csv
    :param csv_input: Name of input CSV
    :param csv_output: Name of output CSV
    :return: Nothing
    """
    header = [
        'date', 'title', 'description', 'percent_change', 'max_percent_change',
        'volume'
    ]
    with open(csv_input, 'r') as csv_in:
        csv_reader = csv.reader(csv_in)
        next(csv_reader)  # skip the header row
        # newline='' keeps csv.writer from inserting blank rows on Windows
        with open(csv_output, 'w', newline='') as csv_out:
            csv_writer = csv.writer(csv_out)
            csv_writer.writerow(header)

            for row in csv_reader:
                date, title, description = row
                ticker_objects = get_ticker_objects_from_description(
                    description)
                if ticker_objects:
                    date_str = convert_text_date_for_api(date)
                    for ticker_object in ticker_objects:
                        ticker = ticker_object.ticker
                        stock_day_data = get_data_ticker_date_iex(
                            ticker, date_str)
                        if stock_day_data:
                            volume = stock_day_data['volume']
                            percent_change = stock_day_data['percent_change']
                            max_percent_change = stock_day_data[
                                'max_percent_change']
                            row.extend(
                                [percent_change, max_percent_change, volume])
                            csv_writer.writerow(row)
Example #10
def pull_daily_change_for_all_bus_wire_articles(csv_input, csv_output):
    """
    Pull stock market data for all ticker/date combos in a file and return a file with that data
    :param csv_input: Input csv name
    :param csv_output: Output csv name
    :return: Nothing
    """
    header = [
        'date', 'title', 'description', 'percent_change', 'max_percent_change',
        'volume'
    ]
    with open(csv_input, 'r') as csv_in:
        csv_reader = csv.reader(csv_in)
        next(csv_reader)  # skip the header row
        # newline='' keeps csv.writer from inserting blank rows on Windows
        with open(csv_output, 'w', newline='') as csv_out:
            csv_writer = csv.writer(csv_out)
            csv_writer.writerow(header)

            for row in csv_reader:
                date, title, description = row
                ticker_objects = get_ticker_objects_from_description(
                    description)
                if ticker_objects:
                    date_str = convert_text_date_for_api(date)
                    for ticker_object in ticker_objects:
                        ticker = ticker_object.ticker
                        stock_day_data = get_data_ticker_date_iex(
                            ticker, date_str)
                        if stock_day_data:
                            volume = stock_day_data['volume']
                            percent_change = stock_day_data['percent_change']
                            max_percent_change = stock_day_data[
                                'max_percent_change']
                            row.extend(
                                [percent_change, max_percent_change, volume])
                            csv_writer.writerow(row)
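Both CSV functions expect a three-column input with a header row: date, title, description. A hypothetical input file could be built like this (the file name and row values are invented):

import csv

with open('gnw_articles.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['date', 'title', 'description'])
    writer.writerow(['Jan 02, 2024', 'Acme Corp Reports Results',
                     'Acme Corp (NYSE: ACME) today announced ...'])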
Example #11
def get_stories_from_search_page(url, source, browser):
    """
    Returns all stories from the current search page
    :param url: url of search results
    :param browser: Browser parameter
    :return: List of article objects
    """
    browser.get(url)
    timeout = 20
    try:
        if check_for_no_stories(source, browser):
            return None

        # If the source is Globe Newswire, use one set of xpaths to find elements
        if source == 'gnw':
            # Wait until the bottom image element loads before reading in data.
            WebDriverWait(browser, timeout). \
                until(EC.visibility_of_element_located((By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/div[*]/div[1]/div[2]/span')))
            # Retrieve date, title, description, and url from each story
            date_elems = browser.find_elements_by_xpath(
                '/html/body/div[1]/div[2]/div/div[2]/div[*]/div/div[2]/div/span[1]'
            )
            title_elems = browser.find_elements_by_xpath(
                '/html/body/div[1]/div[2]/div/div[2]/div[*]/div/div[2]/a')
            heading_elems = browser.find_elements_by_xpath(
                '/html/body/div[1]/div[2]/div/div[2]/div[*]/div/div[2]/span')
            url_elems = browser.find_elements_by_xpath(
                '/html/body/div[1]/div[2]/div/div[2]/div[*]/div/div[2]/a')
            source_long = 'Globe Newswire'
        # If the source is Business Wire, use another xpath to find elements
        elif source == 'bw':
            # Wait until the bottom image element loads before reading in data.
            WebDriverWait(browser, timeout). \
                until(EC.visibility_of_element_located((By.XPATH, '//*[@id="bw-group-all"]/div/div/div[3]/'
                                                                  'section/ul/li[last()]/p')))
            # Retrieve date, title, description, and url from each story
            date_elems = browser.find_elements_by_xpath(
                '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/div[1]/time'
            )
            title_elems = browser.find_elements_by_xpath(
                '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/h3/a')
            heading_elems = browser.find_elements_by_xpath(
                '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/p')
            url_elems = browser.find_elements_by_xpath(
                '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/h3/a')
            source_long = 'Business Wire'
        else:
            # Unknown source: nothing to scrape
            return None

        # Take text from each object and put in lists
        date_text = [elem.text for elem in date_elems]
        title_text = [elem.text.strip() for elem in title_elems]
        heading_text = [elem.text.strip() for elem in heading_elems]
        urls = [elem.get_attribute('href') for elem in url_elems]

        output = []
        for i, n in enumerate(urls):
            if is_english_story(n):
                ticker_object_list = get_ticker_objects_from_description(
                    heading_text[i])
                if ticker_object_list:
                    date = normalize_date_return_object(date_text[i])
                    article_object = NewsArticle(date, title_text[i],
                                                 ticker_object_list,
                                                 heading_text[i], n,
                                                 source_long)
                    output.append(article_object)
        return output

    except TimeoutException:
        print('TimeoutException')
        return None
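The find_elements_by_xpath calls throughout these examples are the Selenium 3 API; Selenium 4 replaced them with find_elements(By.XPATH, ...). A usage sketch for the combined function (the search URL is a placeholder):

from selenium import webdriver

browser = webdriver.Chrome()  # assumes chromedriver is on PATH
search_url = 'https://www.globenewswire.com/search/...'  # placeholder
stories = get_stories_from_search_page(search_url, 'gnw', browser)
browser.quit()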