import csv
import datetime
import re
from urllib.error import HTTPError
from urllib.request import Request, urlopen

import feedparser
from bs4 import BeautifulSoup
from pytz import timezone
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# NewsArticle, get_ticker_objects_from_description, is_english_story,
# check_for_no_stories, check_for_next_page_button,
# normalize_date_return_object, convert_text_date_for_api and
# get_data_ticker_date_iex are project helpers assumed to be imported from
# elsewhere in this repository.


def ping_gnw_rss_news_feed(url):
    """
    Pings a Globe Newswire RSS feed and returns all articles present

    :param url: url of RSS feed
    :return: list of article objects
    """
    feed = feedparser.parse(url)
    output = []
    for entry in feed.entries:
        # Tickers are embedded in the feed's category tags; keep the tickers
        # from the last tag that yields any.
        ticker_object_list = None
        for tag in entry.tags:
            tickers = get_ticker_objects_from_description(tag.term)
            if tickers:
                ticker_object_list = tickers
        if ticker_object_list and entry.language == 'en':
            date_time_utc_object = datetime.datetime.strptime(
                entry.published, '%a, %d %b %Y %H:%M %Z') \
                .replace(tzinfo=timezone('UTC'))
            date_time_eastern_object = date_time_utc_object.astimezone(
                timezone('US/Eastern'))
            # Strip HTML tags from the description
            description = re.sub('<[^<]+?>', '', entry.description)
            title = entry.title
            # Drop any query string from the article link
            link = entry.link.split('?')[0]
            news_article = NewsArticle(date_time_eastern_object, title,
                                       ticker_object_list, description, link,
                                       'Globe Newswire')
            output.append(news_article)
    return output
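
# A minimal usage sketch for ping_gnw_rss_news_feed. The feed URL is a
# placeholder, not a real Globe Newswire endpoint, and the NewsArticle
# attribute names are assumed to mirror its constructor arguments.
def _example_ping_gnw_feed():
    feed_url = 'https://www.globenewswire.com/RssFeed/example'  # hypothetical
    for article in ping_gnw_rss_news_feed(feed_url):
        print(article.title, article.link)  # assumed attribute names
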
def ping_bus_wire_rss_news_feed(url):
    """
    Pull all articles from the RSS feed that have a stock ticker in the article

    :param url: url of RSS Feed
    :return: List of article objects
    """
    feed = feedparser.parse(url)
    output = []
    for entry in feed.entries:
        description = entry.description
        ticker_object_list = get_ticker_objects_from_description(description)
        if ticker_object_list:
            # The feed's timestamps end in 'UT'; appending 'C' lets strptime
            # consume the string as the 'UTC' zone name.
            date_time_utc = entry.published + 'C'
            date_time_utc_object = datetime.datetime.strptime(
                date_time_utc, '%a, %d %b %Y %H:%M:%S %Z') \
                .replace(tzinfo=timezone('UTC'))
            date_time_eastern_object = date_time_utc_object.astimezone(
                timezone('US/Eastern'))
            title = entry.title
            # Drop any query string from the article link
            link = entry.link.split('?')[0]
            if is_english_story(link):
                news_article = NewsArticle(date_time_eastern_object, title,
                                           ticker_object_list, description,
                                           link, 'Business Wire')
                output.append(news_article)
    return output
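
# A standalone sketch of the timestamp handling above. Inferred from the
# "+ 'C'" trick: the feed's published field appears to end in 'UT', so
# appending 'C' lets strptime consume it as the literal zone name 'UTC'
# before pytz localizes and converts it.
def _example_bw_timestamp():
    import datetime
    from pytz import timezone
    published = 'Tue, 01 Feb 2022 13:30:00 UT'  # sample value, not from a feed
    utc_dt = datetime.datetime.strptime(
        published + 'C', '%a, %d %b %Y %H:%M:%S %Z') \
        .replace(tzinfo=timezone('UTC'))
    print(utc_dt.astimezone(timezone('US/Eastern')))  # 2022-02-01 08:30:00-05:00
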
def find_story_from_ticker_date(ticker, date_begin_string, browser,
                                exchange='', date_end_string=''):
    """
    Searches GlobeNewswire by ticker and date.

    :param ticker: Company ticker
    :param date_begin_string: Begin date for search. If there is no end date,
        this is also the end date.
    :param browser: Browser object
    :param exchange: Exchange on which the ticker is traded
    :param date_end_string: (optional) End date of search
    :return: List of article objects
    """
    # Turn date_string into object for comparison
    date_start_object = normalize_date_return_object(date_begin_string)
    date_start_str = date_start_object.strftime('%Y-%m-%d')
    if date_end_string != '':
        date_end_object = normalize_date_return_object(date_end_string)
        date_end_str = date_end_object.strftime('%Y-%m-%d')
    else:
        date_end_str = date_start_str

    keyword = ticker
    if exchange:
        keyword = keyword + ',' + exchange

    url_page = 1
    all_stories = []
    next_page_button = True
    # While there is a "Next Page" button on the page, keep paginating
    while next_page_button:
        url = ('https://www.globenewswire.com/search/lang/en/exchange/'
               'nyse,nasdaq/date/[' + date_start_str + '%2520TO%2520' +
               date_end_str + ']/keyword/' + keyword +
               '?page=' + str(url_page))
        search_page_details = get_stories_from_search_page(url, 'gnw', browser)
        if not search_page_details:
            break
        # Check for the "Next Page" button only after the page has loaded
        next_page_button = check_for_next_page_button(browser)
        for story in search_page_details:
            # TODO - add the ability to also ensure the exchange is the same
            tickers_in_story = get_ticker_objects_from_description(
                story.description)
            tickers_only = [i.ticker for i in tickers_in_story]
            if ticker in tickers_only:
                all_stories.append(story)
        url_page += 1
    return all_stories
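
# A hedged usage sketch for find_story_from_ticker_date. The webdriver setup
# is not part of this module, the ticker/dates are illustrative, and the date
# strings assume normalize_date_return_object accepts ISO-style dates.
def _example_find_story():
    from selenium import webdriver
    browser = webdriver.Chrome()  # any Selenium driver should work
    try:
        stories = find_story_from_ticker_date(
            'AAPL', '2021-06-01', browser,
            exchange='Nasdaq', date_end_string='2021-06-07')
        for story in stories:
            print(story.description)
    finally:
        browser.quit()
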
def get_stories_from_search_page_bw(url, browser):
    """
    Retrieve all articles that exist on a Business Wire search page

    See also the combined get_stories_from_search_page below, which handles
    both sources.

    :param url: URL of search
    :param browser: Browser object
    :return: List of article objects
    """
    browser.get(url)
    timeout = 20
    try:
        if check_for_no_stories(browser):
            return None
        # Wait until the bottom element loads before reading in data.
        WebDriverWait(browser, timeout). \
            until(EC.visibility_of_element_located(
                (By.XPATH, '//*[@id="bw-group-all"]/div/div/div[3]/'
                           'section/ul/li[last()]/p')))
        # Retrieve date, title, description, url from each story
        date_elems = browser.find_elements_by_xpath(
            '//*[@id="bw-group-all"]/div/div/div[3]/section/'
            'ul/li[*]/div[1]/time')
        title_elems = browser.find_elements_by_xpath(
            '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/h3/a')
        heading_elems = browser.find_elements_by_xpath(
            '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/p')
        url_elems = browser.find_elements_by_xpath(
            '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/h3/a')
        # Take text from each element and put in lists
        date_text = [elem.text for elem in date_elems]
        title_text = [elem.text for elem in title_elems]
        heading_text = [elem.text for elem in heading_elems]
        urls = [elem.get_attribute('href') for elem in url_elems]
        output = []
        for i, n in enumerate(urls):
            ticker_object_list = get_ticker_objects_from_description(
                heading_text[i])
            if is_english_story(n) and ticker_object_list:
                date = normalize_date_return_object(date_text[i])
                article_object = NewsArticle(date, title_text[i],
                                             ticker_object_list,
                                             heading_text[i], n,
                                             'Business Wire')
                output.append(article_object)
        return output
    except TimeoutException:
        return []
def pull_article_gnw(url):
    """
    Pull the article text from a GlobeNewswire url

    :param url: GlobeNewswire URL
    :return: Article text
    """
    try:
        page = urlopen(url)
        soup = BeautifulSoup(page, 'html.parser')
        all_page_text = soup.find('div', id='main-body-container')
        # Sometimes title is h2, sometimes it is h1
        title = soup.find('h2')
        if title is None:
            title = soup.find('h1')
        title = title.text
        date_time_str = soup.find('time')['datetime']
        date_time_utc_object = datetime.datetime.strptime(
            date_time_str, '%Y-%m-%dT%H:%M:%SZ'). \
            replace(tzinfo=timezone('UTC'))
        date_time_eastern_object = date_time_utc_object.astimezone(
            timezone('US/Eastern'))
        p_elems_all = all_page_text.findAll('p')
        split_index = len(p_elems_all)
        for i, p in enumerate(p_elems_all):
            # Find the <p> element that starts with "About" to split off the
            # boilerplate company descriptions
            if re.match(r'^\s*about', p.text.lower()):
                split_index = i
                break
        p_elems_article = p_elems_all[:split_index]
        description = p_elems_article[0].text
        article_text = ' '.join([p.text for p in p_elems_article])
        tickers = get_ticker_objects_from_description(article_text)
        return {
            'article_object': NewsArticle(date_time_eastern_object, title,
                                          tickers, description, url,
                                          'Globe Newswire'),
            'article_text': article_text
        }
    except (AttributeError, TypeError, HTTPError, TimeoutError) as e:
        print(e)
        return None
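
# Example call for pull_article_gnw; the URL is a placeholder for a real
# GlobeNewswire article page.
def _example_pull_article_gnw():
    url = 'https://www.globenewswire.com/news-release/example'  # hypothetical
    result = pull_article_gnw(url)
    if result:  # None is returned on parse or network failure
        print(result['article_object'])
        print(result['article_text'][:200])
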
def get_stories_from_search_page_gnw(url, browser):
    """
    Returns all stories from the current Globe Newswire search page

    See also the combined get_stories_from_search_page below, which handles
    both sources.

    :param url: url of search results
    :param browser: Browser object
    :return: List of article objects
    """
    browser.get(url)
    timeout = 20
    try:
        if check_for_no_stories(browser):
            return None
        # Wait until the bottom element loads before reading in data.
        WebDriverWait(browser, timeout). \
            until(EC.visibility_of_element_located(
                (By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/'
                           'div/div[2]/span')))
        # Retrieve date, title, description, url from each story
        date_elems = browser.find_elements_by_xpath(
            '/html/body/div[1]/div[2]/div/div[*]/div/div[2]/div/span[1]')
        title_elems = browser.find_elements_by_xpath(
            '/html/body/div[1]/div[2]/div/div[*]/div/div[2]/a')
        heading_elems = browser.find_elements_by_xpath(
            '/html/body/div[1]/div[2]/div/div[*]/div/div[2]/span')
        url_elems = browser.find_elements_by_xpath(
            '/html/body/div[1]/div[2]/div/div[*]/div/div[2]/a')
        # Take text from each element and put in lists
        date_text = [elem.text for elem in date_elems]
        title_text = [elem.text for elem in title_elems]
        heading_text = [elem.text for elem in heading_elems]
        urls = [elem.get_attribute('href') for elem in url_elems]
        output = []
        for i, n in enumerate(urls):
            ticker_object_list = get_ticker_objects_from_description(
                heading_text[i])
            if ticker_object_list:
                date = normalize_date_return_object(date_text[i])
                article_object = NewsArticle(date, title_text[i],
                                             ticker_object_list,
                                             heading_text[i], n,
                                             'Globe Newswire')
                output.append(article_object)
        return output
    except TimeoutException:
        return []
def pull_article_bw(url):
    """
    Pull the article text from a Business Wire url

    :param url: Business Wire URL
    :return: Article text
    """
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        page = urlopen(req).read()
        soup = BeautifulSoup(page, 'html.parser')
        title = soup.find('h1').text
        date_time_str = soup.find('time')['datetime']
        date_time_utc_object = datetime.datetime.strptime(
            date_time_str, '%Y-%m-%dT%H:%M:%SZ'). \
            replace(tzinfo=timezone('UTC'))
        date_time_eastern_object = date_time_utc_object.astimezone(
            timezone('US/Eastern'))
        all_page_text = soup.find('div', itemprop='articleBody')
        p_elems_all = all_page_text.findAll('p')
        split_index = len(p_elems_all)
        for i, p in enumerate(p_elems_all):
            # Find the <p> element that starts with "About" to split off the
            # boilerplate company descriptions
            if re.match(r'^\s*about', p.text.lower()):
                split_index = i
                break
        p_elems_article = p_elems_all[:split_index]
        description = p_elems_article[0].text
        article_text = ' '.join([p.text for p in p_elems_article])
        tickers = get_ticker_objects_from_description(article_text)
        return {
            'article_object': NewsArticle(date_time_eastern_object, title,
                                          tickers, description, url,
                                          'Business Wire'),
            'article_text': article_text
        }
    except (AttributeError, HTTPError, TimeoutError):
        return None
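
# A self-contained check of the "About ..." split used by both pull_article_*
# functions: paragraphs from the first "About" heading onward are treated as
# company boilerplate and dropped. The sample paragraphs are invented.
def _example_about_split():
    import re
    paragraphs = [
        'ACME Corp. (NYSE: ACME) today announced record revenue.',
        'Revenue grew 40% year over year.',
        'About ACME Corp.',
        'ACME is a diversified holding company.',
    ]
    split_index = len(paragraphs)
    for i, text in enumerate(paragraphs):
        if re.match(r'^\s*about', text.lower()):
            split_index = i
            break
    print(' '.join(paragraphs[:split_index]))  # boilerplate is excluded
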
def filter_gnw_news_feed_with_nonsequential_keywords(url, keywords):
    """
    Filter articles using non-sequential keywords
    # NOTE: THIS IS UNTESTED

    :param url: url of RSS feed
    :param keywords: list of keywords
    :return: list of article objects
    """
    news_stories = ping_gnw_rss_news_feed(url)
    output = []
    for entry in news_stories:
        description = entry.description
        ticker_object = get_ticker_objects_from_description(description)
        # Keep the story only if every keyword appears somewhere in the
        # description, in any order
        if ticker_object and all(keyword in description
                                 for keyword in keywords):
            output.append(entry)
    return output
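
# A small illustration of the non-sequential keyword test above, using a
# plain string rather than a live feed entry.
def _example_keyword_match():
    description = 'ACME announces FDA approval for its lead drug candidate'
    keywords = ['FDA', 'approval']
    print(all(keyword in description for keyword in keywords))  # True
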
def pull_daily_change_for_all_gnw_articles(csv_input, csv_output):
    """
    Pull daily stock change for all articles referenced in a csv

    :param csv_input: Name of input CSV
    :param csv_output: Name of output CSV
    :return: Nothing
    """
    header = [
        'date', 'title', 'description', 'percent_change',
        'max_percent_change', 'volume'
    ]
    with open(csv_input, 'r') as csv_in:
        csv_reader = csv.reader(csv_in)
        # Skip the input header row
        next(csv_reader)
        with open(csv_output, 'w', newline='') as csv_out:
            csv_writer = csv.writer(csv_out)
            csv_writer.writerow(header)
            for row in csv_reader:
                [date, title, description] = row
                ticker_objects = get_ticker_objects_from_description(
                    description)
                if ticker_objects:
                    date_str = convert_text_date_for_api(date)
                    for ticker_object in ticker_objects:
                        ticker = ticker_object.ticker
                        stock_day_data = get_data_ticker_date_iex(
                            ticker, date_str)
                        if stock_day_data:
                            volume = stock_day_data['volume']
                            percent_change = stock_day_data['percent_change']
                            max_percent_change = stock_day_data[
                                'max_percent_change']
                            # Each ticker with data appends another triplet
                            # of columns to the row
                            row.extend([percent_change, max_percent_change,
                                        volume])
                csv_writer.writerow(row)
def pull_daily_change_for_all_bus_wire_articles(csv_input, csv_output):
    """
    Pull stock market data for all ticker/date combos in a file and return a
    file with that data

    :param csv_input: Input csv name
    :param csv_output: Output csv name
    :return: Nothing
    """
    header = [
        'date', 'title', 'description', 'percent_change',
        'max_percent_change', 'volume'
    ]
    with open(csv_input, 'r') as csv_in:
        csv_reader = csv.reader(csv_in)
        # Skip the input header row
        next(csv_reader)
        with open(csv_output, 'w', newline='') as csv_out:
            csv_writer = csv.writer(csv_out)
            csv_writer.writerow(header)
            for row in csv_reader:
                [date, title, description] = row
                ticker_objects = get_ticker_objects_from_description(
                    description)
                if ticker_objects:
                    date_str = convert_text_date_for_api(date)
                    for ticker_object in ticker_objects:
                        ticker = ticker_object.ticker
                        stock_day_data = get_data_ticker_date_iex(
                            ticker, date_str)
                        if stock_day_data:
                            volume = stock_day_data['volume']
                            percent_change = stock_day_data['percent_change']
                            max_percent_change = stock_day_data[
                                'max_percent_change']
                            # Each ticker with data appends another triplet
                            # of columns to the row
                            row.extend([percent_change, max_percent_change,
                                        volume])
                csv_writer.writerow(row)
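
# Sketch of how the two CSV enrichment functions are driven. The file names
# are placeholders, and the sample date format is a guess at what
# convert_text_date_for_api accepts.
def _example_enrich_csv():
    with open('gnw_articles.csv', 'w', newline='') as f:  # sample input
        writer = csv.writer(f)
        writer.writerow(['date', 'title', 'description'])
        writer.writerow(['Jan 5, 2021', 'ACME wins contract',
                         'ACME Corp. (NYSE: ACME) announced a new contract.'])
    pull_daily_change_for_all_gnw_articles('gnw_articles.csv',
                                           'gnw_articles_enriched.csv')
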
def get_stories_from_search_page(url, source, browser):
    """
    Returns all stories from the current search page

    :param url: url of search results
    :param source: News source, either 'gnw' or 'bw'
    :param browser: Browser object
    :return: List of article objects
    """
    browser.get(url)
    timeout = 20
    try:
        if check_for_no_stories(source, browser):
            return None
        # If the source is Globe Newswire, use one set of xpaths
        if source == 'gnw':
            # Wait until the bottom element loads before reading in data.
            WebDriverWait(browser, timeout). \
                until(EC.visibility_of_element_located(
                    (By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/'
                               'div[*]/div[1]/div[2]/span')))
            # Retrieve date, title, description, url from each story
            date_elems = browser.find_elements_by_xpath(
                '/html/body/div[1]/div[2]/div/div[2]/div[*]/div/div[2]/'
                'div/span[1]')
            title_elems = browser.find_elements_by_xpath(
                '/html/body/div[1]/div[2]/div/div[2]/div[*]/div/div[2]/a')
            heading_elems = browser.find_elements_by_xpath(
                '/html/body/div[1]/div[2]/div/div[2]/div[*]/div/div[2]/span')
            url_elems = browser.find_elements_by_xpath(
                '/html/body/div[1]/div[2]/div/div[2]/div[*]/div/div[2]/a')
            source_long = 'Globe Newswire'
        # If the source is Business Wire, use another set of xpaths
        elif source == 'bw':
            # Wait until the bottom element loads before reading in data.
            WebDriverWait(browser, timeout). \
                until(EC.visibility_of_element_located(
                    (By.XPATH, '//*[@id="bw-group-all"]/div/div/div[3]/'
                               'section/ul/li[last()]/p')))
            # Retrieve date, title, description, url from each story
            date_elems = browser.find_elements_by_xpath(
                '//*[@id="bw-group-all"]/div/div/div[3]/section/'
                'ul/li[*]/div[1]/time')
            title_elems = browser.find_elements_by_xpath(
                '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/h3/a')
            heading_elems = browser.find_elements_by_xpath(
                '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/p')
            url_elems = browser.find_elements_by_xpath(
                '//*[@id="bw-group-all"]/div/div/div[3]/section/ul/li[*]/h3/a')
            source_long = 'Business Wire'
        else:
            raise ValueError("source must be 'gnw' or 'bw'")
        # Take text from each element and put in lists
        date_text = [elem.text for elem in date_elems]
        title_text = [elem.text.strip() for elem in title_elems]
        heading_text = [elem.text.strip() for elem in heading_elems]
        urls = [elem.get_attribute('href') for elem in url_elems]
        output = []
        for i, n in enumerate(urls):
            if is_english_story(n):
                ticker_object_list = get_ticker_objects_from_description(
                    heading_text[i])
                if ticker_object_list:
                    date = normalize_date_return_object(date_text[i])
                    article_object = NewsArticle(date, title_text[i],
                                                 ticker_object_list,
                                                 heading_text[i], n,
                                                 source_long)
                    output.append(article_object)
        return output
    except TimeoutException as e:
        print(e)
        return None
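
# Usage sketch for the combined scraper: one function serves both sources,
# switched by the 'gnw'/'bw' flag. The search URL is a placeholder.
def _example_scrape_search_page():
    from selenium import webdriver
    browser = webdriver.Chrome()
    try:
        search_url = 'https://www.globenewswire.com/search/example'  # hypothetical
        stories = get_stories_from_search_page(search_url, 'gnw', browser)
        for story in stories or []:
            print(story.description)
    finally:
        browser.quit()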