# Standard-library and third-party imports used throughout this module; the
# project-local helpers (logger, sentiment_analyzer, get_market_data_tags,
# get_market_data_via_google, add_trade_signal_to_LUT,
# extract_clean_trade_signal_log) are assumed to be defined or imported elsewhere.
import collections
import copy
import json
import os
import re
import string
from datetime import datetime, timedelta

from bs4 import BeautifulSoup


def rename_unit_text_as_id(task_dir,
                           text_folder='articles',
                           id_key='article_id',
                           log_filename='WSJ_text_market_util_log'):
    """Rename every article JSON file after the article id stored inside it."""
    logger_dir = task_dir + 'logs/'
    log_filename = log_filename + '.txt'
    article_storage_dir = task_dir + text_folder + '/'
    article_content_folders = os.listdir(article_storage_dir)
    article_content_folders = [
        i for i in article_content_folders
        if os.stat(article_storage_dir + i).st_size != 0
    ]
    for an_article_date_folder in article_content_folders:
        article_list = os.listdir(article_storage_dir + an_article_date_folder)
        for an_article in article_list:
            article_full_dir = article_storage_dir + an_article_date_folder + '/'
            article_file_path_o = article_full_dir + an_article
            try:
                with open(article_file_path_o, 'r') as article_f:
                    article_json = json.load(article_f)
                article_file_name_n = article_json[id_key]
            except KeyError as e:
                error_msg = f"{article_file_path_o} failed to change filename to '{id_key}' due to {e}."
                logger.register_log(error_msg, logger_dir, log_filename)
                continue
            article_file_path_n = article_full_dir + article_file_name_n + '.json'
            os.rename(str(article_file_path_o), str(article_file_path_n))
            log_msg = f"{article_file_path_o} successfully renamed as '{article_file_name_n}'."
            logger.register_log(log_msg, logger_dir, log_filename)

def replace_quote_with_market_data_LUT_info(task_dir,
                                            text_folder='articles',
                                            market_data_folder='market_data',
                                            LUT_filename='company_market_LUT',
                                            LUT_quote_key='quoted_in',
                                            log_filename='WSJ_text_market_util_log'):
    logger_dir = task_dir + 'logs/'
    log_filename = log_filename + '.txt'
    article_full_path_dict = dict()
    article_storage_dir = task_dir + text_folder + '/'
    article_content_folders = os.listdir(article_storage_dir)
    article_content_folders = [
        i for i in article_content_folders
        if os.stat(article_storage_dir + i).st_size != 0
    ]
    for an_article_date_folder in article_content_folders:
        article_list = os.listdir(article_storage_dir + an_article_date_folder)
        for an_article in article_list:
            an_article_full_path = article_storage_dir + an_article_date_folder + '/' + an_article
            an_article = an_article.split('.')[0]
            article_full_path_dict[an_article] = an_article_full_path
            with open(an_article_full_path, 'r') as article_f_o:
                article_json = json.load(article_f_o)
            article_json['quotes'] = []
            with open(an_article_full_path, 'w+') as article_f_n:
                json.dump(article_json, article_f_n, indent=4)
    LUT_dir = task_dir + market_data_folder + '/' + LUT_filename + '.json'
    with open(LUT_dir, 'r') as LUT_f:
        LUT_data = json.load(LUT_f)
    for k, v in LUT_data.items():
        if len(v[LUT_quote_key]) != 0:
            for an_article_id in v[LUT_quote_key]:
                try:
                    an_article_full_path = article_full_path_dict[an_article_id]
                except KeyError:
                    error_msg = f"{an_article_id} under {k} failed to retrieve full path from article_full_path_dict."
                    logger.register_log(error_msg, logger_dir, log_filename)
                    continue
                with open(an_article_full_path, 'r') as article_f_o:
                    article_json = json.load(article_f_o)
                article_json['quotes'] = article_json['quotes'] + [str(k)]
                with open(an_article_full_path, 'w+') as article_f_n:
                    json.dump(article_json, article_f_n, indent=4)
                log_msg = f"{an_article_full_path} successfully updated with {k} as quotes."
                logger.register_log(log_msg, logger_dir, log_filename)

def get_market_data_via_WSJ(driver,
                            output_dir,
                            retry_limit=1,
                            google_search_result_collect_limit=3,
                            scale_serp_api_key=None):
    """Resolve each company's WSJ quote page, falling back to a Google search
    for extra candidate URLs, and export the cleaned company_market_LUT."""
    market_data_output_dir = output_dir + 'market_data/'
    logger_dir = output_dir + 'logs/'
    log_filename = 'WSJ_market_data_log.txt'
    with open(market_data_output_dir + 'raw_company_market_LUT.json', 'r') as LUT_f:
        raw_company_market_LUT = json.load(LUT_f)
    company_market_LUT = dict()
    checked_url_list = []
    for (a_company, v) in raw_company_market_LUT.items():
        target_url_list = v['url']
        googled_flag = False
        for an_url in target_url_list:
            if an_url in checked_url_list:
                continue
            # Retry the tag scrape until every tag is found or the retry
            # budget is exhausted.
            retry_counter = 0
            while True:
                tags = get_market_data_tags(driver, an_url)
                retry_counter += 1
                if None not in tags or retry_counter > retry_limit:
                    break
            checked_url_list.append(an_url)
            updated_LUT = register_tags_to_LUT(company_market_LUT,
                                               raw_company_market_LUT, tags,
                                               a_company, an_url, output_dir)
            if updated_LUT is not False:
                company_market_LUT = updated_LUT
                break
            else:
                if an_url == target_url_list[-1] and googled_flag == False:
                    google_url_candidates, log_msg = get_market_data_via_google(
                        a_company, google_search_result_collect_limit,
                        scale_serp_api_key)
                    target_url_list.extend(google_url_candidates)
                    logger.register_log(log_msg, logger_dir, log_filename)
                    googled_flag = True
    with open(market_data_output_dir + 'company_market_LUT.json', 'w') as output_f:
        json.dump(company_market_LUT, output_f, indent=4)

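# Hedged sketch (not part of the scraper) of the raw_company_market_LUT entries
# that get_market_data_via_WSJ iterates over, inferred from the look-ups above:
# each company maps to a list of candidate quote-page URLs plus the article ids
# that quoted it. The company name, URL, and id below are invented.
def _example_raw_company_market_LUT_entry():
    return {
        'EXAMPLE CORP': {
            'url': ['https://www.wsj.com/market-data/quotes/EXMP'],  # hypothetical quote page
            'quoted_in': ['SB12345678901234567890'],                 # hypothetical article id
        }
    }
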
def scrape_headline_urls(driver, an_archive_url, logger_dir, log_filename):
    driver.get(an_archive_url)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    archive_headline_url_tags = soup.find_all(
        'h2', attrs={'class': 'WSJTheme--headline--unZqjb45'})
    archive_headline_url_list = [
        i.find('a')['href'] for i in archive_headline_url_tags
    ]
    stripped_archive_headline_url_list = []
    for an_o_url in archive_headline_url_list:
        regex_match = re.search(r"(SB)\d+", an_o_url)
        if regex_match is None:
            log_msg = f"Hyperlink {an_o_url} not logged."
            logger.register_log(log_msg, logger_dir, log_filename)
            continue
        a_stripped_url = regex_match.group(0)
        stripped_archive_headline_url_list.append(a_stripped_url)
    return stripped_archive_headline_url_list

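# Minimal, self-contained sketch of the "SB"-id extraction scrape_headline_urls
# performs on each headline hyperlink; the sample URL is hypothetical and only
# illustrates the expected link shape.
def _demo_extract_headline_id():
    sample_url = 'https://www.wsj.com/articles/SB12345678901234567890'  # hypothetical
    regex_match = re.search(r"(SB)\d+", sample_url)
    return regex_match.group(0) if regex_match else None  # -> 'SB12345678901234567890'
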
def collect_article_sentiment_analysis(task_dir,
                                       mentioned_article_dict,
                                       output_filename='mentioned_articles_sentiment_analysis',
                                       text_folder='articles',
                                       market_data_folder='market_data',
                                       log_filename='WSJ_dummy_model_log'):
    logger_dir = task_dir + 'logs/'
    log_filename = log_filename + '.txt'
    article_storage_dir = task_dir + text_folder + '/'
    sentiment_word_dict = sentiment_analyzer.get_sentiment_word_dict()
    mentioned_article_sentiment_dict = copy.deepcopy(mentioned_article_dict)
    for an_article_id, an_article_meta in mentioned_article_dict.items():
        an_article_path = article_storage_dir + an_article_meta['date'] + '/' + an_article_id + '.json'
        try:
            with open(an_article_path, 'r') as article_f:
                article_json = json.load(article_f)
            article_content = article_json['content']
        except FileNotFoundError:
            error_msg = f"{an_article_id} not found with {an_article_path} (date: {an_article_meta['date']})."
            logger.register_log(error_msg, logger_dir, log_filename)
            continue  # skip the missing article instead of reusing stale content
        an_article_sentiment_analysis_dict = sentiment_analyzer.get_sentiment_analysis(
            article_content, sentiment_word_dict, most_common_thld=100)
        mentioned_article_sentiment_dict[an_article_id][
            'sentiment_analysis'] = an_article_sentiment_analysis_dict
        log_msg = f"Registered {an_article_id} (date: {an_article_meta['date']}) sentiment analysis: {an_article_sentiment_analysis_dict}."
        logger.register_log(log_msg, logger_dir, log_filename)
    output_path = task_dir + market_data_folder + '/' + output_filename + '.json'
    with open(output_path, 'w+') as output_f:
        json.dump(mentioned_article_sentiment_dict, output_f, indent=4)
    return mentioned_article_sentiment_dict

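# Hedged sketch of the mentioned_article_dict shape collect_article_sentiment_analysis
# expects, inferred from the look-ups above (article id -> metadata carrying at
# least a 'date'). The id and date below are invented.
def _example_mentioned_article_dict():
    return {
        'SB12345678901234567890': {'date': '20200102'},  # hypothetical article id and date
    }
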
def get_article_urls(driver, duration, output_dir):
    article_urls_storage_dir = output_dir + 'article_urls/'
    logger_dir = output_dir + 'logs/'
    log_filename = 'article_urls_log.txt'
    if not os.path.exists(article_urls_storage_dir):
        os.makedirs(article_urls_storage_dir)
    from_date = datetime.strptime(duration['start_time'], "%Y%m%d").date()
    to_date = datetime.strptime(duration['end_time'], "%Y%m%d").date()
    delta = to_date - from_date
    date_list = []
    for i in range(delta.days + 1):
        date_list.append(from_date + timedelta(days=i))
    archive_url_prefix = 'https://www.wsj.com/news/archive/'
    for a_date in date_list:
        an_archive_url = archive_url_prefix + a_date.strftime("%Y%m%d")
        # archive_page = request_session.get(a_archive_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'})
        # time.sleep(archive_load_sleep_time)
        # soup = BeautifulSoup(archive_page.text, 'lxml')
        daily_article_url_list = []
        try:
            daily_article_url_list = scrape_headline_urls(
                driver, an_archive_url, logger_dir, log_filename)
        except AttributeError as e:
            log_msg = f'{a_date.strftime("%Y%m%d")} not retrieved due to {e}.'
            logger.register_log(log_msg, logger_dir, log_filename)
            continue
        while len(daily_article_url_list) == 0:
            try:
                daily_article_url_list = scrape_headline_urls(
                    driver, an_archive_url, logger_dir, log_filename)
            except (AttributeError, TypeError) as e:
                log_msg = f'{a_date.strftime("%Y%m%d")} not retrieved due to {e}.'
                logger.register_log(log_msg, logger_dir, log_filename)
                break
        with open(article_urls_storage_dir + a_date.strftime("%Y%m%d") + '.txt', 'w+') as f:
            f.write('\n'.join(daily_article_url_list))
        log_msg = f'{a_date.strftime("%Y%m%d")} done, urls to {len(daily_article_url_list)} articles retrieved.'
        logger.register_log(log_msg, logger_dir, log_filename)

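# Minimal sketch of the date-range expansion get_article_urls performs before
# visiting the archive pages; the window below is an invented example.
def _demo_expand_date_range(start='20200101', end='20200103'):
    from_date = datetime.strptime(start, "%Y%m%d").date()
    to_date = datetime.strptime(end, "%Y%m%d").date()
    return [(from_date + timedelta(days=i)).strftime("%Y%m%d")
            for i in range((to_date - from_date).days + 1)]  # ['20200101', '20200102', '20200103']
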
def register_tags_to_LUT(company_market_LUT, raw_company_market_LUT, tags,
                         a_company, an_url, output_dir):
    logger_dir = output_dir + 'logs/'
    log_filename = 'WSJ_market_data_log.txt'
    valid_company_flag = True
    try:
        if len(tags) == 2:
            ticker_exchange_text = tags[0].get_text().strip()
            ticker_text = ticker_exchange_text.split('(')[0].strip()
            exchange_text = ticker_exchange_text.split('(')[1][:-1].strip()
        elif len(tags) == 3:
            ticker_text = tags[0].get_text().strip()
            exchange_text = tags[1].get_text().strip().strip('(').strip(')')
        else:
            raise AttributeError
    except AttributeError:
        ticker_text = 'ticker not on stock market'
        exchange_text = 'exchange not on stock market'
        valid_company_flag = False
    try:
        legal_full_name_text = tags[-1].get_text().strip()
    except AttributeError:
        legal_full_name_text = 'legal full name not on stock market'
        valid_company_flag = False
    US_market_flag = True
    if valid_company_flag == True:
        if 'U.S.' not in exchange_text:
            valid_company_flag = False
            US_market_flag = False
    if valid_company_flag == True:
        company_market_LUT[a_company] = dict()
        company_market_LUT[a_company]['market_data_url'] = an_url
        company_market_LUT[a_company]['ticker'] = ticker_text
        company_market_LUT[a_company]['exchange'] = exchange_text
        company_market_LUT[a_company]['legal_full_name'] = legal_full_name_text
        company_market_LUT[a_company]['quoted_in'] = raw_company_market_LUT[a_company]['quoted_in']
        log_msg = f"{a_company} successfully registered with information from {an_url} (quoted {len(company_market_LUT[a_company]['quoted_in'])}): ticker: {company_market_LUT[a_company]['ticker']}, exchange: {company_market_LUT[a_company]['exchange']}, legal_full_name: {company_market_LUT[a_company]['legal_full_name']}."
        logger.register_log(log_msg, logger_dir, log_filename)
        return company_market_LUT
    else:
        error_logged_flag = False
        if None in tags:
            error_log_msg = f"{a_company} failed to register with {an_url} due to tags being: {tags}."
            logger.register_log(error_log_msg, logger_dir, log_filename)
            error_logged_flag = True
        if US_market_flag == False:
            error_log_msg = f"{a_company} failed to register with {an_url} since exchange being non-US: {exchange_text}."
            logger.register_log(error_log_msg, logger_dir, log_filename)
            error_logged_flag = True
        if error_logged_flag == False:
            error_log_msg = f"{a_company} failed to register with {an_url} for unknown reasons."
            logger.register_log(error_log_msg, logger_dir, log_filename)
        return False

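# Small sketch of the two-tag parse register_tags_to_LUT applies when the ticker
# and exchange arrive in a single string; the sample text is hypothetical.
def _demo_parse_ticker_exchange():
    ticker_exchange_text = 'EXMP (U.S.: NYSE)'  # hypothetical quote-page text
    ticker_text = ticker_exchange_text.split('(')[0].strip()         # 'EXMP'
    exchange_text = ticker_exchange_text.split('(')[1][:-1].strip()  # 'U.S.: NYSE'
    return ticker_text, exchange_text
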
def calculate_company_sentiment_stats(task_dir,
                                      mentioned_article_sentiment_dict,
                                      output_filename='company_market_sentiment_LUT',
                                      market_data_folder='market_data',
                                      LUT_filename='company_market_LUT',
                                      log_filename='WSJ_dummy_model_log'):
    logger_dir = task_dir + 'logs/'
    log_filename = log_filename + '.txt'
    LUT_dir = task_dir + market_data_folder + '/' + LUT_filename + '.json'
    with open(LUT_dir, 'r') as LUT_f:
        LUT_data = json.load(LUT_f)
    company_market_sentiment_LUT = copy.deepcopy(LUT_data)
    for v in company_market_sentiment_LUT.values():
        v['sentiment_indicator'] = dict()
    # ZeroDivisionError
    for a_company, a_company_info in LUT_data.items():
        for an_article, an_article_info in a_company_info['mentioned_in'].items():
            # Copy before scaling so weighting by mention count does not
            # mutate the shared per-article sentiment dict across companies.
            an_article_sentiment = dict(
                mentioned_article_sentiment_dict[an_article]['sentiment_analysis'])
            for a_category in an_article_sentiment.keys():
                an_article_sentiment[a_category] = an_article_sentiment[
                    a_category] * an_article_info['mentioned_time']
            if an_article_info['date'] in company_market_sentiment_LUT[a_company]['sentiment_indicator']:
                pre_counter = collections.Counter(
                    company_market_sentiment_LUT[a_company]['sentiment_indicator'][an_article_info['date']])
                current_counter = collections.Counter(an_article_sentiment)
                updated_article_sentiment_dict = dict(pre_counter + current_counter)
                company_market_sentiment_LUT[a_company]['sentiment_indicator'][
                    an_article_info['date']] = updated_article_sentiment_dict
            else:
                company_market_sentiment_LUT[a_company]['sentiment_indicator'][
                    an_article_info['date']] = an_article_sentiment
            log_msg = f"Updated {an_article} (date: {an_article_info['date']}) sentiment analysis to {a_company}: {company_market_sentiment_LUT[a_company]['sentiment_indicator'][an_article_info['date']]}."
            logger.register_log(log_msg, logger_dir, log_filename)
    company_market_sentiment_LUT = add_trade_signal_to_LUT(company_market_sentiment_LUT)
    for a_company, a_company_info in company_market_sentiment_LUT.items():
        company_market_sentiment_LUT[a_company]['total_actionable_days'] = len(
            a_company_info['sentiment_indicator'].keys())
        company_market_sentiment_LUT[a_company]['total_mentioned_times'] = sum(
            [v['mentioned_time'] for v in a_company_info['mentioned_in'].values()])
        company_market_sentiment_LUT[a_company]['sentiment_indicator'] = {
            k: v
            for k, v in sorted(
                a_company_info['sentiment_indicator'].items(),
                key=lambda date: datetime.strptime(date[0], "%Y%m%d"))
        }
    company_market_sentiment_LUT = {
        k: v
        for k, v in sorted(company_market_sentiment_LUT.items(),
                           key=lambda x: x[1]['total_actionable_days'],
                           reverse=True)
    }
    output_path = task_dir + market_data_folder + '/' + output_filename + '.json'
    with open(output_path, 'w+') as output_f:
        json.dump(company_market_sentiment_LUT, output_f, indent=4)
    log_msg = f"{output_filename} successfully exported to {output_path}."
    logger.register_log(log_msg, logger_dir, log_filename)
    clean_trade_signal_log = extract_clean_trade_signal_log(company_market_sentiment_LUT)
    output_path = task_dir + market_data_folder + '/' + 'clean_trade_signal_log' + '.json'
    with open(output_path, 'w+') as output_f:
        json.dump(clean_trade_signal_log, output_f, indent=4)
    log_msg = f"clean_trade_signal_log successfully exported to {output_path}."
    logger.register_log(log_msg, logger_dir, log_filename)
    return mentioned_article_sentiment_dict

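# Minimal sketch of the Counter-based merge used above to accumulate two
# sentiment dicts that fall on the same trading day; the category names and
# counts are made up for illustration.
def _demo_merge_daily_sentiment():
    previous = {'positive': 3, 'negative': 1}
    current = {'positive': 2, 'uncertainty': 4}
    merged = dict(collections.Counter(previous) + collections.Counter(current))
    return merged  # {'positive': 5, 'negative': 1, 'uncertainty': 4}
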
def collect_company_mention_stats(task_dir,
                                  text_folder='articles',
                                  market_data_folder='market_data',
                                  LUT_filename='company_market_LUT',
                                  log_filename='WSJ_data_cleaner_util_log'):
    logger_dir = task_dir + 'logs/'
    log_filename = log_filename + '.txt'
    LUT_dir = task_dir + market_data_folder + '/' + LUT_filename + '.json'
    with open(LUT_dir, 'r') as LUT_f:
        LUT_data = json.load(LUT_f)
    for v in LUT_data.values():
        v['mentioned_in'] = dict()
    article_storage_dir = task_dir + text_folder + '/'
    article_content_folders = os.listdir(article_storage_dir)
    article_content_folders = [
        i for i in article_content_folders
        if os.stat(article_storage_dir + i).st_size != 0
    ]
    for an_article_date_folder in article_content_folders:
        article_list = os.listdir(article_storage_dir + an_article_date_folder)
        for an_article in article_list:
            article_full_dir = article_storage_dir + an_article_date_folder + '/'
            an_article_file_path = article_full_dir + an_article
            with open(an_article_file_path, 'r') as article_f:
                article_json = json.load(article_f)
            article_json['mention'] = dict()
            for a_company in LUT_data.keys():
                # Raw substring count, then deduct matches where the company
                # name runs directly into a lowercase letter on either side
                # (partial-word matches).
                a_company_mentioned_feq = str(article_json['content']).count(str(a_company))
                a_company_mentioned_deduction = 0
                for letter in string.ascii_lowercase:
                    a_company_mentioned_deduction += str(
                        article_json['content']).count(str(a_company) + letter)
                    a_company_mentioned_deduction += str(
                        article_json['content']).count(letter + str(a_company))
                # content = str(article_json['content']).lower()
                # content_no_punctu = content.translate(str.maketrans('', '', string.punctuation))
                a_company_mentioned_feq -= a_company_mentioned_deduction
                if a_company_mentioned_feq > 0:
                    article_json['mention'][a_company] = a_company_mentioned_feq
                    # LUT_data[a_company]['mentioned_in'][article_json['article_id']] = LUT_data[a_company]['mentioned_in'][article_json['article_id']] + a_company_mentioned_feq
                    LUT_data[a_company]['mentioned_in'][article_json['article_id']] = {
                        'mentioned_time': article_json['mention'][a_company],
                        'date': article_json['date']
                    }
                    log_msg = f"Registered {an_article_file_path} mentions of {a_company} for {article_json['mention'][a_company]} times."
                    logger.register_log(log_msg, logger_dir, log_filename)
                else:
                    unfound_msg = f"Found {an_article_file_path} has no ({a_company_mentioned_feq}) mention of {a_company}."
                    # logger.register_log(unfound_msg, logger_dir, log_filename)
                    continue
            with open(an_article_file_path, 'w+') as article_f_n:
                json.dump(article_json, article_f_n, indent=4)
            log_msg = f"Rewrite {an_article_file_path} with updated mention data."
            logger.register_log(log_msg, logger_dir, log_filename)
    with open(LUT_dir, 'w+') as LUT_f_n:
        json.dump(LUT_data, LUT_f_n, indent=4)
    log_msg = f"Rewrite {LUT_dir} with updated mentioned_in data."
    logger.register_log(log_msg, logger_dir, log_filename)

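# Self-contained sketch of the mention-counting heuristic above: raw substring
# count minus occurrences where the company name is glued to a lowercase letter
# on either side (partial-word matches). The sample content string is invented.
def _demo_count_company_mentions(content='Apple shipped new devices; applesauce and Apples were not news.',
                                 company='Apple'):
    raw_count = content.count(company)
    deduction = 0
    for letter in string.ascii_lowercase:
        deduction += content.count(company + letter)
        deduction += content.count(letter + company)
    return raw_count - deduction  # -> 1 for the sample arguments
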
def get_articles(driver, output_dir):
    url_storage_dir_prefix = output_dir + 'article_urls/'
    article_storage_dir_prefix = output_dir + 'articles/'
    logger_dir = output_dir + 'logs/'
    log_filename = 'articles_log.txt'
    if not os.path.exists(article_storage_dir_prefix):
        os.makedirs(article_storage_dir_prefix)
    url_file_list = os.listdir(url_storage_dir_prefix)
    url_file_list = [
        i for i in url_file_list
        if os.stat(url_storage_dir_prefix + i).st_size != 0
    ]
    url_file_list = sorted(url_file_list)
    article_url_prefix = 'https://www.wsj.com/articles/'
    for an_url_file in url_file_list:
        a_date = an_url_file[:-4]
        article_storage_dir = article_storage_dir_prefix + '/' + a_date + '/'
        if not os.path.exists(article_storage_dir):
            os.makedirs(article_storage_dir)
        with open(url_storage_dir_prefix + an_url_file) as url_f:
            article_url_list = [i for i in url_f]
        url_counter = 0
        for an_article_url_suffix in article_url_list:
            an_article_url_suffix = an_article_url_suffix.strip()
            an_article_url = article_url_prefix + an_article_url_suffix
            driver.get(an_article_url)
            soup = BeautifulSoup(driver.page_source, 'lxml')
            json_output = {
                'channel': 'WSJ',
                'date': a_date,
                'url': an_article_url
            }
            quote_tags = soup.find_all('a', href=True)
            quote_list = []
            for i in quote_tags:
                url = i['href']
                text = i.text
                if 'market-data/quotes/' in url and len(
                        url.split('/')) == 6 and '?mod=' not in url:
                    quote_list.append((text, url))
            author_tag = soup.find("span", {"class": "author-name"})
            headline_tag = soup.find("h1", {"class": "wsj-article-headline"})
            author = None
            headline = None
            try:
                author = author_tag.get_text().strip()
                headline = headline_tag.get_text().strip()
            except AttributeError as e:
                error_log = f"{a_date}'s {an_article_url} failed author: '{author}' or headline: '{headline}' retrieval due to {e}."
                logger.register_log(error_log, logger_dir, log_filename)
            content_tag = soup.find("div", {"class": "article-content"})
            try:
                content = content_tag.get_text().strip()
            except AttributeError as e:
                error_log = f"{a_date}'s {an_article_url} failed during content retrieval due to {e} (content tag)."
                logger.register_log(error_log, logger_dir, log_filename)
                continue
            json_output['author'] = author
            json_output['headline'] = headline
            json_output['quotes'] = quote_list
            json_output['content'] = content
            # with open(article_storage_dir + an_article_url_suffix + '-quotes.txt', 'w+') as quote_f:
            #     quote_f.write('\n'.join(quote_list))
            with open(article_storage_dir + an_article_url_suffix + '.json', 'w+') as output_f:
                # content_f.write(content)
                json.dump(json_output, output_f, indent=4)
            url_counter += 1
            output_log = f"{a_date}'s {an_article_url_suffix} done (quotes: {len(quote_list)}; content: {len(content)}) | #{url_counter}/{len(article_url_list)}"
            logger.register_log(output_log, logger_dir, log_filename)

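# Hedged end-to-end sketch of how the scraping stages above might be chained.
# The Chrome driver, the task directory, and the date window are assumptions
# for illustration; they are not defined by this module.
def _example_wsj_scrape_pipeline(output_dir='./WSJ_task/'):
    from selenium import webdriver  # assumed driver backend
    driver = webdriver.Chrome()
    try:
        duration = {'start_time': '20200101', 'end_time': '20200103'}  # assumed %Y%m%d strings
        get_article_urls(driver, duration, output_dir)  # writes article_urls/<date>.txt
        get_articles(driver, output_dir)                # writes articles/<date>/<id>.json
    finally:
        driver.quit()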