# Standard-library and third-party imports used by the utilities below.
import collections
import copy
import json
import os
import re
import string
from datetime import datetime, timedelta

from bs4 import BeautifulSoup

# `logger`, `sentiment_analyzer`, and scraping helpers such as
# `get_market_data_tags`, `get_market_data_via_google`,
# `add_trade_signal_to_LUT`, and `extract_clean_trade_signal_log` are
# project-local and assumed to be defined or imported elsewhere.


def rename_unit_text_as_id(task_dir,
                           text_folder='articles',
                           id_key='article_id',
                           log_filename='WSJ_text_market_util_log'):
    logger_dir = task_dir + 'logs/'
    log_filename = log_filename + '.txt'

    article_storage_dir = task_dir + text_folder + '/'
    article_content_folders = os.listdir(article_storage_dir)
    article_content_folders = [
        i for i in article_content_folders
        if os.stat(article_storage_dir + i).st_size != 0
    ]

    for an_article_date_folder in article_content_folders:
        article_list = os.listdir(article_storage_dir + an_article_date_folder)

        for an_article in article_list:
            article_full_dir = article_storage_dir + an_article_date_folder + '/'
            article_file_path_o = article_full_dir + an_article
            try:
                with open(article_file_path_o, 'r') as article_f:
                    article_json = json.load(article_f)
                    article_file_name_n = article_json[id_key]
            except KeyError as e:
                error_msg = f"{article_file_path_o} failed to change filename to '{id_key}' due to {e}."
                logger.register_log(error_msg, logger_dir, log_filename)
                continue
            article_file_path_n = article_full_dir + str(article_file_name_n) + '.json'

            os.rename(article_file_path_o, article_file_path_n)
            log_msg = f"{article_file_path_o} successfully renamed as '{article_file_name_n}'."
            logger.register_log(log_msg, logger_dir, log_filename)
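
# Every function in this file logs through `logger.register_log(msg, logger_dir,
# log_filename)`, a project-local helper that is not shown here. A minimal
# sketch of the behaviour the call sites assume (append a timestamped line to
# logger_dir + log_filename, creating the directory first) might look like:
#
#     def register_log(msg, logger_dir, log_filename):
#         os.makedirs(logger_dir, exist_ok=True)
#         with open(logger_dir + log_filename, 'a') as log_f:
#             log_f.write(f"{datetime.now().isoformat()} | {msg}\n")
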
def replace_quote_with_market_data_LUT_info(
        task_dir,
        text_folder='articles',
        market_data_folder='market_data',
        LUT_filename='company_market_LUT',
        LUT_quote_key='quoted_in',
        log_filename='WSJ_text_market_util_log'):
    logger_dir = task_dir + 'logs/'
    log_filename = log_filename + '.txt'

    article_full_path_dict = dict()
    article_storage_dir = task_dir + text_folder + '/'
    article_content_folders = os.listdir(article_storage_dir)
    article_content_folders = [
        i for i in article_content_folders
        if os.stat(article_storage_dir + i).st_size != 0
    ]
    for an_article_date_folder in article_content_folders:
        article_list = os.listdir(article_storage_dir + an_article_date_folder)
        for an_article in article_list:
            an_article_full_path = article_storage_dir + an_article_date_folder + '/' + an_article
            # Strip the '.json' extension so the dict key matches the article id.
            an_article = an_article.split('.')[0]
            article_full_path_dict[an_article] = an_article_full_path
            with open(an_article_full_path, 'r') as article_f_o:
                article_json = json.load(article_f_o)
                article_json['quotes'] = []
            with open(an_article_full_path, 'w+') as article_f_n:
                json.dump(article_json, article_f_n, indent=4)

    LUT_dir = task_dir + market_data_folder + '/' + LUT_filename + '.json'
    with open(LUT_dir, 'r') as LUT_f:
        LUT_data = json.load(LUT_f)

    for k, v in LUT_data.items():
        if len(v[LUT_quote_key]) != 0:
            for an_article_id in v[LUT_quote_key]:
                try:
                    an_article_full_path = article_full_path_dict[
                        an_article_id]
                except KeyError as e:
                    error_msg = f"{an_article_id} under {k} failed to retrieve full path from article_full_path_dict."
                    logger.register_log(error_msg, logger_dir, log_filename)
                    continue

                with open(an_article_full_path, 'r') as article_f_o:
                    article_json = json.load(article_f_o)
                    article_json['quotes'] = article_json['quotes'] + [str(k)]
                with open(an_article_full_path, 'w+') as article_f_n:
                    json.dump(article_json, article_f_n, indent=4)

                log_msg = f"{an_article_full_path} successfully updated with {k} as quotes."
                logger.register_log(log_msg, logger_dir, log_filename)
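
# A hedged usage sketch (the task directory is illustrative): the renaming step
# has to run first so that the article file names match the article ids stored
# under the LUT's 'quoted_in' lists, e.g.
#
#     task_dir = './wsj_task/'                        # hypothetical path
#     rename_unit_text_as_id(task_dir)                # file names -> article_id
#     replace_quote_with_market_data_LUT_info(task_dir)
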
def get_market_data_via_WSJ(driver,
                            output_dir,
                            retry_limit=1,
                            google_search_result_collect_limit=3,
                            scale_serp_api_key=None):
    market_data_output_dir = output_dir + 'market_data/'
    logger_dir = output_dir + 'logs/'
    log_filename = 'WSJ_market_data_log.txt'

    with open(market_data_output_dir + 'raw_company_market_LUT.json',
              'r') as LUT_f:
        raw_company_market_LUT = json.load(LUT_f)

    company_market_LUT = dict()
    checked_url_list = []
    for (a_company, v) in raw_company_market_LUT.items():

        target_url_list = v['url']
        googled_flag = False
        for an_url in target_url_list:
            if an_url in checked_url_list:
                continue

            retry_counter = 0
            # Re-fetch the quote page until every expected tag is present or
            # the retry limit is exhausted.
            while True:
                tags = get_market_data_tags(driver, an_url)
                retry_counter += 1
                if None not in tags or retry_counter > retry_limit:
                    break
            checked_url_list.append(an_url)

            updated_LUT = register_tags_to_LUT(company_market_LUT,
                                               raw_company_market_LUT, tags,
                                               a_company, an_url, output_dir)

            if updated_LUT is not False:
                company_market_LUT = updated_LUT
                break
            else:
                # After the last known URL fails, fall back to a one-off Google
                # search and append its candidates to the list being iterated.
                if an_url == target_url_list[-1] and not googled_flag:
                    google_url_candidates, log_msg = get_market_data_via_google(
                        a_company, google_search_result_collect_limit,
                        scale_serp_api_key)
                    target_url_list.extend(google_url_candidates)
                    logger.register_log(log_msg, logger_dir, log_filename)
                    googled_flag = True

    with open(market_data_output_dir + 'company_market_LUT.json',
              'w') as output_f:
        json.dump(company_market_LUT, output_f, indent=4)
def scrape_headline_urls(driver, an_archive_url, logger_dir, log_filename):
    driver.get(an_archive_url)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    archive_headline_url_tags = soup.find_all(
        'h2', attrs={'class': 'WSJTheme--headline--unZqjb45'})
    archive_headline_url_list = [
        i.find('a')['href'] for i in archive_headline_url_tags
    ]

    stripped_archive_headline_url_list = []
    for an_o_url in archive_headline_url_list:
        regex_match = re.search(r"SB\d+", an_o_url)
        if regex_match is None:
            log_msg = f"Hyperlink {an_o_url} skipped: no WSJ article id (SB...) found."
            logger.register_log(log_msg, logger_dir, log_filename)
            continue
        a_stripped_url = regex_match.group(0)
        stripped_archive_headline_url_list.append(a_stripped_url)

    return stripped_archive_headline_url_list
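
# Example of the id extraction above (the href shape is hypothetical):
#
#     >>> re.search(r"SB\d+", "/articles/SB1234567890?mod=hp_lead").group(0)
#     'SB1234567890'
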
def collect_article_sentiment_analysis(
        task_dir,
        mentioned_article_dict,
        output_filename='mentioned_articles_sentiment_analysis',
        text_folder='articles',
        market_data_folder='market_data',
        log_filename='WSJ_dummy_model_log'):
    logger_dir = task_dir + 'logs/'
    log_filename = log_filename + '.txt'
    article_storage_dir = task_dir + text_folder + '/'

    sentiment_word_dict = sentiment_analyzer.get_sentiment_word_dict()

    mentioned_article_sentiment_dict = copy.deepcopy(mentioned_article_dict)

    for an_article_id, an_article_meta in mentioned_article_dict.items():
        an_article_path = article_storage_dir + an_article_meta[
            'date'] + '/' + an_article_id + '.json'

        try:
            with open(an_article_path, 'r') as article_f:
                article_json = json.load(article_f)
                article_content = article_json['content']
        except FileNotFoundError:
            error_msg = f"{an_article_id} not found with {an_article_path} (date: {an_article_meta['date']})."
            logger.register_log(error_msg, logger_dir, log_filename)
            continue

        an_article_sentiment_analysis_dict = sentiment_analyzer.get_sentiment_analysis(
            article_content, sentiment_word_dict, most_common_thld=100)

        mentioned_article_sentiment_dict[an_article_id][
            'sentiment_analysis'] = an_article_sentiment_analysis_dict
        log_msg = f"Registered {an_article_id} (date: {an_article_meta['date']}) sentiment analysis: {an_article_sentiment_analysis_dict}."
        logger.register_log(log_msg, logger_dir, log_filename)

    output_path = task_dir + market_data_folder + '/' + output_filename + '.json'
    with open(output_path, 'w+') as output_f:
        json.dump(mentioned_article_sentiment_dict, output_f, indent=4)
    return mentioned_article_sentiment_dict
def get_article_urls(driver, duration, output_dir):
    article_urls_storage_dir = output_dir + 'article_urls/'
    logger_dir = output_dir + 'logs/'
    log_filename = 'article_urls_log.txt'
    if not os.path.exists(article_urls_storage_dir):
        os.makedirs(article_urls_storage_dir)

    from_date = datetime.strptime(duration['start_time'], "%Y%m%d").date()
    to_date = datetime.strptime(duration['end_time'], "%Y%m%d").date()
    delta = to_date - from_date
    date_list = []
    for i in range(delta.days + 1):
        date_list.append(from_date + timedelta(days=i))

    archive_url_prefix = 'https://www.wsj.com/news/archive/'
    for a_date in date_list:
        an_archive_url = archive_url_prefix + a_date.strftime("%Y%m%d")

        #     archive_page = request_session.get(a_archive_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'})
        #     time.sleep(archive_load_sleep_time)
        #     soup = BeautifulSoup(archive_page.text, 'lxml')

        daily_article_url_list = []
        try:
            daily_article_url_list = scrape_headline_urls(
                driver, an_archive_url, logger_dir, log_filename)
        except AttributeError as e:
            log_msg = f'{a_date.strftime("%Y%m%d")} not retrieved due to {e}.'
            logger.register_log(log_msg, logger_dir, log_filename)
            continue
        # Retry until at least one headline URL comes back, or parsing raises.
        while len(daily_article_url_list) == 0:
            try:
                daily_article_url_list = scrape_headline_urls(
                    driver, an_archive_url, logger_dir, log_filename)
            except (AttributeError, TypeError) as e:
                log_msg = f'{a_date.strftime("%Y%m%d")} not retrieved due to {e}.'
                logger.register_log(log_msg, logger_dir, log_filename)
                break

        with open(
                article_urls_storage_dir + a_date.strftime("%Y%m%d") + '.txt',
                'w+') as f:
            f.write('\n'.join(daily_article_url_list))

        log_msg = f'{a_date.strftime("%Y%m%d")} done, urls to {len(daily_article_url_list)} articles retrieved.'
        logger.register_log(log_msg, logger_dir, log_filename)
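
# A hedged usage sketch (driver setup and paths are assumptions, not part of
# this module). `duration` takes YYYYMMDD strings and both endpoints are
# inclusive:
#
#     from selenium import webdriver
#
#     driver = webdriver.Chrome()
#     get_article_urls(driver,
#                      {'start_time': '20200101', 'end_time': '20200107'},
#                      './wsj_task/')
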
def register_tags_to_LUT(company_market_LUT, raw_company_market_LUT, tags,
                         a_company, an_url, output_dir):
    logger_dir = output_dir + 'logs/'
    log_filename = 'WSJ_market_data_log.txt'

    valid_company_flag = True
    try:
        if len(tags) == 2:
            ticker_exchange_text = tags[0].get_text().strip()
            ticker_text = ticker_exchange_text.split('(')[0].strip()
            exchange_text = ticker_exchange_text.split('(')[1][:-1].strip()
        elif len(tags) == 3:
            ticker_text = tags[0].get_text().strip()
            exchange_text = tags[1].get_text().strip().strip('(').strip(')')
        else:
            raise AttributeError

    except AttributeError as e:
        ticker_text = 'ticker not on stock market'
        exchange_text = 'exchange not on stock market'
        valid_company_flag = False
    try:
        legal_full_name_text = tags[-1].get_text().strip()
    except AttributeError as e:
        legal_full_name_text = 'legal full name not on stock market'
        valid_company_flag = False

    US_market_flag = True
    if valid_company_flag:
        if 'U.S.' not in exchange_text:
            valid_company_flag = False
            US_market_flag = False

    if valid_company_flag:
        company_market_LUT[a_company] = dict()
        company_market_LUT[a_company]['market_data_url'] = an_url
        company_market_LUT[a_company]['ticker'] = ticker_text
        company_market_LUT[a_company]['exchange'] = exchange_text
        company_market_LUT[a_company]['legal_full_name'] = legal_full_name_text
        company_market_LUT[a_company]['quoted_in'] = raw_company_market_LUT[
            a_company]['quoted_in']

        log_msg = f"{a_company} successfully registered with information from {an_url} (quoted {len(company_market_LUT[a_company]['quoted_in'])}): ticker: {company_market_LUT[a_company]['ticker']}, exchange: {company_market_LUT[a_company]['exchange']}, legal_full_name: {company_market_LUT[a_company]['legal_full_name']}."
        logger.register_log(log_msg, logger_dir, log_filename)

        return company_market_LUT

    else:
        error_logged_flag = False
        if None in tags:
            error_log_msg = f"{a_company} failed to register with {an_url} due to tags being: {tags}."
            logger.register_log(error_log_msg, logger_dir, log_filename)
            error_logged_flag = True

        if not US_market_flag:
            error_log_msg = f"{a_company} failed to register with {an_url} because the exchange is non-US: {exchange_text}."
            logger.register_log(error_log_msg, logger_dir, log_filename)
            error_logged_flag = True

        if not error_logged_flag:
            error_log_msg = f"{a_company} failed to register with {an_url} for unknown reasons."
            logger.register_log(error_log_msg, logger_dir, log_filename)

        return False
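
# Worked example of the two-tag layout handled above (the tag text is
# hypothetical): ticker and exchange arrive in a single "TICKER (EXCHANGE)"
# string and are split on the opening parenthesis.
#
#     >>> text = "AAPL (NASDAQ U.S.)"
#     >>> text.split('(')[0].strip(), text.split('(')[1][:-1].strip()
#     ('AAPL', 'NASDAQ U.S.')
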
def calculate_company_sentiment_stats(
        task_dir,
        mentioned_article_sentiment_dict,
        output_filename='company_market_sentiment_LUT',
        market_data_folder='market_data',
        LUT_filename='company_market_LUT',
        log_filename='WSJ_dummy_model_log'):
    logger_dir = task_dir + 'logs/'
    log_filename = log_filename + '.txt'

    LUT_dir = task_dir + market_data_folder + '/' + LUT_filename + '.json'
    with open(LUT_dir, 'r') as LUT_f:
        LUT_data = json.load(LUT_f)
    company_market_sentiment_LUT = copy.deepcopy(LUT_data)

    for v in company_market_sentiment_LUT.values():
        v['sentiment_indicator'] = dict()

    # ZeroDivisionError
    for a_company, a_company_info in LUT_data.items():
        for an_article, an_article_info in a_company_info[
                'mentioned_in'].items():
            an_article_sentiment = mentioned_article_sentiment_dict[
                an_article]['sentiment_analysis']

            # Weight each sentiment category by how many times the company is
            # mentioned in the article. This mutates mentioned_article_sentiment_dict
            # in place, since an_article_sentiment aliases one of its entries.
            for a_category in an_article_sentiment.keys():
                an_article_sentiment[a_category] = (
                    an_article_sentiment[a_category] *
                    an_article_info['mentioned_time'])

            if an_article_info['date'] in company_market_sentiment_LUT[
                    a_company]['sentiment_indicator']:
                pre_counter = collections.Counter(
                    company_market_sentiment_LUT[a_company]
                    ['sentiment_indicator'][an_article_info['date']])
                current_counter = collections.Counter(an_article_sentiment)
                updated_article_sentiment_dict = dict(pre_counter +
                                                      current_counter)
                company_market_sentiment_LUT[a_company]['sentiment_indicator'][
                    an_article_info['date']] = updated_article_sentiment_dict
            else:
                company_market_sentiment_LUT[a_company]['sentiment_indicator'][
                    an_article_info['date']] = an_article_sentiment

            log_msg = f"Updated {an_article} (date: {an_article_info['date']}) sentiment analysis to {a_company}: {company_market_sentiment_LUT[a_company]['sentiment_indicator'][an_article_info['date']]}."
            logger.register_log(log_msg, logger_dir, log_filename)

    company_market_sentiment_LUT = add_trade_signal_to_LUT(
        company_market_sentiment_LUT)

    for a_company, a_company_info in company_market_sentiment_LUT.items():
        company_market_sentiment_LUT[a_company]['total_actionable_days'] = len(
            a_company_info['sentiment_indicator'].keys())
        company_market_sentiment_LUT[a_company]['total_mentioned_times'] = sum(
            [
                v['mentioned_time']
                for v in a_company_info['mentioned_in'].values()
            ])

        company_market_sentiment_LUT[a_company]['sentiment_indicator'] = {
            k: v
            for k, v in sorted(
                a_company_info['sentiment_indicator'].items(),
                key=lambda date: datetime.strptime(date[0], "%Y%m%d"))
        }

    company_market_sentiment_LUT = {
        k: v
        for k, v in sorted(company_market_sentiment_LUT.items(),
                           key=lambda x: x[1]['total_actionable_days'],
                           reverse=True)
    }

    output_path = task_dir + market_data_folder + '/' + output_filename + '.json'
    with open(output_path, 'w+') as output_f:
        json.dump(company_market_sentiment_LUT, output_f, indent=4)

    log_msg = f"{output_filename} successfully exported to {output_path}."
    logger.register_log(log_msg, logger_dir, log_filename)

    clean_trade_signal_log = extract_clean_trade_signal_log(
        company_market_sentiment_LUT)
    output_path = task_dir + market_data_folder + '/' + 'clean_trade_signal_log' + '.json'
    with open(output_path, 'w+') as output_f:
        json.dump(clean_trade_signal_log, output_f, indent=4)
    log_msg = f"clean_trade_signal_log successfully exported to {output_path}."
    logger.register_log(log_msg, logger_dir, log_filename)

    return mentioned_article_sentiment_dict
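
# The per-date aggregation above relies on Counter addition to merge two
# sentiment dicts category by category, e.g.
#
#     >>> dict(collections.Counter({'positive': 3, 'negative': 1}) +
#     ...      collections.Counter({'positive': 2, 'uncertainty': 4}))
#     {'positive': 5, 'negative': 1, 'uncertainty': 4}
#
# Note that Counter addition only keeps counts that end up strictly positive,
# so any category whose combined value is zero or negative is silently dropped.
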
def collect_company_mention_stats(task_dir,
                                  text_folder='articles',
                                  market_data_folder='market_data',
                                  LUT_filename='company_market_LUT',
                                  log_filename='WSJ_data_cleaner_util_log'):
    logger_dir = task_dir + 'logs/'
    log_filename = log_filename + '.txt'

    LUT_dir = task_dir + market_data_folder + '/' + LUT_filename + '.json'
    with open(LUT_dir, 'r') as LUT_f:
        LUT_data = json.load(LUT_f)

    for v in LUT_data.values():
        v['mentioned_in'] = dict()

    article_storage_dir = task_dir + text_folder + '/'
    article_content_folders = os.listdir(article_storage_dir)
    article_content_folders = [
        i for i in article_content_folders
        if os.stat(article_storage_dir + i).st_size != 0
    ]

    for an_article_date_folder in article_content_folders:
        article_list = os.listdir(article_storage_dir + an_article_date_folder)

        for an_article in article_list:
            article_full_dir = article_storage_dir + an_article_date_folder + '/'
            an_article_file_path = article_full_dir + an_article
            with open(an_article_file_path, 'r') as article_f:
                article_json = json.load(article_f)
                article_json['mention'] = dict()

            for a_company in LUT_data.keys():
                a_company_mentioned_feq = str(article_json['content']).count(
                    str(a_company))
                a_company_mentioned_deduction = 0
                for letter in string.ascii_lowercase:
                    a_company_mentioned_deduction += str(
                        article_json['content']).count(
                            str(a_company) + letter)
                    a_company_mentioned_deduction += str(
                        article_json['content']).count(letter + str(a_company))

                # content = str(article_json['content']).lower()
                # content_no_punctu = content.translate(str.maketrans('', '', string.punctuation))

                a_company_mentioned_feq -= a_company_mentioned_deduction

                if a_company_mentioned_feq > 0:
                    article_json['mention'][
                        a_company] = a_company_mentioned_feq
                    # LUT_data[a_company]['mentioned_in'][article_json['article_id']] = LUT_data[a_company]['mentioned_in'][article_json['article_id']] + a_company_mentioned_feq
                    LUT_data[a_company]['mentioned_in'][
                        article_json['article_id']] = {
                            'mentioned_time':
                            article_json['mention'][a_company],
                            'date': article_json['date']
                        }

                    log_msg = f"Registered {an_article_file_path} mentions of {a_company} for {article_json['mention'][a_company]} times."
                    logger.register_log(log_msg, logger_dir, log_filename)
                else:
                    unfound_msg = f"{an_article_file_path} has no net mention ({a_company_mentioned_feq}) of {a_company}."
                    # logger.register_log(unfound_msg, logger_dir, log_filename)
                    continue

            with open(an_article_file_path, 'w+') as article_f_n:
                json.dump(article_json, article_f_n, indent=4)

            log_msg = f"Rewrite {an_article_file_path} with updated mention data."
            logger.register_log(log_msg, logger_dir, log_filename)

    with open(LUT_dir, 'w+') as LUT_f_n:
        json.dump(LUT_data, LUT_f_n, indent=4)

    log_msg = f"Rewrite {LUT_dir} with updated mentioned_in data."
    logger.register_log(log_msg, logger_dir, log_filename)
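
# Worked example of the mention counting above (the sample text is
# hypothetical): plain substring counting over-counts company names embedded in
# longer words, so one count is deducted whenever the name is directly preceded
# or followed by a lowercase letter.
#
#     >>> content = "Apple rallied while Appleton Partners stayed flat."
#     >>> content.count("Apple")
#     2
#     >>> sum(content.count("Apple" + c) + content.count(c + "Apple")
#     ...     for c in string.ascii_lowercase)
#     1
#
# Net mention count for "Apple": 2 - 1 = 1.
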
def get_articles(driver, output_dir):
    url_storage_dir_prefix = output_dir + 'article_urls/'
    article_storage_dir_prefix = output_dir + 'articles/'
    logger_dir = output_dir + 'logs/'
    log_filename = 'articles_log.txt'

    if not os.path.exists(article_storage_dir_prefix):
        os.makedirs(article_storage_dir_prefix)

    url_file_list = os.listdir(url_storage_dir_prefix)
    url_file_list = [
        i for i in url_file_list
        if os.stat(url_storage_dir_prefix + i).st_size != 0
    ]
    url_file_list = sorted(url_file_list)

    article_url_prefix = 'https://www.wsj.com/articles/'

    for an_url_file in url_file_list:
        a_date = an_url_file[:-4]
        article_storage_dir = article_storage_dir_prefix + a_date + '/'
        if not os.path.exists(article_storage_dir):
            os.makedirs(article_storage_dir)

        with open(url_storage_dir_prefix + an_url_file) as url_f:
            article_url_list = [i for i in url_f]

        url_counter = 0
        for an_article_url_suffix in article_url_list:
            an_article_url_suffix = an_article_url_suffix.strip()
            an_article_url = article_url_prefix + an_article_url_suffix
            driver.get(an_article_url)
            soup = BeautifulSoup(driver.page_source, 'lxml')

            json_output = {
                'channel': 'WSJ',
                'date': a_date,
                'url': an_article_url
            }

            quote_tags = soup.find_all('a', href=True)
            quote_list = []
            for i in quote_tags:
                url = i['href']
                text = i.text
                if 'market-data/quotes/' in url and len(
                        url.split('/')) == 6 and '?mod=' not in url:
                    quote_list.append((text, url))

            author_tag = soup.find("span", {"class": "author-name"})
            headline_tag = soup.find("h1", {"class": "wsj-article-headline"})

            author = None
            headline = None
            # Extract author and headline independently so a missing author tag
            # does not prevent the headline from being captured.
            try:
                author = author_tag.get_text().strip()
            except AttributeError as e:
                error_log = f"{a_date}'s {an_article_url} failed author retrieval due to {e}."
                logger.register_log(error_log, logger_dir, log_filename)
            try:
                headline = headline_tag.get_text().strip()
            except AttributeError as e:
                error_log = f"{a_date}'s {an_article_url} failed headline retrieval due to {e}."
                logger.register_log(error_log, logger_dir, log_filename)

            content_tag = soup.find("div", {"class": "article-content"})
            try:
                content = content_tag.get_text().strip()
            except AttributeError as e:
                error_log = f"{a_date}'s {an_article_url} failed during content retrieval due to {e} (content tag)."
                logger.register_log(error_log, logger_dir, log_filename)
                continue

            json_output['author'] = author
            json_output['headline'] = headline
            json_output['quotes'] = quote_list
            json_output['content'] = content

            # with open(article_storage_dir + an_article_url_suffix + '-quotes.txt', 'w+') as quote_f:
            #     quote_f.write('\n'.join(quote_list))
            with open(article_storage_dir + an_article_url_suffix + '.json',
                      'w+') as output_f:
                # content_f.write(content)
                json.dump(json_output, output_f, indent=4)

            url_counter += 1
            output_log = f"{a_date}'s {an_article_url_suffix} done (quotes: {len(quote_list)}; content: {len(content)}) | #{url_counter}/{len(article_url_list)}"
            logger.register_log(output_log, logger_dir, log_filename)
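
# Each scraped article is written to articles/<date>/<SB id>.json with the
# shape below (values are illustrative). Downstream utilities additionally
# expect an 'article_id' field, which is presumably added by a step not shown
# in this file.
#
#     {
#         "channel": "WSJ",
#         "date": "20200101",
#         "url": "https://www.wsj.com/articles/SB...",
#         "author": "...",
#         "headline": "...",
#         "quotes": [["Company Name", "https://www.wsj.com/market-data/quotes/..."]],
#         "content": "..."
#     }
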