Bloomberg With Sentiment.py
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 25 14:57:08 2018
@author: Varun.Londhe
"""
import os
import errno
import re
import statistics as st
from urllib import parse

import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse as p
from nltk.tokenize import PunktSentenceTokenizer
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import Constants
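
# Constants is a local helper module, not a PyPI package. A minimal sketch of
# what it is assumed to define, inferred from its use below (the names are the
# author's; the values shown here are educated guesses, not from the source):
#
#     NO_SCRIPT, DIV, A, H1, INPUT = "noscript", "div", "a", "h1", "input"
#     CLASS, HREF = "class", "href"
#     EMPYT_STRING = ""          # (spelling as used in this script)
#     INDEX_1 = 1
#     TIME_ZONE_EDT, TIME_ZONE_IST = "US/Eastern", "Asia/Kolkata"
#     DATE_FORMAT = "%d-%m-%Y %H:%M:%S"
#     BLOOMBERG_HEADLINK, BLOOMBERG_TAILLINK = search-URL prefix and page suffix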


def bloomberg_scrapper(url):
    """Return the date, source, heading and article content scraped from `url`."""
    all_contents = []
    try:
        web_page = requests.get(url)
        web_page.raise_for_status()
    except requests.exceptions.RequestException as err:
        # Bail out early: without a successful response there is nothing to parse.
        print(err)
        return all_contents
    bloomberg_soup = BeautifulSoup(web_page.content, "lxml")
    try:
        extract_date = bloomberg_soup.find_all(Constants.NO_SCRIPT)
        paragraphs = bloomberg_soup.find(Constants.DIV,
                                         {Constants.CLASS: "transporter-item current"}).find_all("p")
        article = Constants.EMPYT_STRING
        for content in paragraphs:
            article = article + content.text
        # The source is the second hostname label, e.g. "bloomberg" in www.bloomberg.com.
        source = parse.urlparse(url).hostname.split(".")[Constants.INDEX_1]
        # Convert the article timestamp from US Eastern to IST before storing it.
        old_timezone = pytz.timezone(Constants.TIME_ZONE_EDT)
        new_timezone = pytz.timezone(Constants.TIME_ZONE_IST)
        my_timestamp = p(extract_date[1].text)
        my_timestamp_in_new_timezone = old_timezone.localize(my_timestamp).astimezone(new_timezone)
        all_contents.append(my_timestamp_in_new_timezone.strftime(Constants.DATE_FORMAT))
        all_contents.append(source)
        all_contents.append(bloomberg_soup.title.text)
        all_contents.append(article)
    except AttributeError:
        # The page layout did not match the expected structure; return what was collected.
        pass
    return all_contents
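
# Usage sketch (the URL below is illustrative, not taken from the repository):
#
#     details = bloomberg_scrapper("https://www.bloomberg.com/news/articles/...")
#     if details:
#         date_ist, source, heading, article = details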
# *************************************************************************************************
# Read the company watch list, one company name per line.
with open("C:/Users/varun.londhe/Documents/Python Practice/Reuteurs/Company Repository.txt",
          "r") as company_file:
    READCOMPANY = company_file.read().splitlines()
df = pd.DataFrame()
LABELS = ["Company", "Date", "Source", "URL", "Heading", "Content"]
for company in READCOMPANY:
    COUNTER = 1
    # Walk the paginated search results until no "next page" link is found.
    while COUNTER is not None:
        searchUrl = Constants.BLOOMBERG_HEADLINK + company + Constants.BLOOMBERG_TAILLINK \
                    + str(COUNTER)
        try:
            searchResponse = requests.get(searchUrl)
            searchResponse.raise_for_status()
        except requests.exceptions.RequestException as err:
            print(err)
            break  # give up on this company rather than parse a failed response
        bloomberg_search_soup = BeautifulSoup(searchResponse.content, "lxml")
        findCompany = bloomberg_search_soup.find(
            Constants.INPUT, {Constants.CLASS: "settings-search-box__input"}).get('value')
        try:
            findNext = bloomberg_search_soup.find(
                Constants.A, {Constants.CLASS: "content-next-link"}).text
            COUNTER += 1
        except AttributeError:
            # No "next" link: this was the last page of results for the company.
            COUNTER = None
            print("The counter is now none for", findCompany)
        findIndividualSearch = bloomberg_search_soup.find_all(
            Constants.H1, {Constants.CLASS: "search-result-story__headline"})
        for ind in findIndividualSearch:
            article_url = ind.find(Constants.A,
                                   {Constants.HREF: re.compile("/")}).get(Constants.HREF)
            alldetails = bloomberg_scrapper(article_url)
            # Keep the article only if the company name appears in its body text.
            if alldetails and company in alldetails[3]:
                alldetails.insert(0, findCompany)
                alldetails.insert(3, article_url)
                df = pd.concat([df, pd.DataFrame([alldetails], columns=LABELS)])
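
# At this point df holds one row per matched article with the LABELS columns.
# A quick sanity check during development might be (illustrative, not required):
#
#     print(df.shape)
#     print(df.drop(columns=["Content"]).head())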
# *************************************************************************************************


def find_whole_word(w):
    """Return a case-insensitive regex search for `w`, used to locate the
    article's byline/metadata (e.g. the '/Bloomberg' marker) inside the text."""
    return re.compile(r'({0})'.format(w), flags=re.IGNORECASE).search
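
# Usage sketch: find_whole_word returns the compiled pattern's `.search`, so
# calling the result gives an re.Match or None (toy string, not real output):
#
#     text = "(Quint/Bloomberg) Shares rose on Tuesday ..."
#     match = find_whole_word('/Bloomberg')(text)
#     body = text[match.span()[1] + 1:] if match else text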
POLARITY_TEXTBLOB = []
SUBJECTIVITY = []
POLARITY_VADER = []
POLARITY_ARTICLE = []
TEXTBLOB_FULL_ARTICLE = []
VADER_ANALYZER = SentimentIntensityAnalyzer()  # reused for every sentence and article
SENTENCE_TOKENIZER = PunktSentenceTokenizer()
for news in df["Content"]:
    VADER_ARTICLE_COMPOUND = []
    TEXTBLOB_ARTICLE_POLARITY = []
    TEXTBLOB_ARTICLE_SUBJECTIVITY = []
    try:
        # Skip everything up to the '/Bloomberg' byline so that boilerplate
        # does not skew the sentence-level sentiment scores.
        a = find_whole_word('/Bloomberg')(news).span()[1]
        # b = find_whole_word('Reporting by')(news).span()[0]
        sentences = SENTENCE_TOKENIZER.tokenize(news[a + 1:])
    except AttributeError:
        # No '/Bloomberg' marker found: score the whole article.
        sentences = SENTENCE_TOKENIZER.tokenize(news)
    for sentence in sentences:
        vs = VADER_ANALYZER.polarity_scores(sentence)
        textBlobAnalyzer = TextBlob(sentence)
        VADER_ARTICLE_COMPOUND.append(vs["compound"])
        TEXTBLOB_ARTICLE_POLARITY.append(textBlobAnalyzer.sentiment.polarity)
        TEXTBLOB_ARTICLE_SUBJECTIVITY.append(textBlobAnalyzer.sentiment.subjectivity)
    # Article-level scores: the mean of the per-sentence scores, plus a single
    # score computed over the full text for comparison.
    POLARITY_TEXTBLOB.append(st.mean(TEXTBLOB_ARTICLE_POLARITY))
    SUBJECTIVITY.append(st.mean(TEXTBLOB_ARTICLE_SUBJECTIVITY))
    POLARITY_VADER.append(st.mean(VADER_ARTICLE_COMPOUND))
    TEXTBLOB_FULL_ARTICLE.append(TextBlob(news).sentiment.polarity)
    POLARITY_ARTICLE.append(VADER_ANALYZER.polarity_scores(news)["compound"])
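
# Illustration (toy input; scores indicative only): the sentence-level mean and
# the full-text compound can disagree on mixed articles, which is why both are
# kept as separate columns below:
#
#     mixed = "Good quarter. Terrible outlook."
#     parts = [VADER_ANALYZER.polarity_scores(s)["compound"]
#              for s in SENTENCE_TOKENIZER.tokenize(mixed)]
#     st.mean(parts)                                     # sentence-level mean
#     VADER_ANALYZER.polarity_scores(mixed)["compound"]  # full-text score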
df["Polarity TextBlob"] = pd.Series(POLARITY_TEXTBLOB, index=df.index)
df["Polarity TextBlob Full Article"] = pd.Series(TEXTBLOB_FULL_ARTICLE, index=df.index)
df["Subjectivity TextBlob"] = pd.Series(SUBJECTIVITY, index=df.index)
df["Polarity Vader"] = pd.Series(POLARITY_VADER, index=df.index)
df["Vader article"] = pd.Series(POLARITY_ARTICLE, index=df.index)
FILENAME = "C:/Users/varun.londhe/Documents/Python Practice/Bloomberg/"
if not os.path.exists(os.path.dirname(FILENAME)):
    try:
        os.makedirs(os.path.dirname(FILENAME))
    except OSError as exc:  # guard against a race with another process creating it
        if exc.errno != errno.EEXIST:
            raise
# Drop repeats: keep the first row per URL, then the last row per article body.
df = df.drop_duplicates(['URL'], keep='first')
df = df.drop_duplicates(['Content'], keep='last')
df.to_csv(FILENAME + "TimeStampModified.csv", encoding='utf-8-sig', index=False)