def sa_hiv4(txt):
    hiv4 = ps.HIV4()
    tokens = hiv4.tokenize(txt)  # the text can be tokenized in other ways,
    # but the HIV4 dictionary is preprocessed
    # with the library's default tokenizer, so use it for consistency
    score = hiv4.get_score(tokens)
    print(f"{txt[:50]}...{txt[-50:]}\n{score}")
    return score
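The comment above is the reason to score with the library's own tokenizer: the HIV4 word lists are stored in the form that tokenizer produces. A minimal usage sketch (the sample sentence is made up; get_score returns a dict with Positive, Negative, Polarity and Subjectivity entries):

import pysentiment as ps

sample = "The company reported strong growth but warned of weaker margins ahead."
result = sa_hiv4(sample)          # prints the text excerpt and the full score dict
print(result['Polarity'], result['Subjectivity'])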
Example #2
def get_score_HIV4(html):
    """
    Uses the HIV4 dictionary for sentiment analysis.
    """
    hiv4 = ps.HIV4()
    tokens = hiv4.tokenize(html)
    score = hiv4.get_score(tokens)
    return score
Example #3
    def mapper_init(self):
        sys.stderr.write("Initializing module...\n")
        self.hiv4 = ps.HIV4()
        sys.stderr.write("--> Initialized sentimentor\n")

        # Get S3 connection
        conn = boto.connect_s3(AWS_ACCESS, AWS_SECRET)
        self.bucket = conn.get_bucket(BUCKET)
        sys.stderr.write("--> Created AWS connection\n")
Example #4
def get_twitter_sentiment(ticker, company, auth_keys):
    # fetch OAuth tokens from config.ini so they stay out of the source

    ckey = auth_keys.get('ckey')
    csecret = auth_keys.get('csecret')
    atoken = auth_keys.get('atoken')
    asecret = auth_keys.get('asecret')

    auth = tweepy.OAuthHandler(ckey, csecret)
    auth.set_access_token(atoken, asecret)
    api = tweepy.API(auth)

    hiv4 = ps.HIV4()
    ss = '!!'  # marker prepended to each tweet so a leading 'RT ' can be stripped
    t = ""     # accumulator for the cleaned tweet text
    s = ''
    d = datetime.today()
    d7 = d - timedelta(days=7)
    d = d.strftime("%Y-%m-%d")
    d7 = d7.strftime("%Y-%m-%d")
    try:
        for tweet in tweepy.Cursor(api.search,
                                   q=company,
                                   since=str(d7),
                                   until=str(d),
                                   lang="en").items():
            s = ss + tweet.text
            # cleaning the tweets
            s = s.replace(ss + 'RT ', '')
            result = re.sub(r"http\S+", "", s)
            # http  matches literal characters
            # \S+ matches all non-whitespace characters (the end of the url)

            t = t + result + '\n'
    except Exception as e:
        time.sleep(10)

        try:
            for tweet in tweepy.Cursor(api.search,
                                       q=ticker,
                                       since=d7,
                                       until=d,
                                       lang="en").items():
                s = ss + tweet.text
                # cleaning the tweets
                s = s.replace(ss + 'RT ', '')
                result = re.sub(r"http\S+", "", s)
                # http  matches literal characters
                # \S+ matches all non-whitespace characters (the end of the url)

                t = t + result + '\n'
        except Exception as e:
            time.sleep(10)

    tokens = hiv4.tokenize(t)
    score = hiv4.get_score(tokens)
    return score
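The cleaning comments above describe stripping the retweet prefix and any URLs before scoring. A minimal sketch of that same cleanup on a made-up tweet:

import re

raw = "RT Great quarter for $MSFT! Details: https://example.com/report"
tweet = '!!' + raw
tweet = tweet.replace('!!' + 'RT ', '')   # drop the leading retweet marker
tweet = re.sub(r"http\S+", "", tweet)     # drop URLs (http plus all non-whitespace)
print(tweet)  # "Great quarter for $MSFT! Details: "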
Example #5
    def getallsentiment(self, tweets):
        analyzer = ps.HIV4()  # build the analyzer once rather than per tweet
        scores = []
        for tweet in tweets:
            tokens = analyzer.tokenize(tweet.text)
            score = analyzer.get_score(tokens)
            if score['Polarity'] != 0:  # keep only non-neutral tweets
                scores += [score]
        return scores
Example #6
def RunSentimentAnalysis():

    hiv4 = ps.HIV4()

    db = Database()

    sql = u"select id from property;"

    propertyIds = db.runSqlQueryColumn(sql)

    for propertyId in propertyIds:
        property = Property(db, propertyId)
        property.analyseSentiment(hiv4)
Example #7
def get_news_sentiment(ticker):
    ticker = ticker.upper()
    hiv4 = ps.HIV4()
    # -- Setup
    options = Options()
    options.add_argument("--headless")
    browser = webdriver.Firefox(firefox_options=options)
    # -- Parse
    #browser.get("https://seekingalpha.com/symbol/" + ticker + "/analysis-and-news?analysis_tab=focus&news_tab=news-all")
    browser.get("https://simulationstock.000webhostapp.com/MSFT.html")
    soup = BeautifulSoup(browser.page_source, "html5lib")
    x = ''
    for div_tag in soup.find_all(
            'div', attrs={"class": "mc_list_texting right bullets"}, limit=7):
        for span_tag in div_tag.find_all(
                'span', attrs={"class": "general_summary light_text bullets"}):
            x = x + span_tag.text
    print(x)
    tokens = hiv4.tokenize(x)
    score = hiv4.get_score(tokens)
    browser.close()
    return score
Example #8
dictshow = SequenceSelection(dictionary=dict2, length=7, startindex=0)

# Plot most frequent words
n = range(len(dictshow))
plt.bar(n, dictshow.values(), align='center')
plt.xticks(n, dictshow.keys())
plt.title("Most frequent Words")
plt.savefig("FrequentWords.png", transparent=True)

# Overview
overview = SequenceSelection(dictionary=dict2, length=400, startindex=0)
nOverview = range(len(overview.keys()))
plt.bar(nOverview, overview.values(), color="g", tick_label="")
plt.title("Word Frequency Overview")
plt.xticks([])
plt.savefig("overview.png", transparent=True)

# Sentiment Analysis
hiv4 = ps.HIV4()
tokens = hiv4.tokenize(cleantext)
score = hiv4.get_score(tokens)
print(score)

# Polarity
# Formula: (Positive - Negative) / (Positive + Negative)

# Subjectivity
# Formula: (Positive + Negative) / N, where N is the total number of tokens
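The two formulas above can be checked by hand from the counts in the score dict. A minimal sketch with made-up counts (the epsilon only guards this sketch against division by zero):

positive, negative, n_tokens = 12, 8, 200  # hypothetical counts
eps = 1e-6
polarity = (positive - negative) / (positive + negative + eps)
subjectivity = (positive + negative) / (n_tokens + eps)
print(round(polarity, 3), round(subjectivity, 3))  # ~0.2 and ~0.1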
Example #9
def sentiment(text):
    hiv4 = ps.HIV4()
    tokens = hiv4.tokenize(text)  # get_score expects a token list, not a raw string
    score = hiv4.get_score(tokens)
    return score
Example #10
    def mapper_init(self):
        sys.stderr.write("Initializing module...\n")
        self.hiv4 = ps.HIV4()
	sys.stderr.write("--> Initialized sentimentor\n")
Example #11
    def func():
        feed = feedparser.parse(
            "https://news.google.com/news/rss/headlines/section/topic/BUSINESS.en_pk/Business?ned=en_pk&hl=en&gl=PK"
        )
        # feed = feedparser.parse("http://feeds.feedburner.com/com/Yeor")
        wnl = WordNetLemmatizer()
        # feed_title = feed['feed']['title']
        # feed_entries = feed.entries

        for entry in feed.entries:
            article_title = (entry.title).encode("utf-8")
            article_link = (entry.link).encode("utf-8")
            article_published_at = entry.published
            article_title = article_title.lower()
            print article_title, article_link, article_published_at

            #### Title processing
            # title = [wnl.lemmatize(i, j[0].lower()) if j[0].lower() in ['a', 'n', 'v'] else wnl.lemmatize(i) for i, j in
            #  pos_tag(word_tokenize(article_title))]
            #
            # no_unicode = []
            #
            # for items in title:
            #     no_unicode.append(items.encode("utf-8"))
            # for words in title:
            #      no_unicode += " " + words.encode("utf-8")

            # print "Lemmatized"
            # print no_unicode

            if article_title not in titles:
                # titles.append(article_title)
                # urls.append(article_link)
                titles.append(article_title)
                titles_urls.append(tuple([article_title, article_link]))
                extractor = Goose()
                article = extractor.extract(url=article_link)
                text = article.cleaned_text
                text = text.lower()
                stop_words = set(stopwords.words('english'))

                word_tokens = word_tokenize(text)

                filtered_sentence = [
                    w for w in word_tokens if not w in stop_words
                ]

                for words in filtered_sentence:
                    # filtered_sentence = words.encode('ascii', 'ignore')
                    filtered_sentence = words.encode("utf-8")

                filtered_sentence = str(filtered_sentence)

                for w in word_tokens:
                    if w not in stop_words:
                        # filtered_sentence.append(w)
                        filtered_sentence = filtered_sentence.encode("utf-8")
                        filtered_sentence += " " + w

                # words = set(nltk.corpus.words.words())
                filtered_sentence = re.sub(ur"[^\w\d\s]+", '',
                                           filtered_sentence)

                print "Non-lemmatized News:"
                print filtered_sentence

                filtered_sentence = [
                    wnl.lemmatize(i, j[0].lower())
                    if j[0].lower() in ['a', 'n', 'v'] else wnl.lemmatize(i)
                    for i, j in pos_tag(word_tokenize(filtered_sentence))
                ]

                no_unicode = []

                for items in filtered_sentence:
                    no_unicode.append(items.encode("utf-8"))
                # for words in filtered_sentence:
                #      no_unicode += " " + words.encode("utf-8")

                hiv4 = ps.HIV4()
                # words = set(nltk.corpus.words.words())
                # all_clean = filtered_sentence
                all_clean = " ".join(x for x in no_unicode)
                # cleaned = " ".join(w for w in nltk.wordpunct_tokenize(all_clean) \
                #                   if w.lower() in words or not w.isalpha())

                # print all_clean

                general = [
                    " kse ", " psx ", "pakistan stock exchange",
                    "karachi stock exchange", "pakistan stock market"
                ]
                keywords_pso = [" pso ", "pakistan state oil"]
                keywords_engro = [" engro ", "engro fertilizer"]
                keywords_hbl = [" hbl ", "habib bank"]
                keywords_ubl = [" ubl ", "united bank"]
                keywords_ogdcl = [
                    " ogdc ", " ogdcl ", "oil & gas development company",
                    "oil and gas development company"
                ]

                flag = 0
                # relevant = []

                for words in general:
                    if words in all_clean:
                        flag = 1
                        # relevant.append(article_title)

                for words in keywords_pso:
                    if words in all_clean:
                        flag = 2
                        # relevant.append(article_title)

                for words in keywords_hbl:
                    if words in all_clean:
                        flag = 3
                        # relevant.append(article_title)

                for words in keywords_ogdcl:
                    if words in all_clean:
                        flag = 4
                        # relevant.append(article_title)

                for words in keywords_ubl:
                    if words in all_clean:
                        flag = 5

                for words in keywords_engro:
                    if words in all_clean:
                        flag = 6
                        # relevant.append(article_title)

                if flag == 1:
                    print "General Stock Market News"
                    score = hiv4.get_score(all_clean.split())
                    print "Cleaned News Below:"
                    print all_clean
                    print "Score Below:"
                    print score

                    scores[article_title] = score['Polarity']

                if flag == 2:
                    print "News related to PSO"
                    score = hiv4.get_score(all_clean.split())
                    print "Cleaned News Below:"
                    print all_clean
                    print "Score Below:"
                    print score

                    scores[article_title] = score['Polarity']

                if flag == 3:
                    print "News related to HBL"
                    score = hiv4.get_score(all_clean.split())
                    print "Cleaned News Below:"
                    print all_clean
                    print "Score Below:"
                    print score

                    scores[article_title] = score['Polarity']

                if flag == 4:
                    print "News related to OGDCL"
                    score = hiv4.get_score(all_clean.split())
                    print "Cleaned News Below:"
                    print all_clean
                    print "Score Below:"
                    print score

                    scores[article_title] = score['Polarity']

                if flag == 5:
                    print "News related to UBL"
                    score = hiv4.get_score(all_clean.split())
                    print "Cleaned News Below:"
                    print all_clean
                    print "Score Below:"
                    print score

                    scores[article_title] = score['Polarity']

                if flag == 6:
                    print "News related to ENGRO"
                    score = hiv4.get_score(all_clean.split())
                    print "Cleaned and Lemmatized News Below:"
                    print all_clean
                    print "Score Dictionary Below:"
                    print score

                    scores[article_title] = score['Polarity']

                if flag == 0:
                    print "News was irrelevant!"

                print titles
                print scores
                print "\n"

            else:
                print "Already Processed!\n"

        print "Exiting...\n"
Example #12
# New Method

# In[16]:

import pysentiment as ps
from collections import Counter
import re
import random
from operator import truediv
import matplotlib.pyplot as plt
import operator
import pandas as pd
import datetime
#lm = ps.LM()
lm = ps.HIV4()


def generateDate(initialYear, initialMonth, initialDay, endY, endM, endD):
    initial = datetime.datetime(initialYear, initialMonth, initialDay)
    duration = (datetime.datetime(endY, endM, endD) - initial).days
    dates = [initial]
    for i in range(1, duration + 1):
        dates.append(initial + datetime.timedelta(days=i))
    return dates
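A quick usage sketch of generateDate with made-up endpoints; it returns one datetime per day, inclusive of both ends:

dates = generateDate(2020, 1, 1, 2020, 1, 5)
print(len(dates))            # 5
print(dates[0], dates[-1])   # 2020-01-01 00:00:00 2020-01-05 00:00:00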

Example #13
    def __init__(self):
        self.stocks = {}
        self.sectors = {}
        self.user_classification = {}
        self.dict = ps.HIV4()
Example #14
def sentiment(text):
    hiv4 = ps.HIV4()
    tokens = hiv4.tokenize(text.decode('utf-8'))
    score = hiv4.get_score(tokens)
    print(score)
Example #15
    def do_sentiment_scores(self, text):
        hiv4 = pysentiment.HIV4()
        tokens = hiv4.tokenize(text)
        score = hiv4.get_score(tokens)
        return (tokens, score)