def main(csv_file, txt_file, amount, name):
    with open(txt_file) as input_file:
        data = pd.read_csv(csv_file)
        rand_choices = random_sample(len(data), amount)
        new_df = data.loc[[i for i in rand_choices]]
        df = pd.DataFrame(new_df[new_df.columns[1:5]])
        df["vader_score_neg"] = ""
        df["vader_score_neu"] = ""
        df["vader_score_pos"] = ""
        df["vader_score_compound"] = ""
        start = [item for item in data[data.columns[2]]]
        end = [item for item in data[data.columns[3]]]
        articles = []
        container = list(input_file)
        for i, j in zip(start, end):
            articles.append(''.join(container[i:j]))
        for i in df.index.values:
            scores = vaderSentiment.sentiment(articles[i])
            df.loc[i, 'vader_score_neg'] = scores['neg']
            df.loc[i, 'vader_score_neu'] = scores['neu']
            df.loc[i, 'vader_score_pos'] = scores['pos']
            df.loc[i, 'vader_score_compound'] = scores['compound']
        df.to_csv(name + '_sentiment.csv')
def get_bigram_sentiment_distribution(bigrams):
    """
	Send the number of ([positive, neutral, negative], [positive, neutral, negative]) sentiment bigrams
	"""
    from vaderSentiment.vaderSentiment import sentiment
    from operator import itemgetter

    pos_pos = 0
    pos_neu = 0
    pos_neg = 0
    neu_pos = 0
    neu_neu = 0
    neu_neg = 0
    neg_pos = 0
    neg_neu = 0
    neg_neg = 0

    for tup in bigrams:
        word_one, word_two = tup
        vs_one = sentiment(word_one)
        del vs_one['compound']
        sent_one = sorted(vs_one.items(), key=itemgetter(1),
                          reverse=True)[0][0]
        vs_two = sentiment(word_two)
        del vs_two['compound']
        sent_two = sorted(vs_two.items(), key=itemgetter(1),
                          reverse=True)[0][0]

        if sent_one == 'pos':
            if sent_two == 'pos':
                pos_pos += 1
            elif sent_two == 'neu':
                pos_neu += 1
            else:
                pos_neg += 1
        elif sent_one == 'neu':
            if sent_two == 'pos':
                neu_pos += 1
            elif sent_two == 'neu':
                neu_neu += 1
            else:
                neu_neg += 1
        else:
            if sent_two == 'pos':
                neg_pos += 1
            elif sent_two == 'neu':
                neg_neu += 1
            else:
                neg_neg += 1

    return (pos_pos, pos_neu, pos_neg, neu_pos, neu_neu, neu_neg, neg_pos,
            neg_neu, neg_neg)
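
# --- Usage sketch (added for illustration, not from the original project) ---
# A minimal, hypothetical call to get_bigram_sentiment_distribution above; the
# sample bigrams are made up and the legacy module-level vaderSentiment
# `sentiment()` API (imported inside the function) is assumed to be installed.
bigrams = [("good", "movie"), ("terrible", "plot"), ("fine", "acting")]
counts = get_bigram_sentiment_distribution(bigrams)
labels = ["pos_pos", "pos_neu", "pos_neg", "neu_pos", "neu_neu",
          "neu_neg", "neg_pos", "neg_neu", "neg_neg"]
print(dict(zip(labels, counts)))  # count of bigrams for each sentiment pairing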
Example No. 3
def add_sentiment_comment_features():
    """
	"""
    from pymongo import MongoClient
    from bson.objectid import ObjectId
    from tqdm import tqdm
    from vaderSentiment.vaderSentiment import sentiment
    from RMP_words_funk import strip_punctuation
    from pdb import set_trace

    set_trace()

    rmpdb = MongoClient('mongodb://localhost:27017')['rmpdb']
    ds_profs_ten_over_cur = rmpdb['dataset_profs_five_over_less_ten'].find(
        {}, {'prof_id': 1}, no_cursor_timeout=True)

    for row in tqdm(ds_profs_ten_over_cur):
        id_string = row['prof_id']
        prof_comments = rmpdb['profs'].find_one({'_id': ObjectId(id_string)}, {
            '_id': 0,
            'all comments.rComments': 1
        })
        tokens = list()
        for comment in prof_comments['all comments']:
            tokens.extend(strip_punctuation(comment['rComments']).split())

        insertion_dict = {'44': 0, '45': 0, '46': 0, '47': 0}

        for tok in tokens:
            vs = sentiment(tok.encode('utf-8'))
            if vs['pos'] > vs['neg']:
                insertion_dict['44'] += 1
            elif vs['neg'] > vs['pos']:
                insertion_dict['45'] += 1
            else:
                continue

        for comment in prof_comments['all comments']:
            vs = sentiment(comment['rComments'].encode('utf-8'))
            if vs['pos'] > vs['neg']:
                insertion_dict['46'] += 1
            elif vs['neg'] > vs['pos']:
                insertion_dict['47'] += 1
            else:
                continue

        result = rmpdb['dataset_profs_five_over_less_ten'].update_one(
            {'_id': row['_id']}, {'$set': insertion_dict})
Example No. 4
def score():
    phrase = request.args.get('phrase')

    if phrase:
        return jsonify(sentiment(phrase))
    else:
        return jsonify({"error": "Phrase cannot be empty"})
Example No. 5
    def convert_to_rating(self, text):

        sentences = sent_tokenize(text)
        pos = 0
        neg = 0
        neu = 0

        #Finds number of positive, negative and neutral sentences
        for sentence in sentences:
            vs = sentiment(sentence)

            if vs['neg'] > 0.0 and vs['compound'] < 0.0:
                neg = neg + 1
            elif vs['neg'] > 0.0 and vs['compound'] > 0.0:
                if vs['pos'] > 0.0 and vs['neu'] > 0.0 and vs['compound'] > 0.5:
                    pos = pos + 1
                elif vs['pos'] == 0.0 and vs['neu'] > 0.0:
                    neu = neu + 1
            elif vs['neg'] == 0.0 and vs['compound'] > 0.0:
                if abs(vs['neu'] - vs['pos']) <= 0.3:
                    pos = pos + 1
                else:
                    neu = neu + 1
            else:
                neu = neu + 1

        #Obtains total number of positive and negative sentences
        total = pos + neg
        if total == 0:
            total = 1

        #Formula to convert text to rating on a scale of [1:5]
        rating = ((float(pos) / (total)) * 4.0) + 1

        return rating
def add_sentiment_comment_features():
	"""
	"""
	from pymongo import MongoClient
	from bson.objectid import ObjectId
	from tqdm import tqdm
	from vaderSentiment.vaderSentiment import sentiment
	from RMP_words_funk import strip_punctuation
	from pdb import set_trace

	set_trace()

	rmpdb = MongoClient('mongodb://localhost:27017')['rmpdb']
	ds_profs_ten_over_cur = rmpdb['dataset_profs_five_over_less_ten'].find({},{'prof_id' : 1}, no_cursor_timeout = True)

	for row in tqdm(ds_profs_ten_over_cur):
		id_string = row['prof_id']
		prof_comments = rmpdb['profs'].find_one({'_id' : ObjectId(id_string)}, {'_id' : 0, 'all comments.rComments' : 1})
		tokens = list()
		for comment in prof_comments['all comments']:
			tokens.extend(strip_punctuation(comment['rComments']).split())

		insertion_dict = {	'44' : 0,
							'45' : 0,
							'46' : 0,
							'47' : 0}

		for tok in tokens:
			vs = sentiment(tok.encode('utf-8'))
			if vs['pos'] > vs['neg']:
				insertion_dict['44'] += 1
			elif vs['neg'] > vs['pos']:
				insertion_dict['45'] += 1
			else:
				continue

		for comment in prof_comments['all comments']:
			vs = sentiment(comment['rComments'].encode('utf-8'))
			if vs['pos'] > vs['neg']:
				insertion_dict['46'] += 1
			elif vs['neg'] > vs['pos']:
				insertion_dict['47'] += 1
			else:
				continue
		
		result = rmpdb['dataset_profs_five_over_less_ten'].update_one({'_id' : row['_id']}, {'$set' :insertion_dict})
Example No. 7
def SaSentimentRSS(symbol):
    url = "http://seekingalpha.com/symbol/" + symbol + ".xml"
    url2 = "http://feeds.finance.yahoo.com/rss/2.0/headline?s=" + symbol + "&region=US&lang=en-US"
    url3 = "http://www.google.ca/finance/company_news?q=" + symbol + "&output=rss"
    # gets list of links from above RSS feed
    NewsURLs = getSaURL(url)
    NewsURLs += RSS_URL.getURLs2(url2)
    NewsURLs += RSS_URL.getURLs2(url3)

    # String to be written to file
    toBeWrittenToFile = ''

    for link in NewsURLs:
        try:
            # gets article portion of the htmltext
            a = Article(link)
            a.download()
            a.parse()

            # skip RSS title links, pages with no usable title, and pages that could not be accessed
            if symbol in a.title and not 'Earnings Call Webcast' in a.title and not 'Stock Market Insights' in a.title and not '400 Bad Request' in a.title and not '403 Forbidden' in a.title and a.title != '':
                UnicodeArticle = a.text
                StringArticle = UnicodeArticle.encode('ascii', 'ignore')
                StrippedArticle = StringArticle.replace('\n', '')

                # skip very short articles (fewer than ~200 characters)
                if len(StrippedArticle) > 200:

                    # remove ascii symbols
                    ArticleTitle = a.title.encode('ascii', 'ignore').replace(',', '')

                    # filters out irrelevant articles
                    if 'Transcript' not in ArticleTitle and 'Summary' not in ArticleTitle:

                        # writes sentiment from sentiment API to file
                        # locks this block so that only one thread can write to file at a time

                        # vader sentiment dictionary
                        s = vaderSentiment.sentiment(StrippedArticle)

                        # not writing articles with zero sentiments
                        # collect a string to be written to file
                        if s['compound'] != 0:
                            # print ArticleTitle
                            toBeWrittenToFile += (
                                str(symbol) + ',' + str(s['neg']) + ',' + str(s['neu']) + ',' + str(s['pos']) + ',' + str(
                                    s['compound']) + ',' + ArticleTitle + ',' + str(link) + '\n')

        except Exception as ex:
            template = "An exception of type {0} occured. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print message
    # write variable to file
    lock.acquire()
    try:
        myfile.write(toBeWrittenToFile)
    finally:
        lock.release()
Example No. 8
def checkText(text):
    analyse = vaderSentiment.sentiment(text)
    compound = analyse["compound"]
    
    if compound < -0.5:
        return -1
    elif compound > 0.5:
        return 1
    else:
        return 0
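
# --- Usage sketch (added for illustration, not from the original project) ---
# Hypothetical inputs for checkText above; it assumes the legacy module is
# imported as `vaderSentiment` (as the snippet does). Exact scores depend on
# the VADER lexicon, so the labels in the comments are indicative only.
print(checkText("I absolutely love this, it is wonderful!"))  # likely  1
print(checkText("This was horrible and a complete waste."))   # likely -1
print(checkText("The package arrived on Tuesday."))           # likely  0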
Example No. 9
def topNTweets(tweets, n, posThresh):
    posScores = np.zeros(len(tweets) + 1)
    negScores = np.zeros(len(tweets) + 1)
    tweetTxt = [0] * (len(tweets) + 1)
    i = 0

    for t in tweets:
        posScores[i] = sentiment(tweets[i].id.encode("utf-8"))['pos']
        negScores[i] = sentiment(tweets[i].id.encode("utf-8"))['neg']
        tweetTxt[i] = tweets[i].id
        i += 1

    posScores = posScores[posScores > posThresh]
    negScores = negScores[negScores < 0.2]

    topNIndices = np.argsort(posScores)[::-1][:n]
    #    topNIndices = np.argsort(negScores)[0:n-1]

    topN = np.array(tweetTxt)[topNIndices]

    return topN
Example No. 10
def SentimentRSS(symbol):

    url = "http://feeds.finance.yahoo.com/rss/2.0/headline?s=" + symbol + "&region=US&lang=en-US"
    #gets list of links from above RSS feed
    NewsURLs = RSS_URL.getURLs2(url)

    #String to be written to file
    toBeWrittenToFile = ''

    for link in NewsURLs:
        try:
            #gets article portion of the htmltext
            a = Article(link)
            a.download()
            a.parse()

            #skip RSS title links, pages with no usable title, and pages that could not be accessed
            if not 'Stock - Yahoo! Finance' in a.title and not '400 Bad Request' in a.title and not '403 Forbidden' in a.title and a.title != '':
                UnicodeArticle = a.text
                StringArticle = UnicodeArticle.encode('ascii', 'ignore')
                StrippedArticle = StringArticle.replace('\n', '')

                #remove ascii symbols
                ArticleTitle = a.title.encode('ascii',
                                              'ignore').replace(',', '')

                #writes sentiment from sentiment API to file
                #locks this block so that only one thread can write to file at a time

                #vader sentiment dictionary
                s = vaderSentiment.sentiment(StrippedArticle)

                #not writing articles with zero sentiments
                #collect a string to be written to file
                if s['compound'] != 0:
                    print ArticleTitle
                    toBeWrittenToFile += (str(symbol) + ',' + str(s['neg']) +
                                          ',' + str(s['neu']) + ',' +
                                          str(s['pos']) + ',' +
                                          str(s['compound']) + ',' +
                                          ArticleTitle + ',' + str(link) +
                                          '\n')

        except Exception as ex:
            template = "An exception of type {0} occured. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print message

    lock.acquire()
    try:
        myfile.write(toBeWrittenToFile)
    finally:
        lock.release()
def topNTweets(tweets, n, posThresh):
    posScores = np.zeros(len(tweets)+1)
    negScores = np.zeros(len(tweets)+1)
    tweetTxt = [0]*(len(tweets)+1)
    i = 0
    
    for t in tweets:
        posScores[i] = sentiment(tweets[i].id.encode("utf-8"))['pos']
        negScores[i] = sentiment(tweets[i].id.encode("utf-8"))['neg']
        tweetTxt[i] = tweets[i].id
        i += 1
    
    posScores = posScores[posScores>posThresh]
    negScores = negScores[negScores<0.2]

    topNIndices = np.argsort(posScores)[::-1][:n]
#    topNIndices = np.argsort(negScores)[0:n-1]
    
    topN = np.array(tweetTxt)[topNIndices]
    
    return topN
Example No. 12
def jsondata(sentence):
    #figure out if sentence contains but
    sentlist = sentence.split('but')
    return {
        'sentence':
        sentence,
        'sentiment':
        processSentiment(sentiment(sentence)),
        'data': [{
            'rel':
            'nmod',
            'phrase': [{
                'value':
                t,
                'sentiment':
                processSentiment(sentiment(t)),
                'index': [0, 10],
                'keyword':
                processKeyword(map(lambda x: x[0], rakeobj.run(t)))
            } for t in sentlist]
        }]
    }
def main(csv_file, txt_file, amount, name):
	with open(txt_file) as input_file:
		data = pd.read_csv(csv_file)
		rand_choices = random_sample(len(data), amount)
		new_df = data.loc[[i for i in rand_choices]]
		df = pd.DataFrame(new_df[new_df.columns[1:5]])
		df["vader_score_neg"] = ""
		df["vader_score_neu"] = ""
		df["vader_score_pos"] = ""
		df["vader_score_compound"] = ""
		start = [item for item in data[data.columns[2]]]
		end = [item for item in data[data.columns[3]]]	
		articles = []
		container = list(input_file)
		for i,j in zip(start,end):
			articles.append(''.join(container[i:j]))
		for i in df.index.values:
			scores = vaderSentiment.sentiment(articles[i])
			df.loc[i, 'vader_score_neg'] = scores['neg']
			df.loc[i, 'vader_score_neu'] = scores['neu']
			df.loc[i, 'vader_score_pos'] = scores['pos']
			df.loc[i, 'vader_score_compound'] = scores['compound']
		df.to_csv(name+'_sentiment.csv')
def gen_output(data, json_data_dir):

    term, is_reply, tweets_needed = data

    dataset = []

    # get all user files
    files = glob.glob(os.path.join(json_data_dir, "*"))
    random.shuffle(files)

    for f in files:
        user = TwitterUser()
        user.populate_tweets_from_file(f,
                                       store_json=True,
                                       do_arabic_stemming=False,
                                       lemmatize=False)

        if 50 <= user.n_total_tweets <= 10000 and\
           user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

            tweet_set = [t for t in user.tweets if t.retweeted is None and\
                           len(t.urls) == 0 and 'http:' not in t.text and\
                           len(t.tokens) > 5 and\
                           t.created_at >= MIN_TWEET_DATE and\
                           (term == '' or term in t.tokens) and\
                           langid.classify(t.text)[0] == 'en' and\
                           sentiment(t.text)['compound'] != 0]

            if is_reply:
                tweet_set = [t for t in tweet_set if t.reply_to]
            else:
                tweet_set = [t for t in tweet_set if not t.reply_to]

            if len(tweet_set) == 0:
                print 'size 0', term, tweets_needed, is_reply
                continue

            tweet = random.sample(tweet_set, 1)[0]
            print user.screen_name, term, tweets_needed, is_reply, "::::  ", tweet.text
            dataset.append(tweet)
            tweets_needed -= 1
            if tweets_needed == 0:
                name = term if term != '' else 'random'
                name += '_reply' if is_reply else '_non_reply'
                pickle.dump(dataset, open(name + ".p", 'wb'))
                print 'done with: ', name, is_reply
                return

        else:
            print 'failed user'
def scrape_tweets(api, term):
    uppers = set(string.ascii_uppercase)

    all_tweets = tweepy.Cursor(api.search, q=term, count=100,
                               lang='en').items(17900)

    tweets_table = []
    entities_table = []

    for tweet in all_tweets:
        tweet_info = (
            tweet.id,
            tweet.text,
            str(tweet.created_at),
            tweet.place.country if tweet.place else None,
            tweet.favorite_count,
            tweet.retweet_count,
            term,
            sentiment(
                tweet.text.
                replace(  # don't let vaderSentiment see non-neutral words in titles
                    'tsuki no', 'tsuki').replace('TSUKI NO', 'TSUKI').replace(
                        'tsuki No', 'tsuki').replace('Death ', '').replace(
                            'death parade', 'parade').replace(
                                'DEATH PARADE', 'PARADE').replace(
                                    'Assassination ',
                                    '').replace('assassination classroom',
                                                'classroom').replace(
                                                    'ASSASSINATION CLASSROOM',
                                                    'CLASSROOM').
                replace('Cute ', '').replace('cute high', 'high').replace(
                    'CUTE HIGH', 'HIGH').replace('Club LOVE', 'Club').replace(
                        'Club Love', 'Club').replace(
                            'club love', 'club').replace(
                                'CLUB LOVE',
                                'CLUB').encode('utf-8', 'ignore'))['compound'],
            int(  # Check For Jaden Smith-Style Capitalization
                all(x[0] in uppers for x in tweet.text.split()
                    if x[0].isalpha() and '/' not in x)),  # check if retweet
            int(tweet.text.startswith('RT @')))
        extra_info = (tweet.id, [
            x['text'] for x in tweet.entities['hashtags']
        ], [x['screen_name'] for x in tweet.entities['user_mentions']],
                      [x['expanded_url'] for x in tweet.entities['urls']])
        tweets_table.append(tweet_info)
        entities_table.append(extra_info)

    return tweets_table, entities_table
Example No. 16
 def on_data(self, data):
     try:
         data_clone = json.loads(data)
         save_data = {
             "created_at":   data_clone["created_at"],
             "text":         data_clone["text"],
             "user":         data_clone["user"]["screen_name"],
             "followers":    data_clone["user"]["followers_count"],
         }
         print "Created at: %s by @%s\n%s\n" % (save_data["created_at"], save_data["user"], save_data["text"])
         save_data["score"] = sentiment(save_data["text"].encode('utf-8'))
         for line in save_data["score"]:
             print line, save_data["score"][line]
         print ""
     except Exception, e:
         print "Error: %s.\n" % e
Example No. 17
def gen_output(data, json_data_dir):

    term,is_reply,tweets_needed = data

    dataset = []

    # get all user files
    files = glob.glob(os.path.join(json_data_dir,"*"))
    random.shuffle(files)

    for f in files:
        user = TwitterUser()
        user.populate_tweets_from_file(f,store_json=True,do_arabic_stemming=False,lemmatize=False)

        if 50 <= user.n_total_tweets <= 10000 and\
           user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

            tweet_set = [t for t in user.tweets if t.retweeted is None and\
                           len(t.urls) == 0 and 'http:' not in t.text and\
                           len(t.tokens) > 5 and\
                           t.created_at >= MIN_TWEET_DATE and\
                           (term == '' or term in t.tokens) and\
                           langid.classify(t.text)[0] == 'en' and\
                           sentiment(t.text)['compound'] != 0]

            if is_reply:
                tweet_set = [t for t in tweet_set if t.reply_to]
            else:
                tweet_set = [t for t in tweet_set if not t.reply_to]

            if len(tweet_set) == 0:
                print 'size 0', term, tweets_needed, is_reply
                continue

            tweet = random.sample(tweet_set, 1)[0]
            print user.screen_name, term, tweets_needed, is_reply, "::::  ", tweet.text
            dataset.append(tweet)
            tweets_needed -= 1
            if tweets_needed == 0:
                name = term if term != '' else 'random'
                name += '_reply' if is_reply else '_non_reply'
                pickle.dump(dataset,open(name+".p",'wb'))
                print 'done with: ',name, is_reply
                return

        else:
            print 'failed user'
Example No. 18
 def on_data(self, data):
     try:
         data_clone = json.loads(data)
         save_data = {
             "created_at": data_clone["created_at"],
             "text": data_clone["text"],
             "user": data_clone["user"]["screen_name"],
             "followers": data_clone["user"]["followers_count"],
         }
         print "Created at: %s by @%s\n%s\n" % (
             save_data["created_at"], save_data["user"], save_data["text"])
         save_data["score"] = sentiment(save_data["text"].encode('utf-8'))
         for line in save_data["score"]:
             print line, save_data["score"][line]
         print ""
     except Exception, e:
         print "Error: %s.\n" % e
Example No. 19
def get_reviews_by_page(review_link_href, page, reviews, sentiments):

    all_reviews = ''
    overall_sentiment = 0
    review_link_href = review_link_href + '&pageNumber=' + str(page)
    index = (page - 1) * 10

    review_page_data = BeautifulSoup(get_page_by_url(review_link_href), 'html.parser')

    review_container_el = review_page_data.find("div", {"id": "cm_cr-review_list"})
    if not(review_container_el is None):

        for reviews_el in review_container_el.find_all("div", {"class": "review"}):
            if not(reviews_el is None):

                reviews.append({})

                review_data_el = reviews_el.find("span", {"class": "review-text"})
                review_rating_el = reviews_el.find("i", {"class": "review-rating"}).find("span")

                if not(review_data_el is None) and not(review_rating_el is None):

                    reviews_text = get_text(review_data_el)
                    reviews[index]['text'] = reviews_text
                    all_reviews += reviews_text
                    review_sentiment = sentiment(reviews_text)
                    reviews[index]['sentiment'] = review_sentiment

                    review_rating = get_text(review_rating_el)
                    reviews[index]['review_rating'] = review_rating
                    overall_sentiment += review_sentiment['compound']

                    if '1.0' in review_rating:
                        sentiments[0]['data'].append([review_sentiment['compound'], 1.0])
                    if '2.0' in review_rating:
                        sentiments[1]['data'].append([review_sentiment['compound'], 2.0])
                    if '3.0' in review_rating:
                        sentiments[2]['data'].append([review_sentiment['compound'], 3.0])
                    if '4.0' in review_rating:
                        sentiments[3]['data'].append([review_sentiment['compound'], 4.0])
                    if '5.0' in review_rating:
                        sentiments[4]['data'].append([review_sentiment['compound'], 5.0])

                    index += 1

    return all_reviews, overall_sentiment
def topNTweets(tweets, n = 100, posThresh = 0.7):
    posScores = np.zeros(len(tweets)+1)
    negScores = np.zeros(len(tweets)+1)
    tweetID = [0]*(len(tweets)+1)
    i = 0
    for t in tweets:
        #if not t.in_reply_to_status_id:
            #continue
        sent = sentiment(t.text.encode("utf-8"))
        if sent['neu'] == 1.0:
            continue
        posScores[i] = sent['pos']/(sent['pos']+sent['neg'])
        tweetID[i] = t.id
        i += 1

    posScores = posScores[posScores>posThresh]
    topNIndices = np.argsort(posScores)[::-1][:n]
    topN = np.array(tweetID)[topNIndices]
    return topN
Example No. 21
def prepare_news_data_vader(news):
    '''
    Get sentiment scores for all 25 headlines on each day
    Input: raw news table
    Output: rows = # unique dates, cols = 25 (one for each headline)
    '''

    news = news.copy()

    #sort by date and assign top id to each item
    news['News'] = news['News'].apply(lambda x: x.lower().replace(
        ',', '').replace('.', ''))  #convert to lower case and some trimming
    news.sort('Date', ascending=True, inplace=True)
    news['sentiment_score'] = news['News'].apply(
        lambda x: sentiment(x)['compound'])

    #rank index for each day
    news_np = np.array(news)  #each row is [Date, news headline, score]
    news_np_score = []

    counter = 1
    current_date = -1
    for i in xrange(news_np.shape[0]):
        row = news_np[i]
        if row[0] != current_date:
            current_date = row[0]
            counter = 1
        else:
            counter += 1

        news_np_score.append(np.append(row, counter))

    news_score = pd.DataFrame(
        news_np_score, columns=['Date', 'News', 'SentimentScore', 'Rank'])
    news_score = pd.pivot_table(news_score,
                                values='SentimentScore',
                                index='Date',
                                columns='Rank')
    news_score = news_score.iloc[:, 0:25]  #two days have 50 articles
    news_score.columns = ['col_vader_{}'.format(i) for i in news_score.columns]

    return news_score
Example No. 22
def averageScore(listOfTweets):
    #Receives a list of tweets and returns the average sentiment for each
    averages = {'pos': 0, 'neu': 0, 'neg': 0}

    for tweet in listOfTweets:
        sent = sentiment(tweet.text.encode("utf-8"))
        averages['pos'] += sent['pos']
        averages['neu'] += sent['neu']
        averages['neg'] += sent['neg']

    averages['pos'] /= len(listOfTweets)
    averages['neu'] /= len(listOfTweets)
    averages['neg'] /= len(listOfTweets)

    pos_norm = averages['pos'] / (averages['pos'] + averages['neg'])
    neg_norm = averages['neg'] / (averages['pos'] + averages['neg'])

    result = [pos_norm, neg_norm]

    return result
def averageScore(listOfTweets):
    #Receives a list of tweets and returns the average sentiment for each
    averages = {'pos': 0, 'neu': 0, 'neg': 0}
    
    for tweet in listOfTweets:
        sent = sentiment(tweet.text.encode("utf-8"))
        averages['pos'] += sent['pos']
        averages['neu'] += sent['neu']
        averages['neg'] += sent['neg']
    
    averages['pos'] /= len(listOfTweets)
    averages['neu'] /= len(listOfTweets)
    averages['neg'] /= len(listOfTweets)
    
    pos_norm = averages['pos']/(averages['pos']+averages['neg'])
    neg_norm = averages['neg']/(averages['pos']+averages['neg'])
    
    result = [pos_norm, neg_norm]
    
    return result
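
# --- Usage sketch (added for illustration, not from the original project) ---
# Stand-in tweet objects for averageScore above (the real code passes tweepy
# statuses; only the .text attribute is used). Assumes Python 2 plus the
# legacy vaderSentiment `sentiment()` import, as in the snippet.
from collections import namedtuple

FakeTweet = namedtuple("FakeTweet", "text")
sample = [FakeTweet(u"great launch, very happy with it"),
          FakeTweet(u"awful service, never again"),
          FakeTweet(u"picked up the package today")]
pos_norm, neg_norm = averageScore(sample)
print(pos_norm)  # normalized positive share
print(neg_norm)  # normalized negative share; the two sum to 1.0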
Example No. 24
    def on_status(self, status):
        """ Append sentiment values of each tweet to a size-limited array. """

        try:

            now = time()

            parsed_tweet = {}
            ds = set(dir(status))
            parsed_tweet["is_rt"] = "retweeted_status" in ds
            parsed_tweet["timestamp"] = mktime(status.created_at.timetuple())
            parsed_tweet["sentiment"] = sentiment(
                status.text.encode('utf-8'))["compound"]
            parsed_tweet["text"] = status.text.encode('utf-8')
            parsed_tweet["related_links"] = [
                lnk["url"].replace("\\", "") + "||" +
                lnk["display_url"].replace("\\", "")
                for lnk in status.entities["urls"]
            ]

            if len(status.entities["hashtags"]) == 0:
                # handle case of tweet with no hashtags
                parsed_tweet["related_hashtags"] = []
                parsed_tweet["hashtag"] = "(No Hashtag)"
                tweet_array.append((now, copy.deepcopy(parsed_tweet)))
            else:
                # separate stream entry for each hashtag
                for ht in status.entities["hashtags"]:
                    parsed_tweet["related_hashtags"] = [
                        lnk["text"] for lnk in status.entities["hashtags"]
                        if lnk["text"] != ht["text"]
                    ]
                    parsed_tweet["hashtag"] = ht["text"]
                    tweet_array.append((now, copy.deepcopy(parsed_tweet)))

        except KeyboardInterrupt:
            return False

        except Exception, e:
            print "Exception!", type(e).__name__, e
print datasets_to_collect

for f in files:
    user = TwitterUser(filename_for_tweets=f)

    if user.n_total_tweets < 10000 and user.n_total_tweets > 50 and\
        user.followers_count < 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

        tweet_set = [t for t in user.tweets if t.retweeted is None and\
                                                len(t.urls) == 0 and\
                                                len(t.tokens) > 5 and\
                                                t.created_at <= MIN_TWEET_DATE and\
                                                curr_dataset[0] in t.tokens and\
                                                langid.classify(t.text)[0] == 'en' and\
                                                sentiment(t.text)['compound'] != 0]
        if len(tweet_set) == 0:
            continue

        tweet = random.sample(tweet_set, 1)[0]
        print user.screen_name, curr_dataset[0:2], "::::  ", tweet.text
        curr_dataset[2].append(tweet)
        curr_dataset[1] -= 1
        if curr_dataset[1] == 0:
            pickle.dump(curr_dataset[2], open(curr_dataset[0] + ".p", 'wb'))
            if len(datasets_to_collect) == 1:
                print 'DONE!!!'
                break
            datasets_to_collect = datasets_to_collect[1:]
            curr_dataset = datasets_to_collect[0]
Example No. 26
def add_comments_history_correlation_features():
    """
	"""
    from pymongo import MongoClient
    from tqdm import tqdm
    from datetime import datetime
    from RMP_metadata import interest_lookup
    from vaderSentiment.vaderSentiment import sentiment
    from scipy.stats import pearsonr
    from pdb import set_trace

    set_trace()

    rmpdb = MongoClient('mongodb://localhost:27017')['rmpdb']
    ds_cur = rmpdb['dataset_profs'].find({}, {'prof_id': 1},
                                         no_cursor_timeout=True)

    for row in tqdm(ds_cur):
        prof_id = row['prof_id']
        comments_cur = rmpdb['comments'].find({'prof_id': prof_id}, {
            '_id': 0,
            'rClarity': 1,
            'rEasy': 1,
            'rHelpful': 1,
            'rInterest': 1,
            'rComments': 1,
            'rDate': 1
        })
        comments = list()
        if comments_cur.count() > 2:
            for comment in comments_cur:
                comments.append(comment)

            comments_by_date = sorted(
                comments,
                key=lambda x: datetime.strptime(x['rDate'], '%m/%d/%Y'))

            help_list = list()
            clar_list = list()
            ease_list = list()
            interest_list = list()
            comment_positivity_list = list()
            comment_negativity_list = list()
            for comment in comments_by_date:
                help_list.append(float(comment['rHelpful']))
                clar_list.append(float(comment['rClarity']))
                ease_list.append(float(comment['rEasy']))
                if comment['rInterest'] in interest_lookup:
                    interest_list.append(interest_lookup[comment['rInterest']])
                sentiments = sentiment(comment['rComments'].encode('utf-8'))
                comment_positivity_list.append(sentiments['pos'])
                comment_negativity_list.append(sentiments['neg'])

            pparam1 = list()
            pparam2 = list()

            for i in range(1, len(help_list)):
                pparam1.append(float(sum(help_list[:i])) / len(help_list[:i]))
                pparam2.append(help_list[i])
            help_history_corr, help_p_val = pearsonr(pparam1, pparam2)

            pparam1 = list()
            pparam2 = list()

            for i in range(1, len(ease_list)):
                pparam1.append(float(sum(ease_list[:i])) / len(ease_list[:i]))
                pparam2.append(ease_list[i])
            ease_history_corr, ease_p_val = pearsonr(pparam1, pparam2)

            pparam1 = list()
            pparam2 = list()

            for i in range(1, len(clar_list)):
                pparam1.append(float(sum(clar_list[:i])) / len(clar_list[:i]))
                pparam2.append(clar_list[i])
            clar_history_corr, clar_p_val = pearsonr(pparam1, pparam2)

            pparam1 = list()
            pparam2 = list()

            for i in range(1, len(comment_positivity_list)):
                pparam1.append(
                    float(sum(comment_positivity_list[:i])) /
                    len(comment_positivity_list[:i]))
                pparam2.append(comment_positivity_list[i])
            comment_positivity_corr, comment_positivity_p_val = pearsonr(
                pparam1, pparam2)

            pparam1 = list()
            pparam2 = list()

            for i in range(1, len(comment_negativity_list)):
                pparam1.append(
                    float(sum(comment_negativity_list[:i])) /
                    len(comment_negativity_list[:i]))
                pparam2.append(comment_negativity_list[i])
            comment_negativity_corr, comment_negativity_p_val = pearsonr(
                pparam1, pparam2)

            insertion_dict = {
                '17': clar_history_corr,
                '18': ease_history_corr,
                '19': help_history_corr,
                '20': comment_positivity_corr,
                '21': comment_negativity_corr
            }

            if len(interest_list) > 2:

                pparam1 = list()
                pparam2 = list()

                for i in range(1, len(interest_list)):
                    pparam1.append(
                        float(sum(interest_list[:i])) / len(interest_list[:i]))
                    pparam2.append(interest_list[i])
                interest_history_corr, interest_p_val = pearsonr(
                    pparam1, pparam2)
                insertion_dict['22'] = interest_history_corr

            rmpdb['dataset_profs'].update_one({'_id': row['_id']},
                                              {'$set': insertion_dict})
Example No. 27
 def vaderSentiScore(self, doc):
     result = vs.sentiment(doc)
     return result
print datasets_to_collect

for f in files:
    user = TwitterUser(filename_for_tweets=f)

    if user.n_total_tweets < 10000 and user.n_total_tweets > 50 and\
        user.followers_count < 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

        tweet_set = [t for t in user.tweets if t.retweeted is None and\
                                                len(t.urls) == 0 and\
                                                len(t.tokens) > 5 and\
                                                t.created_at <= MIN_TWEET_DATE and\
                                                curr_dataset[0] in t.tokens and\
                                                langid.classify(t.text)[0] == 'en' and\
                                                sentiment(t.text)['compound'] != 0]
        if len(tweet_set) == 0:
            continue

        tweet = random.sample(tweet_set, 1)[0]
        print user.screen_name, curr_dataset[0:2], "::::  ", tweet.text
        curr_dataset[2].append(tweet)
        curr_dataset[1] -= 1
        if curr_dataset[1] == 0:
            pickle.dump(curr_dataset[2],open(curr_dataset[0]+".p",'wb'))
            if len(datasets_to_collect) == 1:
                print 'DONE!!!'
                break
            datasets_to_collect = datasets_to_collect[1:]
            curr_dataset = datasets_to_collect[0]
Example No. 29
def vader_sentiment(text):
	text = text.decode("ascii", errors="ignore")
	return vader.sentiment(text)["compound"]
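
# --- Usage sketch (added for illustration, not from the original project) ---
# Assumes Python 2 and that the legacy module is bound as `vader`, e.g. via
# `from vaderSentiment import vaderSentiment as vader`; the input string is
# hypothetical and any non-ASCII bytes are stripped before scoring.
print(vader_sentiment("Great talk, really enjoyed the demo!"))  # compound score in [-1, 1]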
Example No. 30
def sentiment_negative(tweet_text, language):
	if language is "en":
		return sentiment(tweet_text)['neg']
	else:
		return float(0)
Example No. 31
def sentiment_compound(tweet_text, language):
	if language is "en":
		return sentiment(tweet_text)['compound']
	else:
		return float(0)
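
# --- Usage sketch (added for illustration, not from the original project) ---
# Hypothetical calls to the two helpers above; assumes the legacy
# `from vaderSentiment.vaderSentiment import sentiment` import used throughout
# these examples. Non-English text short-circuits to 0.0 without being scored.
print(sentiment_negative("I hate waiting in line", "en"))  # 'neg' score in [0, 1]
print(sentiment_compound("I hate waiting in line", "en"))  # 'compound' score in [-1, 1]
print(sentiment_compound("J'adore ce film", "fr"))         # 0.0, language gate skips scoring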
Example No. 32
def getLexicalDiversity(words):
    numUnique = len(set(words))
    numTotal = len(words)
    return ((1.0 * numUnique) / numTotal)

for status in statuses:
    corpus = []
    for w in status['text'].split():
        w = removeUnicode(w)
        if w in skips: continue
        if 'http' in w: continue
        if '&amp;' in w: continue
        if '&gt;' in w: continue
        if 'RT' in w: continue
        corpus.append(w)
    unique = set(corpus)
    senti = sentiment(status['text'].encode('utf-8'))
    overall_sentiment += float(senti['compound'])
    print "User:"******"Favorite Count:", str(status['favorite_count'])
    print "Tweet:", removeUnicode(status['text'])
    print "Lexical Diversity: ", getLexicalDiversity(removeUnicode(status['text']))
    print "Retweet Count: ", status['retweet_count']
    print "Compound Sentiment:", senti['compound']
    print "Corpus:"
    for w in corpus: print '\t' + w
    print "Unique tokens:"
    for w in unique: print '\t' + w
    print "-----"

print "Sentiment Summation: %f" % overall_sentiment
pklfile = open('generated_files/restaurants.pkl', 'rb')
restaurants = pickle.load(pklfile)
pklfile.close()

pklfile = open('generated_files/reviews_user.pkl', 'rb')
reviews_user = pickle.load(pklfile)
pklfile.close()

pklfile = open('generated_files/users.pkl', 'rb')
users = pickle.load(pklfile)
pklfile.close()

snooty = []
allwords = []
mytest = reviews.values()[0]
for i in range(len(mytest)):
    review = mytest[i]['text']
    #    print review
    #    ptreview = plaintext(review['text']).encode('utf-8')
    # word list
    words = review.split()
    #    print words
    allwords += words
    snooty += [w for w in words if w in PROFANITY]

    # sentiment
    print review
    vs = sentiment(review)
    print str(vs), '\n', mytest[i]['stars'], '\n', mytest[i]['votes'], '\n'
Example No. 34
def scrape_ebay_site(url, config):

    # url = 'http://www.ebay.com/itm/Nikon-AF-S-DX-NIKKOR-55-200mm-f-4-5-6G-ED-VR-II-Lens-Factory-Refurbished-/
    # 311498162380?_trkparms=%26rpp_cid%3D5702b40de4b0826387589b2e%26rpp_icid%3D5702cf3fe4b079ecf2fa287f'

    r = requests.get(url)

    data = r.text

    soup = BeautifulSoup(data, 'html.parser')
    json_data = {}

    title_el = soup.find("h1", {"id": "itemTitle"})
    if not(title_el is None):
        json_data['title'] = title_el.text.strip('Details about')

    price_el = soup.find("span", {"id": "mm-saleDscPrc"}) or soup.find("span", {"id": "prcIsum"}) or \
        soup.find("span", {"id": "prcIsum_bidPrice"})
    if not (price_el is None):
        json_data['price'] = price_el.text.strip().lstrip('US ')

    savings_el = soup.find("span", {"id": "youSaveSTP"})
    if not (savings_el is None):
        json_data['savings'] = savings_el.text.strip()

    sold_quantity_el = soup.find("span", {"class": "w2b-sgl"})
    if not (sold_quantity_el is None):
        json_data['sold_quantity'] = sold_quantity_el.text.strip().rstrip(" sold")

    shipping_el = soup.find("span", {"id": "fshippingCost"})
    if not (shipping_el is None):
        json_data['shippingCost'] = shipping_el.text.strip()

    shipping_to_el = soup.find("div", {"class": "sh-sLoc"})
    if not (shipping_to_el is None):
        json_data['shippingTo'] = shipping_to_el.text.strip().lstrip("Shipping to: ")

    image_el = soup.find("img", {"id": "icImg"})
    if not(image_el is None):
        json_data['image'] = image_el.attrs["src"]

    brand_el = soup.find("h2", {"itemprop": "brand"})
    if not(brand_el is None):
        json_data['brand'] = brand_el.text

    json_data['merchant'] = {}

    merchant_el = soup.find("span", {"class": "mbg-nw"})
    if not (merchant_el is None):
        json_data['merchant']['name'] = merchant_el.text.title()

    merchant_sold_quantity = soup.find("span", {"class": "mbg-l"})
    if not (merchant_sold_quantity is None):
        json_data['merchant']['sold_quantity'] = merchant_sold_quantity.text.strip().lstrip('(').rstrip(')')

    merchant_feedback = soup.find("div", {"id": "si-fb"})
    if not (merchant_feedback is None):
        json_data['merchant']['feedback'] = merchant_feedback.text

    savings_el = soup.find("div", {"id": "mm-saleAmtSavedPrc"})
    if not (savings_el is None):
        json_data['savings'] = savings_el.text.strip()

    availability_el = soup.find("span", {"id": "qtySubTxt"})
    if not(availability_el is None):
        json_data['availability'] = availability_el.text.strip()

    return_in_el = soup.find("span", {"id": "vi-ret-accrd-txt"})
    if not (return_in_el is None):
        json_data['returnIn'] = return_in_el.text.strip()

    seller_fb_el = soup.find("div", {"id": "si-fb"})
    if not (seller_fb_el is None):
        json_data['sellerFeedback'] = seller_fb_el.text.strip()

    link_to_buy_el = soup.find("a", {"id": "binBtn_btn"})
    if not (link_to_buy_el is None):
        json_data['linkToBuy'] = link_to_buy_el.text.strip()

    payment_el_row = soup.find("div", {"id": "payDet1"})
    if not (payment_el_row is None):
        json_data['payment'] = {}
        index = 1

        for paymentEl in payment_el_row.find_all("img"):
            if not (paymentEl is None):
                json_data['payment'][index] = paymentEl.attrs['alt']
                index += 1

    json_data['rating'] = {}

    rating_el = soup.find("span", {"class": "num-of-rewiews"})
    if not (rating_el is None):
        review_count_el = rating_el.find("a")
        if not(review_count_el is None):
            json_data['rating']['review_count'] = review_count_el.text.strip().rstrip(' rating').rstrip('s')

    if json_data['rating'].get('review_count') is None:
        rating_el = soup.find("a", {"id": "_rvwlnk"})
        if not (rating_el is None):
            json_data['rating']['review_count'] = int(rating_el.text.strip().rstrip('s').rstrip(' rating'))

    average_rating_el = soup.find("span", {"class": "review--start--rating"}) or \
        soup.find("span", {"class": "ebay-review-start-rating"})
    if not(average_rating_el is None):
        json_data['rating']['average'] = average_rating_el.text.strip()

    review_summary_el = soup.find('ul', {'class': 'ebay-review-list'})
    if not(review_summary_el is None):
        json_data['rating']['stats'] = {}

        for review_row_el in review_summary_el.find_all('li', {'class': 'ebay-review-item'}):
            if not(review_row_el is None):
                review_name_el = review_row_el.find("p", {"class": "ebay-review-item-stars"})
                review_rating_el = review_row_el.find("div", {"class": "ebay-review-item-r"}).find("span")
                if review_name_el and review_rating_el:
                    json_data['rating']['stats'][review_name_el.text + ' star'] = review_rating_el.text

    reviews_el_block = soup.find("div", {"class": "reviews"})
    if not(reviews_el_block is None):
        json_data['reviews'] = {}
        index = 1
        overall_sentiment = 0
        all_reviews = ''

        for reviewsEl in reviews_el_block.find_all("div", {"class": "ebay-review-section"}):
            if not(reviewsEl is None):

                json_data['reviews'][index] = {}
                reviewed_by_el = reviewsEl.find("a", {"itemprop": "author"})
                if not(reviewed_by_el is None):
                    json_data['reviews'][index]['reviewed_by'] = reviewed_by_el.text.strip()

                review_rating_el = reviewsEl.find("div", {"class": "ebay-star-rating"})
                if not(review_rating_el is None):
                    json_data['reviews'][index]['review_rating'] = (review_rating_el.attrs.get("title") or review_rating_el.attrs.get("aria-label")).strip()

                review_name_el = reviewsEl.find("p", {"itemprop": "name"})
                if not(review_name_el is None):
                    json_data['reviews'][index]['title'] = review_name_el.text.strip()

                review_description_el = reviewsEl.find("p", {"itemprop": "reviewBody"})
                if not(review_description_el is None):

                    reviews_text = review_description_el.text.strip()
                    json_data['reviews'][index]['text'] = reviews_text

                    review_sentiment = sentiment(reviews_text)
                    all_reviews += reviews_text
                    overall_sentiment += review_sentiment['compound']

                reviewed_on_el = reviewsEl.find("span", {"itemprop": "datePublished"})
                if not(reviewed_on_el is None):
                    json_data['reviews'][index]['reviewed_on'] = reviewed_on_el.text.strip()

                review_attributes_el_block = reviewsEl.find("p", {"class": "review-attr"})
                if not(review_attributes_el_block is None):
                    attribute_index = 0
                    for reviewAttributesEl in review_attributes_el_block.find_all("span", {"class": "rvw-attr"}):
                        if not(reviewAttributesEl is None):

                            review_value_el = review_attributes_el_block.select("span.rvw-val")[attribute_index]
                            if not(review_value_el is None):

                                json_data['reviews'][index][reviewAttributesEl.text.strip()] = \
                                    review_value_el.text.strip()
                                attribute_index += 1

                    index += 1

        if all_reviews != '':
            json_data['tones'] = get_tone(all_reviews, config)
            json_data['overall_sentiment'] = round(overall_sentiment/index, 2)

    data = {
        'scraped_data': json_data
    }
    # print(product_data)

    return data
    def __init__(
        self,
        unit_id,
        date,
        sentiment_ids_map,
        identity_ids_map,
        gram_list,
        emoji_info,
        emoticon_to_eval_dim,
        hashtag_epa_data,
        vader_dict,
        dependency_parsed_conll=None,
        dependency_parsed_objects=None,
        raw_text=None,
        sent_values=None,
        verbose=False,
        node_must_be_identity=False,
        use_events=True,
        use_behaviors=True,
        use_isa=True,
        use_parent_child=True,
        use_own_full_sentence=True,
        use_clause_level=True,
        do_negation_on_full_sentence=True,
    ):
        """
        :param sentiment_ids_map: mapping from sentiment word we care about to its id
        :param identity_ids_map: mapping from identity we care about to its id
        :return: a map from identity ids to sentiment constraints on that identity
        :param dependency_parsed_conll:
        :param dependency_parsed_objects:
        :param raw_text:
        :return:
        """
        if not raw_text and not dependency_parsed_conll and not dependency_parsed_objects:
            raise Exception(
                "you didnt provide any data to the TextUnit constructor")

        self.unit_id = unit_id
        self.date = date

        # params for whether or not to use, e.g, behavioral constraints
        self.use_events = use_events
        self.use_behaviors = use_behaviors
        self.use_isa = use_isa
        self.use_parent_child = use_parent_child
        self.use_own_full_sentence = use_own_full_sentence
        self.use_clause_level = use_clause_level
        self.do_negation_on_full_sentence = do_negation_on_full_sentence
        #self.all_identity_words_to_epa = all_identity_words_to_epa

        ### MEAN FROM UGA DATA
        sent_values[ZERO_IDENTITY_INDICATOR + 'e'] = 0.0
        sent_values[ZERO_IDENTITY_INDICATOR + 'p'] = 0.0
        sent_values[ZERO_IDENTITY_INDICATOR + 'a'] = 0.0
        sentiment_ids_map[ZERO_IDENTITY_INDICATOR] = ZERO_IDENTITY_INDICATOR

        self.node_must_be_identity = node_must_be_identity

        self.identity_ids_map = identity_ids_map
        self.sentiment_ids_map = sentiment_ids_map
        if emoji_info:
            self.emojis_to_eval_dim = emoji_info[0]
            self.emoji_regex = emoji_info[1]
        else:
            self.emoticon_to_eval_dim = self.emoji_regex = None
        self.emoticon_to_eval_dim = emoticon_to_eval_dim
        self.hashtag_to_epa = hashtag_epa_data
        self.gram_list = gram_list
        self.verbose = verbose

        # for sentence-level E, P, A constraints using emojis, emoticons, hashtags
        self.sentence_level_e = list()
        self.sentence_level_p = list()
        self.sentence_level_a = list()

        # for debugging purposes, a human-readable view of the constraints in this sentence
        self.constraint_string_list = []
        self.thot_words = []

        # list of all constraints
        self.all_constraints = []

        # the identities in this tweet (binary yes/no)
        self.identities = []

        # to ensure no multiple constraints are added
        self.sentence_ids_to_constraints = defaultdict(set)

        if raw_text:
            self.raw_text = raw_text
            constraints_map = self.get_constraints_from_raw_text(raw_text)
        elif dependency_parsed_conll:
            constraints_map = self.get_constraints_from_conll(
                dependency_parsed_conll, sent_values)
        else:
            constraints_map = self.get_constraints_from_dep_objs(
                dependency_parsed_objects, sent_values)

        # store the identity ids for easy retrieval, but only those in our identity set
        iden_set = set(self.identity_ids_map.values())
        # self.identities += constraints_map.keys()
        self.identities = list(
            set([x for x in self.identities if x in iden_set]))

        # construct sentence-level constraint
        if vader_dict:
            self.sentence_level_e.append(
                sentiment(self.raw_text, vader_dict, 2.)['compound'])
        sent_e_value = self.get_value_for_constraint_from_list(
            self.sentence_level_e)
        sent_p_value = self.get_value_for_constraint_from_list(
            self.sentence_level_p)
        sent_a_value = self.get_value_for_constraint_from_list(
            self.sentence_level_a)

        # e will always have at least 1 if there are any sentence level vars
        if sent_e_value:
            self.constraint_string_list.append(
                "SENTENCE LEVEL: E: {e} P: {p} A: {a}".format(e=sent_e_value,
                                                              p=sent_p_value,
                                                              a=sent_a_value))
            for identity in set(self.identities):
                sl = SentenceLevelConstraint(identity, sent_e_value,
                                             sent_p_value, sent_a_value)
                constraints_map[identity].append(sl)
                self.all_constraints.append(sl)

        # construct the full deflection equation
        self.full_deflection_string = " + ".join(
            [c.get_constraint_string() for c in self.all_constraints])
        if len(self.full_deflection_string):
            self.full_deflection_string = SENT_REPLACE_REGEX.sub(
                lambda x: str(sent_values[x.group()]),
                self.full_deflection_string)
            self.full_deflection_string = str(
                sympify(self.full_deflection_string))
            self.full_deflection_string = ADD_UV_REGEX.sub(
                lambda x: "uv." + x.group(0), self.full_deflection_string)

        # store constraint strings for each identity
        self.identities_to_constraint_string_map = {}
        for identity, constraint_list in constraints_map.items():
            # if this isn't in the set we care about, keep going
            if identity not in iden_set:
                continue

            eq_constr = [
                constraint.get_constraint_string()
                for constraint in constraint_list
            ]

            equation_str = "+".join(eq_constr)
            equation_str = SENT_REPLACE_REGEX.sub(
                lambda x: str(sent_values[x.group()]), equation_str)
            constraint = sympify(equation_str)

            for val in ['e', 'p', 'a']:
                p = poly(constraint, Symbol(identity + val)).all_coeffs()

                if len(p) != 3:
                    if val == 'e':
                        print 'CONSTRAINT DIDNT WORK!!!!'
                        # print "\n".join(dependency_parsed_conll)
                    continue
                p_0 = "+".join([
                    "*".join([str(key),
                              FLOAT_FORMAT.format(float(v))])
                    for key, v in p[0].as_coefficients_dict().items()
                ])
                # try to make it a float, if it doesn't work, then it has variables in it
                try:
                    p_0 = eval(p_0)
                except:
                    p_0 = ADD_UV_REGEX.sub(lambda x: "uv." + x.group(0), p_0)
                p_1 = "+".join([
                    "*".join([str(key),
                              FLOAT_FORMAT.format(float(v))])
                    for key, v in p[1].as_coefficients_dict().items()
                ])
                try:
                    p_1 = eval(p_1)
                except:
                    p_1 = ADD_UV_REGEX.sub(lambda x: "uv." + x.group(0), p_1)
                self.identities_to_constraint_string_map[identity +
                                                         val] = [p_0, p_1]

        # release these things to save memory
        self.identity_ids_map = None
        #self.all_identity_words_to_epa = None
        self.sentiment_ids_map = None
        self.gram_list = None
        self.emojis_to_eval_dim = None
        self.emoji_regex = None
        self.emoticon_to_eval_dim = None
        self.hashtag_to_epa = None
Example No. 36
        return sentence
       


D = {}
D["amazon"]  = [0,0,0,0]
D["walmart"] = [0,0,0,0]

with open('final.csv', 'rb') as f:
    mycsv = csv.reader(f)
    lamazon = 0
    lwalmart = 0
    for row in mycsv:
        txt = row[2]
        pre = preprocess(txt)
        vs = vaderSentiment.sentiment(pre.encode("utf8"))
        if row[0] in A:
            D["amazon"][0] += vs["neg"]
            D["amazon"][1] += vs["neu"]
            D["amazon"][2] += vs["pos"]
            D["amazon"][3] += vs["compound"]
            lamazon += 1
        elif row[0] in W:
            D["walmart"][0] += vs["neg"]
            D["walmart"][1] += vs["neu"]
            D["walmart"][2] += vs["pos"]
            D["walmart"][3] += vs["compound"]
            lwalmart += 1
print D

D["amazon"][0] = D["amazon"][0] /lamazon
Example No. 37
def sentiment_positive(tweet_text, language):
	if language is "en":
		return sentiment(tweet_text)['pos']
	else:
		return float(0)
Example No. 38
def getLexicalDiversity(words):
    numUnique = len(set(words))
    numTotal = len(words)
    return ((1.0 * numUnique) / numTotal)


for status in statuses:
    corpus = []
    for w in status['text'].split():
        w = removeUnicode(w)
        if w in skips: continue
        if 'http' in w: continue
        if '&amp;' in w: continue
        if '&gt;' in w: continue
        if 'RT' in w: continue
        corpus.append(w)
    unique = set(corpus)
    senti = sentiment(status['text'].encode('utf-8'))
    overall_sentiment += float(senti['compound'])
    print "User:"******"Favorite Count:", str(status['favorite_count'])
    print "Tweet:", removeUnicode(status['text'])
    print "Lexical Diversity: ", getLexicalDiversity(
        removeUnicode(status['text']))
    print "Retweet Count: ", status['retweet_count']
    print "Compound Sentiment:", senti['compound']
    print "Corpus:"
    for w in corpus:
        print '\t' + w
    print "Unique tokens:"
    for w in unique:
        print '\t' + w
    print "-----"
def add_comments_history_correlation_features():
	"""
	"""
	from pymongo import MongoClient
	from tqdm import tqdm
	from datetime import datetime
	from RMP_metadata import interest_lookup
	from vaderSentiment.vaderSentiment import sentiment
	from scipy.stats import pearsonr
	from pdb import set_trace

	set_trace()

	rmpdb = MongoClient('mongodb://localhost:27017')['rmpdb']
	ds_cur = rmpdb['dataset_profs'].find({}, {'prof_id' : 1}, no_cursor_timeout = True)

	for row in tqdm(ds_cur):
		prof_id = row['prof_id']
		comments_cur = rmpdb['comments'].find({'prof_id' : prof_id}, {'_id' : 0, 'rClarity' : 1, 'rEasy' : 1, 'rHelpful' : 1, 'rInterest' : 1, 'rComments' : 1, 'rDate' : 1})
		comments = list()
		if comments_cur.count() > 2:
			for comment in comments_cur:
				comments.append(comment)

			comments_by_date = sorted(comments, key = lambda x: datetime.strptime(x['rDate'], '%m/%d/%Y'))

			help_list = list()
			clar_list = list()
			ease_list = list()
			interest_list = list()
			comment_positivity_list = list()
			comment_negativity_list = list()
			for comment in comments_by_date:
				help_list.append(float(comment['rHelpful']))
				clar_list.append(float(comment['rClarity']))
				ease_list.append(float(comment['rEasy']))
				if comment['rInterest'] in interest_lookup:
					interest_list.append(interest_lookup[comment['rInterest']])
				sentiments = sentiment(comment['rComments'].encode('utf-8'))
				comment_positivity_list.append(sentiments['pos'])
				comment_negativity_list.append(sentiments['neg'])

			pparam1 = list()
			pparam2 = list()

			for i in range(1, len(help_list)):
				pparam1.append(float(sum(help_list[:i])) / len(help_list[:i]))
				pparam2.append(help_list[i])
			help_history_corr, help_p_val = pearsonr(pparam1, pparam2)

			
			pparam1 = list()
			pparam2 = list()

			for i in range(1, len(ease_list)):
				pparam1.append(float(sum(ease_list[:i])) / len(ease_list[:i]))
				pparam2.append(ease_list[i])
			ease_history_corr, ease_p_val = pearsonr(pparam1, pparam2)

			pparam1 = list()
			pparam2 = list()

			for i in range(1, len(clar_list)):
				pparam1.append(float(sum(clar_list[:i])) / len(clar_list[:i]))
				pparam2.append(clar_list[i])			
			clar_history_corr, clar_p_val = pearsonr(pparam1, pparam2)

			pparam1 = list()
			pparam2 = list()

			for i in range(1, len(comment_positivity_list)):
				pparam1.append(float(sum(comment_positivity_list[:i])) / len(comment_positivity_list[:i]))
				pparam2.append(comment_positivity_list[i])			
			comment_positivity_corr, comment_positivity_p_val = pearsonr(pparam1, pparam2)

			pparam1 = list()
			pparam2 = list()

			for i in range(1, len(comment_negativity_list)):
				pparam1.append(float(sum(comment_negativity_list[:i])) / len(comment_negativity_list[:i]))
				pparam2.append(comment_negativity_list[i])			
			comment_negativity_corr, comment_negativity_p_val = pearsonr(pparam1, pparam2)

			insertion_dict = {	'17' : clar_history_corr,
								'18' : ease_history_corr,
								'19' : help_history_corr,
								'20' : comment_positivity_corr,
								'21' : comment_negativity_corr}

			if len(interest_list) > 2:

				pparam1 = list()
				pparam2 = list()

				for i in range(1, len(interest_list)):
					pparam1.append(float(sum(interest_list[:i])) / len(interest_list[:i]))
					pparam2.append(interest_list[i])			
				interest_history_corr, interest_p_val = pearsonr(pparam1, pparam2)
				insertion_dict['22'] = interest_history_corr

			rmpdb['dataset_profs'].update_one({'_id' : row['_id']}, {'$set' :insertion_dict})	
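
# The six correlation blocks above all follow the same pattern: correlate the
# running mean of a rating's history with the next observed value. A minimal
# sketch of that pattern as a reusable helper (hypothetical name; not part of
# the original module):
def history_correlation(values):
    from scipy.stats import pearsonr
    running_means = [float(sum(values[:i])) / i for i in range(1, len(values))]
    next_values = [values[i] for i in range(1, len(values))]
    return pearsonr(running_means, next_values)

# e.g. help_history_corr, help_p_val = history_correlation(help_list)
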
Exemplo n.º 40
0
def sentiment_neutral(tweet_text, language):
	if language == "en":
		return sentiment(tweet_text)['neu']
	else:
		return float(0)
Exemplo n.º 41
0
        finalanswer2[location] = 0
        company_tweets = {}
        for c in company_keywords:
            company_tweets[c] = []
            for content in _location_tweets[location]:
                for kw in company_keywords[c]:
                    if kw in content:
                        company_tweets[c].append(content)
                        break
        for company in company_tweets:
            sentences = company_tweets[company]
            neg = 0.0
            neu = 0.0
            pos = 0.0
            for sentence in sentences:
                vs = vaderSentiment.sentiment(sentence.encode("utf8"))
                neg += vs["neg"]
                neu += vs["neu"]
                pos += vs["pos"]
            finalanswer1[location].append([company, (neg + neu + pos)])
            finalanswer2[location] += neg + neu + pos

# finalanswer2 ranks states by total summed score, from most to least; note that
# neg + neu + pos is roughly 1.0 per tweet, so this largely tracks how many
# matching tweets each state has
finalanswer2 = sorted(finalanswer2.items(), key=lambda x: (-x[1], x[0]))

# for the top 5 states, print each company's score
for stateL in finalanswer2[:5]:
    print stateL[0]
    print finalanswer1[stateL[0]]
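
# As noted above, the summed neg + neu + pos mostly measures tweet volume.
# A sketch of an alternative ranking by summed compound score instead
# (assumes the same _location_tweets structure as the loop above):
state_compound = {}
for location in _location_tweets:
    state_compound[location] = sum(
        vaderSentiment.sentiment(t.encode("utf8"))["compound"]
        for t in _location_tweets[location])
ranked = sorted(state_compound.items(), key=lambda x: (-x[1], x[0]))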

Exemplo n.º 42
0
        for line in f:
            users.append(line.strip())

    with open("data/sandy_all.txt", 'rt') as f:
        for line in f:
            tweet = line.split('\t')
            # print(tweet)
            user = tweet[0]

            # is this user in the master list
            if user not in users:
                continue

            if user not in table.keys():
                table[user] = {'total': 0, 'neutral': 0}

            # print('---')
            # print(tweet)
            ret = vaderSentiment.sentiment(tweet[1])
            # print(ret)

            if ret['neu'] >= threshold:
                table[user]['neutral'] += 1
            table[user]['total'] += 1

    with open('results.txt', 'w') as f:
        for user in sorted(table, key=lambda x: int(x)):
            # print(user, str(float(table[user]['neutral']) / float(table[user]['total'])))
            # f.write(user + ' ' + str(float(table[user]['neutral']) / float(table[user]['total'])) + '\n')
            f.write(str(float(table[user]['neutral']) / float(table[user]['total'])) + '\n')
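
# The excerpt above relies on `users`, `table` and `threshold` being defined
# before it starts; a plausible setup is sketched here (hypothetical threshold
# value, not from the original):
users = []            # master list of user ids, filled by the first loop above
table = {}            # per-user counters: {'total': n, 'neutral': m}
threshold = 0.8       # a tweet counts as neutral when its 'neu' score reaches this
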
Exemplo n.º 43
0
def scrape_rakuten_site(prod_url, config):

      r = requests.get(prod_url)

      data = r.text

      soup = BeautifulSoup(data, 'html.parser')
      json_data = {}

      title_el = soup.find("h1", {"id": "product-title-heading"})
      if not(title_el is None):
          json_data['title'] = str(title_el.text).strip()

      price_el = soup.find("span", {"class": "price"})
      if not (price_el is None):
          ##price_el = get_text(price_el)
          json_data['price'] = (price_el.text).strip()


      savings_el = soup.find("div", {"class": "text-muted"})
      if not (savings_el is None):
          json_data['savings'] = (savings_el.text).strip()


      shipping_el = soup.find(text="+ free shipping")
      if not (shipping_el is None):
          shipping_el = 'FREE'
          json_data['shippingCost'] = shipping_el
      else:
          shipping_el = 'EXTRA'
          json_data['shippingCost'] = shipping_el


      image_el = soup.find("img", {"id": "productmain"})
      if not(image_el is None):
          json_data['image'] = image_el.attrs["src"]


      brand_el = soup.find("td", {"class": "tab-table"})
      if not(brand_el is None):
          json_data['brand'] = brand_el.text

      merchant_el = soup.find("div", {"class": "seller"})
      if not (merchant_el is None):
          json_data['merchant'] = str(merchant_el.text)

      availability_el = soup.find("strong", {"class": "text-success"})
      if not(availability_el is None):
          json_data['availability'] = get_text(availability_el)


 #   return_in_el = soup.find("span", {"id": "vi-ret-accrd-txt"})
 #   if not (return_in_el is None):
 #        json_data['returnIn'] = (return_in_el.text).strip()

      prod_description_el = soup.find("div", {"itemprop": "description"})
      if not (prod_description_el is None):
          json_data['ProdDescription'] = (prod_description_el.text).strip()

      link_to_buy_el = soup.find("a", {"a": "add-to-cart-main.add-to-cart.btn_btn"})
      if not (link_to_buy_el is None):
          json_data['linkToBuy'] = (link_to_buy_el.text).strip()

      json_data['reviews'] = {}
      json_data['rating'] = {}


      average_rating_block_el = soup.find("div", {"class": "rating-summary"})
      if not (average_rating_block_el is None):
          average_rating_el = average_rating_block_el.find("strong")

          if not(average_rating_el is None):
             json_data['rating']['average'] = get_text(average_rating_el)
             json_data['average'] = get_text(average_rating_el)
             #debug='jon'
             #print debug
             #print get_text(average_rating_el)
             #print debug



      review_count_el = soup.find("strong", {"class": "rating-indicator"})
      if not (review_count_el is None):
          json_data['rating']['review_count'] = int(str((review_count_el.text).strip()))
          json_data['review_count'] = int(str((review_count_el.text).strip()))

      #user_ratingEl_block = soup.findAll("div", {"class": "review-avg"})
      #json_data['rating']['stats'] = {}
      #if not(user_ratingEl_block is None):
       #  index = 1

         #for user_ratingEl in soup.findAll("div", {"class": "review-avg"}):
          #  if not(user_ratingEl is None):


                   #             review_rating_el = user_ratingEl.findAll("i", {"class": "fa fa-circle"}) or user_ratingEl.findAll("i", {"class": "fa fa-circle"})

                   #             if not(review_rating_el is None):
                   #               rating_score = len(review_rating_el)
                   #               rating_score = str(rating_score)
                    #              json_data['rating']['stats'][index+ ' star'] = rating_score

                    #            index +=1

      statsblock_el = soup.findAll("div", {"id": "ratings"})
      json_data['rating']['stats'] = {}
      index3 = 5
      if not(statsblock_el is None):
          for stats_el in soup.findAll("div", {"class": "rating-indicator"}):
              rating_score = stats_el.contents[0]
              name = str(index3)
              json_data['rating']['stats'][name + ' star'] = str(rating_score)
              index3 -= 1


      reviews_el_block = soup.findAll("p", {"class": "dotdotdot"})

      if not(reviews_el_block is None):
          json_data['reviews'] = {}
          index2 = 1
          all_reviews = ''
          overall_sentiment = 0

          for reviewsEl in soup.findAll("p", {"class": "dotdotdot"}):

              if not(reviewsEl is None):

                  json_data['reviews'][index2] = {}

                  #reviewed_by_el = reviewsEl.find("em", {"class": "review-avg"})
                  #if not(reviewed_by_el is None):
                  #   json_data['reviews'][index2]['reviewed_by'] = reviewed_by_el.text.strip()

                  reviewDescription_el = reviewsEl.contents[0]

                  if not(reviewDescription_el is None):
                      json_data['reviews'][index2]['text'] = str(reviewDescription_el)
                      all_reviews += str(reviewDescription_el)
                      # keep the scores of the most recent review, as the original did
                      review_sentiment = sentiment(str(reviewDescription_el))
                      overall_sentiment = review_sentiment
                      index2 += 1

          # tone and overall sentiment are written once all reviews have been collected
          if all_reviews != '':
              all_tones = get_tone(all_reviews, config)
              json_data['tones'] = str(all_tones)
              json_data['overall_sentiment'] = overall_sentiment


#soup.find('div', class_='detail_date').find('dt', #text='Date').find_next_sibling('dd').text


      value = soup.find(text="Overall Satisfaction").findNext('dt').contents[0]
      #if not(reviewDescription_el is None):
      #   get_value = get_float(value)
      #   get_value = str(get_value)


      ease = soup.find(text="Value").findNext('dt').contents[0]
      performance = soup.find(text="Ease of Use").findNext('dt').contents[0]
      over = soup.find(text="Overall Satisfaction").findPrevious('dt').contents[0]

      get_over = get_float(over)
      get_over = str(get_over)

      get_ease = get_float(ease)
      get_ease = str(get_ease)

      get_performance = get_float(performance)
      get_performance = str(get_performance)

      get_value = get_float(value)
      get_value = str(get_value)

      json_data['Over'] = get_over
      json_data['Value'] = get_value
      json_data['Ease of Use'] = get_ease
      json_data['Performance'] = get_performance






      data = {
          'scraped_data': json_data
      }
      # print(product_data)

      return data
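

# Usage sketch (hypothetical URL; the calling code is not part of this example).
# The second argument is only passed through to get_tone().
if __name__ == '__main__':
    result = scrape_rakuten_site('https://www.rakuten.com/shop/example/product/12345/', {})
    print result['scraped_data'].get('title')
    print result['scraped_data'].get('price')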