Example #1
 def on_data(self, raw_data):
     tweet = loads(raw_data)
     try:
         text = tweet['text']
         if tweet.get('retweeted_status') is None and 'RT @' not in text:
             if tweet.get('coordinates') is None:
                 # TODO: Check for rate limit. If rate limited, then perform location inference
                 nouns = self._get_nouns(tweet_text=text)
                 # bf = BilateralFriends(user_id=tweet['user']['id'], twitter_api=self.api)
                 # loc_occurrence_count = bf.get_location_occurrence()
                 tweet_nouns = defaultdict(int)
                 for noun in nouns:
                     tweet_nouns[noun] += 1
                 self.corpus[tweet['user']['id']] = {'id': tweet['user']['id'],
                                                     'location': tweet['user']['location'],
                                                     # 'bilateral_friends_location_occurrences': loc_occurrence_count,
                                                     'text_nouns': tweet_nouns}
                 loc_inf = LocationInference(user=self.corpus[tweet['user']['id']], local_words=self.local_words,
                                             geo_words=self.geo_words)
                 inferred_location = loc_inf.get_location()
                 print inferred_location
                 print 'Predicted location:', inferred_location[0]
                 tweet['coordinates'] = {'type': 'Point', 'coordinates': [LOCATIONS[inferred_location[0]][1],
                                                                          LOCATIONS[inferred_location[0]][0]]}
                 print tweet['coordinates']
             sentiment_analyzer = SentimentIntensityAnalyzer()
             sentiment_score = sentiment_analyzer.polarity_scores(text=text)['compound']
             tweet['sentiment'] = sentiment_score
             current_time_ms = int(round(time() * 1000))
             tweet['time_inserted'] = current_time_ms
             print text, ': ', str(sentiment_score)
             STREAM_BUFFER.insert(tweet)
     except KeyError, v:
         print 'KeyError: ', v
Example #2
def main():
    parser = argparse.ArgumentParser(description="Reads in output from " +
        "downloadGroupmeMessages and runs a sentiment analysis")
    parser.add_argument("inFile", help="The file containing the stored messages")
    parser.add_argument("--outFile", default="out.txt", help="Results go here")
    args = parser.parse_args()
    
    print("\nThis program prints the most negative and positive users of the chat ranked according to their average score from the VADER sentiment intensity analyzer in the NLTK. Not super accurate, but it's a fun conversation starter")
    print("The program takes a few seconds to run, and requires that you have some of the NLTK corpora installed.")

    with open(args.inFile, 'r') as infile:
        infile.readline()
        analyzer = SentimentIntensityAnalyzer()
        negList = []
        positiveList = []
        counter = PostSentimentCounter()
        for line in infile:
            line = line.split('\t')
            message = line[3]
            id = line[0]
            name = line[1]
            
            sentDict = analyzer.polarity_scores(message)
            counter.countPost(id, name, sentDict)
        counter.printSentimentLeaderboards()
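The `PostSentimentCounter` class used by this example is not included in the snippet. A minimal sketch of what such a helper might look like, assuming it simply averages each user's compound score and prints ranked leaderboards as described by the banner text (all names and details below are hypothetical):

class PostSentimentCounter:
    """Hypothetical sketch, not the original class: accumulate per-user
    compound scores and print average-sentiment leaderboards."""

    def __init__(self):
        self.totals = {}   # user id -> running sum of compound scores
        self.counts = {}   # user id -> number of posts
        self.names = {}    # user id -> display name

    def countPost(self, user_id, name, sentDict):
        self.totals[user_id] = self.totals.get(user_id, 0.0) + sentDict['compound']
        self.counts[user_id] = self.counts.get(user_id, 0) + 1
        self.names[user_id] = name

    def printSentimentLeaderboards(self, top_n=5):
        averages = sorted(
            ((self.totals[uid] / self.counts[uid], self.names[uid]) for uid in self.totals),
            reverse=True)
        print("Most positive users:")
        for avg, name in averages[:top_n]:
            print("  {}: {:.3f}".format(name, avg))
        print("Most negative users:")
        for avg, name in reversed(averages[-top_n:]):
            print("  {}: {:.3f}".format(name, avg))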
Example #3
    def sentiment_analysis_text(self, text_insert):

        text = text_insert

        token_text = tokenize.sent_tokenize(text)
        sid = SentimentIntensityAnalyzer()

        over_all_sentiment = 0
        count = 0

        for sentence in token_text:
            score = sid.polarity_scores(sentence)
            # Accumulate the overall sentiment score
            over_all_sentiment += score.get("compound")

            # If the sentence is not neutral, add it to the sentence count for the average
            if abs(score.get("compound")) > 0.1:
                count += 1

        # Calculate average sentiment
        if count > 0:
            average_sentiment = over_all_sentiment/count
        else:
            average_sentiment = over_all_sentiment

        return average_sentiment
Example #4
 def add_sentiment(self):
     print 'Adding sentiment...',
     sia = SentimentIntensityAnalyzer()
     for sentiment in ('pos', 'neg', 'neu', 'compound'):
         sentify = lambda s: sia.polarity_scores(s[:200])[sentiment]
         self.df['sentiment_' + sentiment] = self.df['story body'].apply(sentify)
     print 'done'
Example #5
def get_tweets(q, today):
    r = api.request(
        "search/tweets", {"q": "%s since:%s" % (q, today), "count": "100", "result_type": "recent", "lang": "en"}
    )
    data = (json.loads(r.text))["statuses"]
    sid = SentimentIntensityAnalyzer()
    all_tweets = []
    for i in range(0, len(data)):
        text = data[i]["text"].encode("ascii", "ignore").decode("ascii")
        if "RT" in text:
            RT = True
        else:
            RT = False
        others = text.count("@")
        sent = TextBlob(text)
        valance = sent.sentiment.polarity
        NLTK = sid.polarity_scores(text)
        tweet_data = {
            "tweetID": data[i]["id"],
            "created_at": data[i]["created_at"],
            "text": text,
            "textblob": valance,
            "NLTK": NLTK["compound"],
            "RT": RT,
            "others": others,
        }
        # print(data[i])
        all_tweets.append(tweet_data)
    return all_tweets
Example #6
def nltk_sentiment(tweets):
	sentiment = []
	sid = SentimentIntensityAnalyzer()
	for tweet in tweets:
		st = sid.polarity_scores(tweet)
		sentiment.append(st['compound'])
	return sentiment
Example #7
def analyze_sentiment_vader_lexicon(review,
                                    threshold=0.1,
                                    verbose=False):
    # pre-process text
    review = normalize_accented_characters(review)
    review = html_parser.unescape(review)
    review = strip_html(review)
    # analyze the sentiment for review
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(review)
    # get aggregate scores and final sentiment
    agg_score = scores['compound']
    final_sentiment = 'positive' if agg_score >= threshold\
                                   else 'negative'
    if verbose:
        # display detailed sentiment statistics
        positive = str(round(scores['pos'], 2)*100)+'%'
        final = round(agg_score, 2)
        negative = str(round(scores['neg'], 2)*100)+'%'
        neutral = str(round(scores['neu'], 2)*100)+'%'
        sentiment_frame = pd.DataFrame([[final_sentiment, final, positive,
                                        negative, neutral]],
                                        columns=pd.MultiIndex(levels=[['SENTIMENT STATS:'], 
                                                                      ['Predicted Sentiment', 'Polarity Score',
                                                                       'Positive', 'Negative',
                                                                       'Neutral']], 
                                                              labels=[[0,0,0,0,0],[0,1,2,3,4]]))
        print sentiment_frame
    
    return final_sentiment
Example #8
def add_sentiment_to_comments():
    sia = SentimentIntensityAnalyzer()
    for story_comment_list in comments.values():
        for comment in story_comment_list:
            if "text" in comment:
                comment["sentiment"] = sia.polarity_scores(comment["text"])
            print(comment)  # each comment now carries its VADER scores when text was present
Example #9
def analyze(posts):
  post_json = setup_json()
  #for post, replies in posts.iteritems()

  sid = SentimentIntensityAnalyzer()
  for key, value in posts.iteritems():
    nustring = ' '.join(value[0]).replace("u'", "")
    ss = sid.polarity_scores(nustring)
    for k in sorted(ss):
      if k is "compound":
        entry = {}
        entry['name'] = int(ss[k]*len(nustring))
        entry['size'] = len(nustring)
        if ss[k] == 0.0:
          post_json['children'][1]['children'].append(entry)
        elif ss[k] < -0.8:
          post_json['children'][2]['children'][2]['children'].append(entry)
        elif ss[k] < -0.4:
          post_json['children'][2]['children'][1]['children'].append(entry)
        elif ss[k] < -0.0:
          post_json['children'][2]['children'][0]['children'].append(entry)
        elif ss[k] < 0.4:
          post_json['children'][0]['children'][0]['children'].append(entry)
        elif ss[k] < 0.8:
          post_json['children'][0]['children'][1]['children'].append(entry)
        else:
          post_json['children'][0]['children'][2]['children'].append(entry)
  return post_json
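The `setup_json()` helper is not shown here. Judging only from the indexing in `analyze()` (children[0] holds three positive buckets, children[1] a single neutral bucket, children[2] three negative buckets), a hypothetical skeleton might look like the following; the bucket names are illustrative, not from the original:

def setup_json():
    # Hypothetical skeleton matching the indexing used in analyze():
    #   children[0] -> positive (three sub-buckets), children[1] -> neutral,
    #   children[2] -> negative (three sub-buckets).
    def bucket(name):
        return {'name': name, 'children': []}
    return {
        'name': 'posts',
        'children': [
            {'name': 'positive', 'children': [bucket('0.0-0.4'), bucket('0.4-0.8'), bucket('0.8-1.0')]},
            bucket('neutral'),
            {'name': 'negative', 'children': [bucket('-0.0--0.4'), bucket('-0.4--0.8'), bucket('-0.8--1.0')]},
        ],
    }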
Example #10
def sentiment_by_subreddit():
	phrase = urllib.quote(request.form["text"])
	year = urllib.quote(request.form["year"])

	sid = SentimentIntensityAnalyzer()

	year_str = str(year)
	if int(year) > 2014:
		year_str += "_01"

	query = '''SELECT subreddit, body, score FROM
	(SELECT subreddit, body, score, RAND() AS r1
	FROM [fh-bigquery:reddit_comments.''' + year_str + ''']
	WHERE REGEXP_MATCH(body, r'(?i:''' + phrase + ''')')
	AND subreddit IN (SELECT subreddit FROM (SELECT subreddit, count(*) AS c1 FROM [fh-bigquery:reddit_comments.''' + year_str + '''] WHERE REGEXP_MATCH(body, r'(?i:'''+phrase+''')') AND score > 1 GROUP BY subreddit ORDER BY c1 DESC LIMIT 10))
	ORDER BY r1
	LIMIT 5000)
	'''
	bigquery_service = build('bigquery', 'v2', credentials=credentials)
	try:
		query_request = bigquery_service.jobs()
		query_data = {
			'query': query,
			'timeoutMs': 30000
		}

		query_response = query_request.query(
			projectId=bigquery_pid,
			body=query_data).execute()

	except HttpError as err:
		print('Error: {}'.format(err.content))
		raise err
	
	subreddit_sentiments = defaultdict(list)
	subreddit_total = defaultdict(int)
	
	if 'rows' in query_response:
		rows = query_response['rows']
		sentiments = []
		for row in rows:
			subreddit = row['f'][0]['v']
			body = row['f'][1]['v']
			score = int(row['f'][2]['v'])
			sentiment_values = []
			
			lines_list = tokenize.sent_tokenize(body)
			for sentence in lines_list:
				if phrase.upper() in sentence.upper():#(regex.search(sentence)):
					s = sid.polarity_scores(sentence)
					sentiment_values.append(s['compound'])
		
			comment_sentiment = float(sum(sentiment_values))/len(sentiment_values)
			subreddit_sentiments[subreddit].append((comment_sentiment, score))
			subreddit_total[subreddit] += int(score)

	subreddit_sentiments = {subreddit:1 + float(sum([float(pair[0])*float(pair[1]) for pair in sentiment_list]))/subreddit_total[subreddit] for subreddit, sentiment_list in subreddit_sentiments.items()}
	result = sorted(subreddit_sentiments.items(), key = lambda(k,v): (-v,k))
	return json.dumps(result)
Example #11
 def get_unique_tweets(self, data_dict):
     # TODO: Implement filter to check if Tweet text starts with 'RT'
     """
     :param data_dict:
     :return:
     """
     flag = False
     try:
         text = data_dict['text'].encode('ascii', 'ignore').lower()
         # Check for 'retweeted_status' in metadata field to determine
         # if tweet is a retweet (1st check)
         if 'retweeted_status' not in data_dict:
             url_match = URL.match(text)
             # Check if link contains url
             if url_match:
                 match_group = url_match.group()
                 if len(self.key_list) > 0:
                     if any(match_group in item for item in self.key_list):
                         flag = True
                     if flag is False:
                         data_dict['text'] = match_group
                         print "Inserted text: " + data_dict['text'] + '\n'
                         self.key_list.append(match_group)
                         sid = SentimentIntensityAnalyzer()
                         ss = sid.polarity_scores(text)
                         print ss['compound']
                         score = ss['compound']
                         if score < 0:
                             score += (3 * score)
                         for w in GOOGLE:
                             if w in text and self.google_price >= 0:
                                 self.google_price = score
                                 self.google_text = text
                         for w in MICROSOFT:
                             if w in text and self.microsoft_price >= 0:
                                 self.microsoft_price = score
                                 self.microsoft_text = text
                         for w in FACEBOOK:
                             if w in text and self.facebook_price >= 0:
                                 self.facebook_price = score
                                 self.facebook_text = text
                         p.trigger('test_channel', 'my_event',
                                   {'google': self.google_price,
                                    'microsoft': self.microsoft_price,
                                    'facebook': self.facebook_price})
                         p.trigger('tweet_channel', 'my_event',
                                   {
                                       'google_text': self.google_text,
                                       'microsoft_text': self.microsoft_text,
                                       'facebook_text' : self.facebook_text
                                   })
                         self.google_price = 0
                         self.microsoft_price = 0
                         self.facebook_price = 0
                 else:
                     self.key_list.append(url_match.group())
     except TypeError, e:
         print >> sys.stderr, e
         self.log_error(str(e))
Example #12
    def vader(self):
        sid = SentimentIntensityAnalyzer()
        results = {'neg': 0.0, 'pos': 0.0, 'neu': 0.0, 'compound': 0.0}
        ss = sid.polarity_scores(self.text)
        for k in sorted(ss):
            results[k] += ss[k]

        return results
Example #13
def sentimentScore(sentences):
	analyzer = SentimentIntensityAnalyzer()
	results = []
	for sentence in sentences:
		vs = analyzer.polarity_scores(sentence)
		print("vs: " + str(vs))
		results.append(vs)
	return results
Example #14
 def avg_message_sentiment_helper(self, message):
     sentences = tokenize.sent_tokenize(message)
     sid = SentimentIntensityAnalyzer()
     sentence_sentiments = []
     for sentence in sentences:
         ss = sid.polarity_scores(sentence)
         sentence_sentiments.append(ss['compound'])
     return np.mean(sentence_sentiments)
Example #15
    def computeVaderScore(self,sentence):
        sid = SentimentIntensityAnalyzer()
        ss = sid.polarity_scores(sentence)
        retList = []
        for k in sorted(ss):
            retList.append(ss[k])

        return retList
Example #16
    def negative_msg_count(self):
        sid = SentimentIntensityAnalyzer()
        msg_count = 0
        for sentence in self.get_message_lst():
            ss = sid.polarity_scores(sentence)
            if ss['compound'] < 0:
                msg_count += 1

        return msg_count
Example #17
def sentiment(sentence):
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    if float(ss['neg']) > float(ss['pos']):
        return -1 * float(ss['neg'])
    elif float(ss['neg']) < float(ss['pos']):
        return float(ss['pos'])
    else:
        return 0
Example #18
	def process(self, tup):

		# extract the sentence
		sentence = tup.values[0]  

		sid = SentimentIntensityAnalyzer()
		ss = sid.polarity_scores(sentence)
		tuple_result = (str(ss['neg']),str(ss['pos']),str(ss['neu']))
		self.emit(tuple_result)
Example #19
def get_sentiment_score(sentence):
    score_dict = {}
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    for k in sorted(ss):
        # print('{0}: {1}, '.format(k, ss[k]), end='')
        score_dict[k] = ss[k]

    return score_dict
Example #20
def vader_sentiment_scores(text_array):
    sid = SentimentIntensityAnalyzer()
    assert all(isinstance(t, str) for t in text_array)
    vs_dict = {'neg': [], 'neu': [], 'pos': [], 'compound': []}
    for i, text in enumerate(text_array):
        if i % 10000 == 0:
            print(i)
        vs = sid.polarity_scores(text)
        for key, value in vs.items():
            vs_dict[key].append(value)
    return vs_dict
Example #21
def getSentiments(sentences):
    from nltk import tokenize

    sid = SentimentIntensityAnalyzer()
    sentiments = [None] * len(sentences)
    i = 0
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        sentiments[i] = ss["compound"]
        i = i + 1
    return sentiments
Example #22
def personAvgSentiment(person, month=None, year=None):
	from nltk.sentiment.vader import SentimentIntensityAnalyzer
	comp = 0
	sid = SentimentIntensityAnalyzer()
	if month:
		msgLst = fullMessageListMonth(person, month, year)
	else:	
		msgLst = fullMessageList(person)
	for message in msgLst:
		sentimentDict = sid.polarity_scores(message)
		comp += sentimentDict['compound']
	return comp/len(msgLst) if len(msgLst) != 0 else 0
Example #23
def get_score(candidate,sent_weight,twitter_positions,fb_positions,official_positions,nyt_positions):
    """
    " Get position similarity score between candidate and user. Computed
    " using cosine similarity. 
    "
    " Args:
    "   candidate: candidate name
    "   twitter_positions: Elasticsearch results from twitter index
    "   fb_positions: Elasticsearch results from fb index
    "   official_positions: Elasticsearch results from official index
    "   nyt_positions: Elasticsearch results form NYT index
    " Returns:
    "    Similarity score between candidate and user
    "
    " Author: Kersing Huang <*****@*****.**>, Matthew Garber
    """
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    text = []
    by_candidate = lambda x: x['candidate'] == candidate and len(x['result']['hits']['hits']) > 0
    t = filter(by_candidate,twitter_positions)
    if len(t) > 0:
        text.append(t[0]['result']['hits']['hits'][0]['_source']['sm_text'])
    #end if
    fb = filter(by_candidate,fb_positions)
    if len(fb) > 0:
        text.append(fb[0]['result']['hits']['hits'][0]['_source']['sm_text'])
    #end if

    sia = SentimentIntensityAnalyzer()

    # get candidate polarity scores
    candidate_scores = sia.polarity_scores("".join(text))

    # get user polarity scores
    user_input = json.loads(web.data())
    user_scores = sia.polarity_scores(user_input['position'])

    # compute cosine similarity of these polarity scores
    u_len = vector_length(user_scores)
    c_len = vector_length(candidate_scores)
    sentiment_score = vector_dot(candidate_scores,user_scores)/(u_len*c_len)

    official = filter(by_candidate,official_positions)
    nyt = filter(by_candidate,nyt_positions)
    relevance_score_sum = 0
    for source in [t, fb, official, nyt]:
        if source:
            relevance_score_sum += source[0]['score']
    relevance_score = relevance_score_sum/4
    weighted_score = (sent_weight * sentiment_score) + ((1 - sent_weight) * relevance_score)
    return weighted_score
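The `vector_length` and `vector_dot` helpers used above for the cosine similarity are defined elsewhere in the original project. A minimal sketch, assuming they treat the polarity-score dicts as plain numeric vectors keyed by 'neg', 'neu', 'pos', and 'compound', could be:

import math

def vector_length(scores):
    # Euclidean norm of the polarity-score values
    return math.sqrt(sum(v * v for v in scores.values()))

def vector_dot(a, b):
    # dot product over the keys the two score dicts share
    return sum(a[k] * b[k] for k in a if k in b)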
Example #24
def sentiAnalyze():
    analyzer = SentimentIntensityAnalyzer()


    for candidate in tweets:
        for week in tweets[candidate]:
            for tweet in tweets[candidate][week]:
                if candidate not in result['sum']:
                    result['sum'][candidate] = {}
                if 'total' not in result['sum'][candidate]:
                    result['sum'][candidate]['total'] = 0
                if week not in result['sum'][candidate]:
                    result['sum'][candidate][week] = 0

                result['sum'][candidate][week] += 1
                result['sum'][candidate]['total'] += 1


                score = analyzer.polarity_scores(tweet[2])
                if (score['pos'] - score['neg']) > DISTINCT_THRESHOLD:
                    if candidate not in pos_tweets:
                        pos_tweets[candidate] = {}
                    if week not in pos_tweets[candidate]:
                        pos_tweets[candidate][week] = []
                    pos_tweets[candidate][week].append(tweet)
                   
                    if candidate not in result['pos']:
                        result['pos'][candidate] = {}
                    if 'total' not in result['pos'][candidate]:
                        result['pos'][candidate]['total'] = 0
                    if week not in result['pos'][candidate]:
                        result['pos'][candidate][week] = 0
                    result['pos'][candidate][week] += 1
                    result['pos'][candidate]['total'] += 1

                if (score['neg'] - score['pos']) > DISTINCT_THRESHOLD:
                    if candidate not in neg_tweets:
                        neg_tweets[candidate] = {}
                    if week not in neg_tweets[candidate]:
                        neg_tweets[candidate][week] = []
                    neg_tweets[candidate][week].append(tweet)

                    if candidate not in result['neg']:
                        result['neg'][candidate] = {}
                    if 'total' not in result['neg'][candidate]:
                        result['neg'][candidate]['total'] = 0
                    if week not in result['neg'][candidate]:
                        result['neg'][candidate][week] = 0
                    result['neg'][candidate][week] += 1
                    result['neg'][candidate]['total'] += 1

    return
Example #25
def vader_sentiment(df):

        sith = SentimentIntensityAnalyzer()
        
        sentiment = []
        for sentence in df.Message:
                sent = sith.polarity_scores(sentence)
                #sent_total = sent['pos'] - sent['neg']

                sentiment.append(sent['compound'])
  
        df['sentiment'] = sentiment
        return df
Example #26
 def get_news_sentiment(self,team):
     print '------------------------------'
     print 'Scanning sentiment for: ',team
     print '------------------------------'
     story_limit=5
     neg=0
     pos=0
     score=0
     visible_text=''
     base_url='https://www.google.co.uk/search?hl=en&gl=uk&tbm=nws&authuser=0&q=football+european+championships+'+team.replace(' ','%20')
     req = urllib2.Request(base_url,headers = {'User-Agent': 'Mozilla/5.0'} )
     print base_url
     page = urllib2.urlopen(req)
     soup = BeautifulSoup(page)
     # remove script and style elements
     for script in soup(["script", "style"]):
             script.extract()    # rip it out
     # find the links
     sentence_count=0
     story_count=0
     for a in soup.findAll('a'):
         if story_count<story_limit:
             # ignore internal google links and remove the tracking guff from the end of the URL
             if ('google' not in a.attrs['href'] and '/search?q=football+european+championships' not in a.attrs['href'] and 'http://' in a.attrs['href']):
                 print a.attrs['href'].replace('/url?q=','').split('&')[0]
                 story_req=urllib2.Request(a.attrs['href'].replace('/url?q=','').split('&')[0],headers={'User-Agent': 'Mozilla/5.0'})
                 try:
                     story_page=urllib2.urlopen(story_req,timeout=5)
                     story_soup=BeautifulSoup(story_page)
                     # analyse text
                     visible_text = story_soup.getText()
                     neg=0
                     pos=0
                     sentences = tokenize.sent_tokenize(visible_text)
                     sid = SentimentIntensityAnalyzer()
                     # for each sentence in the story, get the sentiment/polarity
                     for sentence in sentences:
                         ss = sid.polarity_scores(sentence)
                         neg=neg+ss['neg']
                         pos=pos+ss['pos']
                     sentence_count=sentence_count+ len(sentences)
                     score=score+pos-neg
                     # print out a story by story sentiment for logging
                     print 'Sentiment: ',(pos-neg)/len(sentences)
                 except socket.timeout as e:
                     print type(e)    
                 except urllib2.HTTPError:
                     print 'failed on url: ',a.attrs['href']
                 story_count=story_count+1 
     # return the average net polarity/sentiment
     return score/sentence_count
     
Example #27
def ProcessReviews(df, ptype):
    parse_type = ptype

    # Divide reviews into individual sentences
    sentences = df['text'].apply(tokenizetext)

    # Stick the sentences back into the dataframe
    df['sentlist'] = sentences
    d1, d2, d3 = [], [], []
    d4, d5, d6 = [], [], []

    # Initialize the sentiment vader analyzer
    sid = SentimentIntensityAnalyzer()

    # Loop over sentences and process them
    for i in range(0, df.shape[0]):
        sent_list = df['sentlist'][i]
        for sentence in sent_list:
            sent_raw = ''.join(sentence)
            sent_pro = strip_punctuation(sent_raw)
            sent_pro = rmstopwords(sent_pro)
            sent_pro = lemmatize(sent_pro)
            sentiment = sid.polarity_scores(sent_raw)['compound']
            if parse_type[0] == 'ngram':
                pos = ngrams(sent_pro, ptype[1])
            elif parse_type == 'chunk':
                pos = extract_candidate_chunks(sent_pro)
            elif parse_type == 'rake':
                pos = rake_object.run(sent_raw)
                pos = ['_'.join(word[0].split()) for word in pos]
            for j in pos:
                d1.append(df['date'][i])
                d2.append(df['location'][i])
                d3.append(df['rating'][i])
                d4.append(j)
                d5.append(sentiment)
                d6.append(sent_raw)

    # Put everything in a dataframe
    processed_df = pd.DataFrame()
    processed_df['date']      = d1
    processed_df['location']  = d2
    processed_df['rating']    = d3
    processed_df['aspects']   = d4
    processed_df['sentiment'] = d5
    processed_df['context']   = d6

    # Remove any entry where the sentence
    # was determined to be neutral
    processed_df = processed_df[(processed_df['sentiment'] != 0)]
    return processed_df
Example #28
def clean_df(user_df):
    user_df['simple_text'] = user_df['text'].apply(lambda x: remove_links(x))

    #A warning will appear if Twython is not installed
    sid = SentimentIntensityAnalyzer()
    user_df['sentiment'] = user_df['text'].apply(lambda x: sid.polarity_scores(x))

    # keep only tweets that are not strongly negative (VADER 'neg' score below 0.45)
    pos_user_tweets = user_df[user_df['sentiment'].apply(lambda x: x['neg']) < .45]

    pos_user_tweets['stemmed_text'] = pos_user_tweets['simple_text'].apply(lambda tweet: remove_punctuation(tweet))

    pos_user_tweets['stripped_text'] = pos_user_tweets['stemmed_text'].apply(lambda tweet: remove_irrelevant_terms(tweet))
    return pos_user_tweets
Example #29
File: api.py  Project: jakereps/xfoliate
def run():
    nltk_dl_dir = nltk.downloader.Downloader()._download_dir
    vader_path = os.path.join(nltk_dl_dir, 'sentiment', 'vader_lexicon.zip')
    if not os.path.exists(vader_path):
        nltk.download('vader_lexicon')

    sia = SentimentIntensityAnalyzer()
    fb_at = click.prompt(__initial_message, type=str, prompt_suffix='>> ')

    for post in FacebookHandler(fb_at).get_posts():
        try:
            ps = sia.polarity_scores(post['message'])
            click.echo('%s:\n%s\n' % (post['message'], ps))
        except KeyError:
            pass
Example #30
def sentiment_analysis():
    sid = SentimentIntensityAnalyzer()

    for country in countries:
        for query in queries:
            for evaluator in evaluators[country]:
                f = open(country + "/" + query + "/suggestions_" + query + "_" + evaluator + ".csv", 'r')
                g = open(country + "/" + query + "/suggestions_" + query + "_" + evaluator + "_sentiments.csv", 'w')

                for record in f:
                    sentence = record.split(",")[1]
                    ss = sid.polarity_scores(sentence)
                    record = record.split("\n")[0]
                    record += ", " + str(ss) + "\n"
                    g.write(record)
Example #31
def sentimentText(chatname):
    #Text group sentiment
    text = getListChat(chatname)
    sia = SentimentIntensityAnalyzer()
    #Needs to be a string of phrases
    return sia.polarity_scores(text)
Example #32
class StdOutListener(StreamListener):
    def __init__(self, time_interval=60, term='trump'):
        self.sid = SentimentIntensityAnalyzer()
        self.anomalyDetector = probabilisticEWMA(term)
        self.interval = time_interval
        self.tweetBuffer = deque(maxlen=50)
        self.negCount = 0
        self.siesta = 0
        self.nightnight = 0
        self.trend = 0
        self.start = time.time()

    def on_data(self, data):

        if (time.time() - self.start) < self.interval:
            try:
                tweet = json.loads(data)['text']
                processedTweet = processTweet(tweet)
                sentiment = self.sid.polarity_scores(
                    processedTweet)['compound']

                if sentiment < 0.0:
                    self.negCount += 1
            except:
                pass

        else:
            # Number of negative mentions per elapsed time (in seconds)
            frequency = self.negCount / (time.time() - self.start)

            # Reseting configurations
            self.negCount = 0
            self.start = time.time()

            self.tweetBuffer.append(frequency)
            anom = self.anomalyDetector.predict(self.tweetBuffer)

            status = 0  # 0 for normal, 1 for positive anomaly, -1 for negative anomaly

            # Check if the last added data point produced an anomaly
            if anom:
                if anom[-1] == len(self.tweetBuffer) - 2:
                    if self.tweetBuffer[-1] > self.tweetBuffer[-2]:
                        self.trend = 1
                        status = 1
                        #beforeSpike = self.tweetBuffer[-2]
                        #print '[' + str(datetime.now()) + '] Alert: anomaly detected'
                    else:
                        self.trend = 0
                        status = -1
            #else:
            #    if frequency < beforeSpike:
            #        self.trend = 0

            jsonData = {
                "status": status,
                "trend": self.trend,
                "frequency": frequency,
                "timestamp": datetime.utcnow().isoformat()
            }
            postData(jsonData)

            print "Frequency: ", frequency, " | Trend: ", self.trend, " | Status: ", status

        return True

    def on_error(self, status_code):
        print 'Error:', str(status_code)

        if status_code == 420:
            sleepy = 60 * math.pow(2, self.siesta)
            print "A reconnection attempt will occur in " + \
            str(sleepy/60) + " minutes."
            time.sleep(sleepy)
            self.siesta += 1
        else:
            sleepy = 5 * math.pow(2, self.nightnight)
            print "A reconnection attempt will occur in " + \
            str(sleepy) + " seconds."
            time.sleep(sleepy)
            self.nightnight += 1
        return True
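A hedged sketch of how this listener might be attached to the Twitter streaming API with tweepy 3.x; the credential placeholders and the tracked term are illustrative, not taken from the original code:

import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_SECRET")
listener = StdOutListener(time_interval=60, term='trump')
stream = tweepy.Stream(auth, listener)
stream.filter(track=['trump'])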
Example #33
    cursor.execute(
        """
        INSERT INTO results (stock, mentions)
        VALUES (%s, %s)
    """, (i, symbols[i]))

    cnx.commit()

    # print(f"{i}: {symbols[i]}")

    # times.append(symbols[i])
    # top.append(f"{i}: {symbols[i]}")

# Applying Sentiment Analysis
scores, s = {}, {}
vader = SentimentIntensityAnalyzer()
# adding custom words from data.py
vader.lexicon.update(new_words)

picks_sentiment = list(symbols.keys())[0:picks_ayz]
for symbol in picks_sentiment:
    stock_comments = a_comments[symbol]
    for cmnt in stock_comments:
        score = vader.polarity_scores(cmnt)
        if symbol in s:
            s[symbol][cmnt] = score
        else:
            s[symbol] = {cmnt: score}
        if symbol in scores:
            for key, _ in score.items():
                scores[symbol][key] += score[key]
Example #34
positiveSent = 0
negativeSent = 0
neutralSent = 0
polarity = 0
tweets = []
positive_tweets = []
negative_tweets = []
neutral_tweets = []

begin_time = datetime.datetime.now()

for tweet in vaccine_tweets:

    tweets.append(tweet)
    analysis = TextBlob(tweet)
    score = SentimentIntensityAnalyzer().polarity_scores(tweet)
    neg = score['neg']
    neu = score['neu']
    pos = score['pos']
    comp = score['compound']
    polarity += analysis.sentiment.polarity

    if neg > pos:
        negative_tweets.append(tweet)
        negativeSent += 1
    elif pos > neg:
        positive_tweets.append(tweet)
        positiveSent += 1

    elif pos == neg:
        neutral_tweets.append(tweet)
Example #35
def scrap_news():

    parsed_news = []
    from datetime import date
    today = date.today()

    # NLTK VADER for sentiment analysis
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    # New words and values
    new_words = {
        'crushes': 10,
        'beats': 5,
        'misses': -5,
        'trouble': -10,
        'falls': -100,
    }
    # Instantiate the sentiment intensity analyzer with the existing lexicon
    vader = SentimentIntensityAnalyzer()
    # Update the lexicon
    vader.lexicon.update(new_words)

    # Use these column names
    columns = ['ticker', 'date', 'time', 'headline']

    for url in url_list:

        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, 'html.parser')

        news = soup.find('table', {'id': 'news-table'})
        tr = news.findAll('tr')

        for x in tr:

            text = x.get_text()

            date_scrape = x.td.text.split()
            headline = x.a.text

            if len(date_scrape) == 1:
                time = date_scrape[0]
            else:
                date = date_scrape[0]
                time = date_scrape[1]

            # Extract the ticker from the URL query string
            ticker = url.split('=')[1]
            # Append ticker, date, time and headline as a list to the 'parsed_news' list
            parsed_news.append([ticker, date, time, headline])

        # print(parsed_news[:10])

        # Convert the list of lists into a DataFrame
        scored_news = pd.DataFrame(parsed_news, columns=columns)

        # Iterate through the headlines and get the polarity scores
        scores = scored_news['headline'].apply(vader.polarity_scores)

        # Convert the list of dicts into a DataFrame
        scores_df = pd.DataFrame.from_records(scores)

        # Join the DataFrames
        scored_news = scored_news.join(scores_df)

        # Convert the date column from string to datetime
        scored_news['date'] = pd.to_datetime(scored_news.date).dt.date
        # print(scored_news.tail())
    # DF TO EXCEL
    todays_news = scored_news[scored_news['date'] == today]
    writer = ExcelWriter('sentiment_analysis.xlsx')
    todays_news.to_excel(writer, 'sentiment')
    writer.save()
Example #36
selectDF_quotes_hdfs = selectDF_quotes.select(
    simulate_time("latestUpdate").cast("Timestamp").alias("Datetime"),
    simulate_price("latestPrice").alias("latestPrice"), "symbol")
selectDF_quotes_es = selectDF_quotes.select(
    get_id_now("latestUpdate").alias("depotid"),
    simulate_time("latestUpdate").cast("String").alias("date"),
    simulate_price("latestPrice").cast("float").alias("latestPrice"), "symbol")

# news
selectDF_news = parsedDF_news \
        .select(F.explode(F.array("news_data")))\
        .select("col.*") \
        .dropna()

##################### sentiment analysis of news #############################
sia = SentimentIntensityAnalyzer()
#update lexicon
with open('lexicon_data/final_lex.json', 'r') as fp:
    final_lex = json.load(fp)

final_lex.update(sia.lexicon)
sia.lexicon = final_lex


def sentiment_analysis(txt1, txt2):
    text = txt1 + ' ' + txt2
    return sia.polarity_scores(text)['compound']


sentiment_analysis_udf = F.udf(sentiment_analysis, FloatType())
##############################################################################
Example #37

def get_features(track_id):
    features_results = sp.audio_features([track_id])
    json_features = json.dumps(features_results)
    features_data = json.loads(json_features)

    # Convert features dictionary to a list
    features_list = list(features_data[0].values())

    return features_list


client_credentials_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
sentiment_analyzer = SIA()

# IDs of monthly playlists from November 2016 to November 2017
playlist_ids = [
    "07zqCIPCroFMKSajvERGvE", "30PgYnoeT2PAgFNuYLR5qd",
    "1vS1nakUrLYkTd3W0yRMYe", "3scPGVlAn7d74uXRtFnmUC",
    "5LzBRPVAPYUssZ4ZESnRmH", "6hDHXewz8qBTezvONSqzyl",
    "00riJCYiVJ1yptAXtv2h6k", "0HxFI5dOlKztf38T9sa0cF",
    "7EFWm7Mjy6GLJHOEgKEblM", "6YAG0Li1BoUkmhc8iycY6l",
    "7Iw0yI71QX59zyFq0kAZTS", "69XTCqVzbSWPMLucSvzlLl",
    "7pRnKuQMkmntEj7Nnj94r0"
]

# Audio features
feature_names = [
    "danceability", "energy", "key", "loudness", "mode", "speechiness",
Example #38
def sa_tweets(tweets_df, image_path, image_title):
    """
  apply a sentiment classifier to a dataframe of tweets

  Given a dataframe including a 'text' field, it runs a sentiment classifier 
  and it returns an augmented version of the original dataframe that includes
  the results of the analysis in the fields 'neg', 'neu', 'pos', and 'compound'.
  It also generates a bar graph of the results.

  Parameters
  ----------
  tweets_df:  DataFrame
              It must include a 'text' field.
  
  image_path: str
              filepath for the bar graph
              
  image_title:str
              image caption

  Returns
  -------
    DataFrame
    a tweets dataframe including the original fields and the new fields
    'neg', 'neu', 'pos', and 'compound' that contain the results of the
    analysis.
  """
    tweets_df['cleaned_text'] = tweets_df['text'].apply(preprocess_tweet_text)

    #show the dataframe
    #tweets_df.head
    sid = SentimentIntensityAnalyzer()

    tweets_df['neg'] = 0.0
    tweets_df['neu'] = 0.0
    tweets_df['pos'] = 0.0
    tweets_df['compound'] = 0.0

    for i in range(len(tweets_df)):
        aux_list = dict_converter(
            sid.polarity_scores(tweets_df.loc[i, 'cleaned_text']))
        tweets_df.loc[i, 'neg'] = aux_list[0][1]
        tweets_df.loc[i, 'neu'] = aux_list[1][1]
        tweets_df.loc[i, 'pos'] = aux_list[2][1]
        tweets_df.loc[i, 'compound'] = aux_list[3][1]

    print(tweets_df[['cleaned_text', 'compound']])

    thres = 0.25
    num_pos = len(tweets_df[tweets_df.compound >= thres])
    num_neg = len(tweets_df[tweets_df.compound <= -thres])
    num_neu = len(tweets_df[(tweets_df.compound < thres)
                            & (tweets_df.compound > -thres)])

    num_tweets = len(tweets_df.index)
    per_pos = (num_pos / num_tweets) * 100
    per_neg = (num_neg / num_tweets) * 100
    per_neu = (num_neu / num_tweets) * 100

    print("positive tweets: " + str(num_pos))
    print("negative tweets: " + str(num_neg))
    print("neutral tweets: " + str(num_neu))

    classification = ['Positive', 'Negative', 'Hard to classify/Neutral']
    scores = [num_pos, num_neg, num_neu]
    percentages = [per_pos, per_neg, per_neu]

    y_pos = np.arange(len(classification))
    #plt.bar(y_pos,scores, align='center',  color=['#EE442F', '#00ff00', '#0000ff'])
    plt.bar(y_pos,
            percentages,
            align='center',
            color=['#ABC3C9', '#E0DCD3', '#CCBE9F'])
    plt.ylabel("Percentage")
    plt.xticks(y_pos, classification)
    plt.title(image_title + '(' + str(num_tweets) + ' tweets)')

    plt.savefig(image_path + '.png')
    plt.close('all')
    #plt.show()

    return tweets_df
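The `dict_converter` helper used inside `sa_tweets` is not included in this snippet. Given how its result is indexed (positions 0-3 map to 'neg', 'neu', 'pos', 'compound', the insertion order of VADER's score dict), a minimal sketch could be:

def dict_converter(score_dict):
    # Hypothetical helper: turn VADER's score dict into a list of (key, value)
    # pairs, preserving the 'neg', 'neu', 'pos', 'compound' order.
    return list(score_dict.items())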
Example #39
    def __init__(self, master=None):
        """This class creates a window with two frames."""
        Frame.__init__(self, master)
        self.grid()
        self.master.title("Reddit conversation sentiment.")

        # Configuring the window with rows and columns.
        for r in range(6):
            self.master.rowconfigure(r, weight=1)
        for c in range(6):
            self.master.columnconfigure(c, weight=1)

        # Creates two frames
        self.frame1 = Frame(master, height=100, width=300, bg="white")
        self.frame1.grid(row=0,
                         column=0,
                         rowspan=6,
                         columnspan=3,
                         sticky=W + E + N + S)
        self.frame2 = Frame(master, height=100, width=150)
        self.frame2.grid(row=0,
                         column=3,
                         rowspan=6,
                         columnspan=3,
                         sticky=W + E + N + S)

        # Creates labels that explain the different filters
        Label(self.frame2,
              text="Min. number of participants:").grid(column=0,
                                                        row=0,
                                                        sticky='w')
        Label(self.frame2,
              text="Max. number of participants:").grid(column=0,
                                                        row=1,
                                                        sticky='w')
        Label(self.frame2,
              text="Min. length of conversation:").grid(column=0,
                                                        row=2,
                                                        sticky='w')
        Label(self.frame2,
              text="Max. length of conversation:").grid(column=0,
                                                        row=3,
                                                        sticky='w')
        Label(self.frame2,
              text="Min. response time in minutes:").grid(column=0,
                                                          row=4,
                                                          sticky='w')
        Label(self.frame2,
              text="Max. response time in minutes:").grid(column=0,
                                                          row=5,
                                                          sticky='w')
        Label(self.frame2, text="Sentiment:").grid(column=0, row=6, sticky='w')

        # Create update button
        Button(self.frame2, text="Update", command=self.start).grid(column=1,
                                                                    row=12)

        # Create filter buttons
        self.min_participants = StringVar(self.frame2)
        self.min_participants.set("2")
        self.opt = OptionMenu(self.frame2, self.min_participants, "2", "3",
                              "4", "5", "6", "7", "8", "9",
                              "10").grid(column=1, row=0)

        self.max_participants = StringVar(self.frame2)
        self.max_participants.set("10")
        self.opt2 = OptionMenu(self.frame2, self.max_participants, "2", "3",
                               "4", "5", "6", "7", "8", "9",
                               "10").grid(column=1, row=1)

        # Create entry for min comments
        self.min_comments = Entry(self.frame2, width=6)
        self.min_comments.insert(0, "2")
        self.min_comments.grid(column=1, row=2)

        # Create entry for max comments
        self.max_comments = Entry(self.frame2, width=6)
        self.max_comments.insert(10, "10")
        self.max_comments.grid(column=1, row=3)

        # Creates spinbox for min time
        self.time_value = StringVar(self.frame2)
        self.time_value.set("0")
        self.min_time = Spinbox(self.frame2,
                                from_=0,
                                to=2880,
                                increment=1,
                                textvariable=self.time_value,
                                width=4).grid(column=1, row=4)

        # Creates spinbox for max time
        self.time_value2 = IntVar(self.frame2)
        self.time_value2.set("2880")
        self.max_time = Spinbox(self.frame2,
                                from_=0,
                                to=2880,
                                increment=1,
                                textvariable=self.time_value2,
                                width=4).grid(column=1, row=5)

        # Creates an option menu where you can pick the sentiment you want
        self.sentiment = StringVar(self.frame2)
        self.sentiment.set("Neutral")
        self.opt3 = OptionMenu(self.frame2, self.sentiment, "Neutral",
                               "Positive", "Negative").grid(column=1, row=6)

        # Create a menu instance
        menu = Menu(self.master)
        self.master.config(menu=menu)

        # Create the file objects
        menu_file = Menu(menu)
        menu.add_cascade(menu=menu_file, label='File')
        menu_file.add_command(label='Open', command=self.load)
        menu_file.add_command(label='Exit', command=self.master.quit)

        menu_edit = Menu(menu)
        menu.add_cascade(menu=menu_edit, label='Search')
        menu_edit.add_command(label='Search username',
                              command=self.search_username)

        menu_3 = Menu(menu)
        menu.add_cascade(menu=menu_3, label='Credentials')
        menu_3.add_command(label='Reset', command=self.reset)
        menu_3.add_command(label='Change', command=self.newcredentials)

        # Adds a separator between menu files.
        menu_file.add_separator()

        # Creates editable text in the window with a scrollbar.
        self.S = Scrollbar(self.frame1)
        self.T = Text(self.frame1)
        self.S.pack(side=RIGHT, fill=Y)
        self.T.pack(side=LEFT, fill=Y)
        self.S.config(command=self.T.yview)
        self.T.config(yscrollcommand=self.S.set)

        # Logging in on Reddit.
        self.username = "******"
        self.password = "******"
        self.client_id = "CLIENT_ID"
        self.client_secret = "CLIENT_SECRET"
        self.bot_login()
        self.file = None
        self.sid = SentimentIntensityAnalyzer()
Example #40
from speech_helpers import convert_to_wav, show_pydub_stats, transcribe_audio
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
Example #41
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('vader_lexicon')

nlp = spacy.load("en_core_web_md")
sid = SentimentIntensityAnalyzer()

text_with_enteties = ""
text_with_pos = ""

with open("../tasks/02-structural-linguistics/data/examiner-headlines.txt"
          ) as file:
    data = file.readlines()
statistic = {'entity': 0, 'more lvl': 0, 'most lvl': 0, 'polarity': 0}
for line in tqdm(data):
    have_entenies = False
    have_more_lvl = False
    have_most_lvl = False
    have_polarity = False
    doc = nlp(line)
    for token in doc:
Example #42
def initial_sentiment_tools():
    rs = textacy.resources.DepecheMood(lang="en", word_rep="lemmapos")
    sid = SentimentIntensityAnalyzer()
    return rs, sid
Example #43
data_df['sentiment_lemmatized'] = data_df['tweet_lemmatized'].apply(
    lambda x: TextBlob(x).sentiment)
data_df[['sentiment_lemmatized', 'tweet_lemmatized']].head(10)

all_words = ' '.join([text for text in data_df['tweet_lemmatized']])
wordcloud = WordCloud(width=800,
                      height=500,
                      random_state=21,
                      max_font_size=110).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title("Most Common words in column Tweet Lemmatized")
plt.show()

sid = SentimentIntensityAnalyzer()
data_df['sentiment_lemmatized'] = data_df['tweet_lemmatized'].apply(
    lambda x: sid.polarity_scores(x))
data_df['sentiment_lemmatized'].head(10)


def convert(x):
    if x <= -0.05:
        return 0
    elif x < 0.05:
        return 1
    else:
        return 2


data_df['label_lemmatized'] = data_df['sentiment_lemmatized'].apply(
Example #44
def main():
    print("scraping...")

    f = open("news.txt", "r")
    apiKey = f.readline().strip('\n')

    # https://newsapi.org/docs/endpoints/sources
    sources = []
    google_us_news_index_URL = "https://newsapi.org/v2/sources?category=business&language=en&country=us&apiKey={}".format(
        apiKey)
    google_response = requests.get(google_us_news_index_URL)
    if (google_response.status_code != 200):
        print(google_response)
        return
    google_json = json.loads(google_response.content)
    for source in range(len(google_json['sources'])):
        sources.append(google_json['sources'][source]['id'])

    print("# of sources: ", len(sources))
    source_string = ','.join(sources)
    print(source_string)

    # overwriting sources so I can test with only one source
    # source_string = "bloomberg"

    query_date_from = date.today().replace(month=date.today().month - 1)
    # print(str(query_date))
    # query_date_to = "&" + date.today().replace(month=date.today().month-1)

    # to query for specific words and such
    query = "q=AAPL&"

    # initialize sentiment analyzer
    sid = SentimentIntensityAnalyzer()

    # https://newsapi.org/docs/endpoints/everything
    for page in range(1, 6):
        google_news_URL = "http://newsapi.org/v2/everything?{}sources={}&from={}&language=en&page={}&apiKey={}".format(
            query, source_string, query_date_from, page, apiKey)
        print(google_news_URL + '\n')
        google_response = requests.get(google_news_URL)
        if (google_response.status_code != 200):
            print(google_response)
            print(google_response.text)
            return
        # only get a certain number of results, need to use page=___ in request to get other results
        # google_content = open("google_reponse.json", "w")
        google_json = json.loads(google_response.content)
        # json.dump(google_json, google_content)
        # google_content.close()
        for article in range(len(google_json['articles'])):
            print(google_json['articles'][article]['source']['id'])
            print(google_json['articles'][article]['publishedAt'])
            print(google_json['articles'][article]['title'])
            # print(google_json['articles'][article]['description'])

            # get vader scores (sentiment analysis) for article titles
            scores = sid.polarity_scores(
                google_json['articles'][article]['title'])
            for k in sorted(scores):
                print('{0}: {1}, '.format(k, scores[k]), end='')
                print()
            print("-------------------")

    # Database schema
    # companies : id , name , symbol
    # articles : id , source_id , author , date , title , description , compound_vader , neg_vader , neu_vader , pos_vader

    # if we also store sources, we could get the average of the vader scores they give for different companies (source x vader x company)
    # would need to be stored in a different table
    # sources table and sources_company_vader table

    # start of cnn request, couldn't get api key
    '''
Example #45
def doAll(trainFileName, testFileName):
    trainSet = makeListEntries(trainFileName)
    testSet = makeListEntries(testFileName)
    """**************************************"""
    # data
    listTrainText = makeListText(trainSet)
    listTestText = makeListText(testSet)

    # target
    listTrainStars = makeListStars(trainSet)
    listTestStars = makeListStars(testSet)
    """*************************************"""
    # could do CountVectorizer
    cv = CountVectorizer(stop_words='english')

    trainCVMatr = cv.fit_transform(listTrainText)
    testCVMatr = cv.transform(listTestText)

    # could do TfidfVectorizer
    # tv = TfidfVectorizer(stop_words = 'english')

    # trainTVMatr = cv.fit_transform(listTrainText)
    # testTVMatr = cv.transform(listTestText)
    """*************************************"""
    # using CountVectorizer
    LR_CV_model = LogisticRegression(multi_class='multinomial',
                                     max_iter=1000,
                                     class_weight='balanced')
    LR_CV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    LR_CV_prediction = LR_CV_model.predict(testCVMatr)

    # get accuracy score
    LR_CV_score = metrics.accuracy_score(listTestStars, LR_CV_prediction)
    LR_CV_f1 = metrics.f1_score(listTestStars,
                                LR_CV_prediction,
                                average='micro')
    LR_CV_r2 = metrics.r2_score(listTestStars,
                                LR_CV_prediction,
                                multioutput='variance_weighted')
    LR_my = betterScoring(listTestStars, LR_CV_prediction)
    # this is the bit with the tfidf vectorizer
    # LR_TV_model = LogisticRegression(multi_class = 'multinomial', max_iter=1000)
    # LR_TV_model.fit(trainTVMatr, listTrainStars)

    # get it to predict
    # LR_TV_prediction = LR_TV_model.predict(testTVMatr)

    # get accuracy score
    # LR_TV_score = metrics.accuracy_score(listTestStars, LR_TV_prediction)

    # what do the data say?
    #print("Multiclass, logistic regression, CountVectorizer: " + str(LR_CV_score))
    #print("Multiclass, logistic regression, TfidfVectorizer: " + str(LR_TV_score))
    """*************************************"""
    # using CountVectorizer
    NB_CV_model = MultinomialNB()
    NB_CV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    NB_CV_prediction = NB_CV_model.predict(testCVMatr)

    # get accuracy score
    NB_CV_score = metrics.accuracy_score(listTestStars, NB_CV_prediction)
    NB_CV_f1 = metrics.f1_score(listTestStars,
                                NB_CV_prediction,
                                average='micro')
    NB_CV_r2 = metrics.r2_score(listTestStars,
                                NB_CV_prediction,
                                multioutput='variance_weighted')
    NB_my = betterScoring(listTestStars, NB_CV_prediction)
    # this is the bit with the tfidf vectorizer
    # NB_TV_model = MultinomialNB()
    # NB_TV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    # NB_TV_prediction = NB_TV_model.predict(testTVMatr)

    # get accuracy score
    # NB_TV_score = metrics.accuracy_score(listTestStars, NB_TV_prediction)

    # what do the data say?
    #print("Naive Bayes, CountVectorizer: " + str(NB_CV_score))
    # print("Naive Bayes, TfidfVectorizer: " + str(NB_TV_score))
    """*************************************"""
    sid = SentimentIntensityAnalyzer()
    listOfRes = []

    data2 = [json.loads(line) for line in open(testFileName, 'r')]

    for entry in data2:
        listOfRes.append(sid.polarity_scores(entry['review_body'])['compound'])

    scaledRes = []
    size = len(listOfRes)
    for i in range(size):
        num = listOfRes[i]
        score = -1
        if num >= q0 and num < q1:
            score = 1
        elif num >= q1 and num < q2:
            score = 2
        elif num >= q2 and num < q3:
            score = 3
        elif num >= q3 and num < q4:
            score = 4
        elif num >= q4 and num <= q5:
            score = 5

        # add score back in
        scaledRes.append(score)

    vader_acc = metrics.accuracy_score(listTestStars, scaledRes)
    vader_f1 = metrics.f1_score(listTestStars, scaledRes, average='micro')
    vader_r2 = metrics.r2_score(listTestStars,
                                scaledRes,
                                multioutput='variance_weighted')
    vader_my = betterScoring(listTestStars, scaledRes)
    """*************************************"""
    # dealing with the ordinal regression
    ord_model = OrdinalClassifier(DecisionTreeClassifier())
    ord_model.fit(trainCVMatr, listTrainStars)
    ord_model_prediction = ord_model.predict(testCVMatr)

    size = len(listTestStars)
    for i in range(size):
        if (ord_model_prediction[i] < 1):
            ord_model_prediction[i] = 1

    ord_acc = metrics.accuracy_score(listTestStars, ord_model_prediction)
    ord_f1 = metrics.f1_score(listTestStars,
                              ord_model_prediction,
                              average='micro')
    ord_r2 = metrics.r2_score(listTestStars,
                              ord_model_prediction,
                              multioutput='variance_weighted')
    ord_my = betterScoring(listTestStars, ord_model_prediction)
    """*************************************"""
    # trying mord

    arr = np.asarray(listTrainStars)
    clf2 = mord.LogisticAT(alpha=1.)
    clf2.fit(trainCVMatr, arr)
    clf2_prediction = clf2.predict(testCVMatr)

    LAT_acc = metrics.accuracy_score(listTestStars, clf2_prediction)
    LAT_f1 = metrics.f1_score(listTestStars, clf2_prediction, average='micro')
    LAT_r2 = metrics.r2_score(listTestStars,
                              clf2_prediction,
                              multioutput='variance_weighted')
    LAT_my = betterScoring(listTestStars, clf2_prediction)
    #print('AccuracyScore of LogisticAT %s' %
    #metrics.accuracy_score(listTestStars, clf2.predict(testCVMatr)))

    clf3 = mord.LogisticIT(alpha=1.)
    clf3.fit(trainCVMatr, arr)
    clf3_prediction = clf3.predict(testCVMatr)

    LIT_acc = metrics.accuracy_score(listTestStars, clf3_prediction)
    LIT_f1 = metrics.f1_score(listTestStars, clf3_prediction, average='micro')
    LIT_r2 = metrics.r2_score(listTestStars,
                              clf3_prediction,
                              multioutput='variance_weighted')
    LIT_my = betterScoring(listTestStars, clf3_prediction)
    #print('AccuracyScore of LogisticIT %s' %
    #metrics.accuracy_score(listTestStars, clf3.predict(testCVMatr)))

    clf4 = mord.LogisticSE(alpha=1.)
    clf4.fit(trainCVMatr, arr)
    clf4_prediction = clf4.predict(testCVMatr)

    LSE_acc = metrics.accuracy_score(listTestStars, clf4_prediction)
    LSE_f1 = metrics.f1_score(listTestStars, clf4_prediction, average='micro')
    LSE_r2 = metrics.r2_score(listTestStars,
                              clf4_prediction,
                              multioutput='variance_weighted')
    LSE_my = betterScoring(listTestStars, clf4_prediction)
    #print('AccuracyScore of LogisticSE %s' %
    #metrics.accuracy_score(listTestStars, clf4.predict(testCVMatr)))
    """*************************************"""

    # return value
    categoryName = trainFileName.replace("dataset/prodAnalysis/train_", "")
    categoryName = categoryName.replace(".json", "")
    return [
        categoryName,
        LR_CV_score,
        LR_CV_f1,
        LR_CV_r2,
        LR_my,
        NB_CV_score,
        NB_CV_f1,
        NB_CV_r2,
        NB_my,
        vader_acc,
        vader_f1,
        vader_r2,
        vader_my,
        ord_acc,
        ord_f1,
        ord_r2,
        ord_my,
        LAT_acc,
        LAT_f1,
        LAT_r2,
        LAT_my,
        LIT_acc,
        LIT_f1,
        LIT_r2,
        LIT_my,
        LSE_acc,
        LSE_f1,
        LSE_r2,
        LSE_my,
    ]
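# The OrdinalClassifier wrapper used above is not part of scikit-learn; it is
# defined elsewhere in this script and is not shown here. As a rough, assumed
# illustration of the general technique (not the author's actual code), the
# Frank-and-Hall reduction fits one binary model per class boundary, P(y > k),
# and recovers per-class probabilities from differences of those estimates:
import numpy as np
from sklearn.base import clone


class FrankHallOrdinalClassifier:
    """Minimal ordinal classifier built from K-1 binary sub-models (sketch)."""

    def __init__(self, base_estimator):
        self.base_estimator = base_estimator

    def fit(self, X, y):
        y = np.asarray(y)
        self.classes_ = np.sort(np.unique(y))
        self.models_ = {}
        for k in self.classes_[:-1]:
            clf = clone(self.base_estimator)
            clf.fit(X, (y > k).astype(int))  # binary target: "label above k?"
            self.models_[k] = clf
        return self

    def predict(self, X):
        # P(y > k) for every boundary k, stacked as columns.
        gt = np.column_stack([self.models_[k].predict_proba(X)[:, 1]
                              for k in self.classes_[:-1]])
        first = 1.0 - gt[:, 0]            # P(y == lowest class)
        middle = gt[:, :-1] - gt[:, 1:]   # P(y == each intermediate class)
        last = gt[:, -1]                  # P(y == highest class)
        probs = np.column_stack([first, middle, last])
        return self.classes_[np.argmax(probs, axis=1)]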
Example #46
0
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import nltk
from matplotlib import pyplot as plt
import datetime
from datetime import date
import time
import seaborn as sns
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

nltk.download('vader_lexicon')
sia = SIA()
#===============================================================================
#dataset
#===============================================================================
response = pd.read_html(
    "https://finance.yahoo.com/quote/GOOG/history?period1=1555525800&period2=1571337000&interval=1d&filter=history&frequency=1d"
)
df = pd.DataFrame()
for data in response:
    df = data
df = df[:-1]
df = df.reindex(index=df.index[::-1])
datetime.datetime.strptime(df['Date'][0], '%b %d, %Y')
df.loc[:, 'Date'] = pd.to_datetime(df['Date'], format='%b %d, %Y')
df.columns = [str(x).lower().replace(' ', '_') for x in df.columns]
df.sort_values(by='date', inplace=True, ascending=True)
#How to perform sentiment analysis in python?
import nltk
nltk.download('vader_lexicon')
nltk.download('punkt')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
message_text = '''kiddy is a good boy'''
print(message_text)
scores = sid.polarity_scores(message_text)
for key in sorted(scores):
    print('{0}: {1}, '.format(key, scores[key]), end='')

#sample outputs for a few different inputs:

#"good"
#compound: 0.4404, neg: 0.0, neu: 0.0, pos: 1.0

#"bad"
#compound: -0.5423, neg: 1.0, neu: 0.0, pos: 0.0

#"pranay is a bad boy. he is a psycho. i hate him. i want to kill him."
#compound: -0.9118, neg: 0.536, neu: 0.405, pos: 0.059
Example #48
0
def get_dressed(data, kb):

    result = []

    translator = str.maketrans('', '', string.punctuation)
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'golden', 'globe', 'globes']
    stop = remove_terms
    include_terms = ['dress', 'fashion', 'red', 'carpet', 'haute couture', 'gown', 'design', 'look']

    for tweet in data:

        tweet = re.sub("\d+", "", tweet)       #strip nums
        tweet = re.sub(r'http\S+', '', tweet)  #strip urls
        tweet = re.sub(r'#\S+', '', tweet)     #strip hashtags
        tweet = tweet.translate(translator)    #strip non-alphanumeric characters

        for i in stop:
            for j in tweet:
                if i.lower() in j.lower():
                    tweet.remove(j)

        if any(term in tweet for term in include_terms):
            result.append(tweet)


    # extract people and put in dictionary with compound scores
    sentiment_analyzer = SentimentIntensityAnalyzer()
    score_dict = {}
    best_dressed_dict = {}
    worst_dressed_dict = {}

    for tweet in result:
        all_scores = sentiment_analyzer.polarity_scores(tweet)
        useful_score = all_scores['compound']

        if tweet:
            # Get all possible bigrams & trigrams in a tweet
            gram = list(nltk.everygrams(tweet.split(), 2, 3))

            # Filter through and append to list for tweet
            for g in gram:
                if len(g) == 2:
                    if bool(re.match(r'\b[A-Z][a-z]+\b', g[0])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[1])):
                        name = ' '.join(g).lower()
                        if useful_score > 0:
                            if name in best_dressed_dict:
                                best_dressed_dict[name] += useful_score
                            else:
                                best_dressed_dict[name] = useful_score
                        if useful_score < 0:
                            if name in worst_dressed_dict:
                                worst_dressed_dict[name] += useful_score
                            else:
                                worst_dressed_dict[name] = useful_score
                else:
                    if bool(re.match(r'\b[A-Z][a-z]+\b', g[0])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[1])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[2])):
                        name = ' '.join(g).lower()
                        if useful_score > 0:
                            if name in best_dressed_dict:
                                best_dressed_dict[name] += useful_score
                            else:
                                best_dressed_dict[name] = useful_score
                        if useful_score < 0:
                            if name in worst_dressed_dict:
                                worst_dressed_dict[name] += useful_score
                            else:
                                worst_dressed_dict[name] = useful_score


    # look in kb for matches
    final_dict = {}
    for best, worst in zip(best_dressed_dict, worst_dressed_dict):
        if best.lower() in kb:
            final_dict[best] = best_dressed_dict[best]
        if worst.lower() in kb:
            final_dict[worst] = worst_dressed_dict[worst]


    # get key with max value
    best_dressed = []
    worst_dressed = []

    while len(best_dressed) < 5 and final_dict:
        best_person = max(final_dict.items(), key=lambda k: k[1])[0]
        worst_person = min(final_dict.items(), key=lambda k: k[1])[0]
        best_dressed.append(best_person)
        worst_dressed.append(worst_person)
        del final_dict[best_person]
        if worst_person != best_person:
            del final_dict[worst_person]

    return best_dressed, worst_dressed
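# Hypothetical usage sketch: the tweets and the knowledge base below are made up
# for illustration; the real pipeline presumably loads both from the award-show
# data files before calling get_dressed.
if __name__ == '__main__':
    sample_tweets = [
        'Emma Stone looked absolutely beautiful in that red carpet gown #goldenglobes',
        'Not a fan of the dress John Doe wore tonight, what a terrible look',
    ]
    sample_kb = {'emma stone', 'john doe'}
    best, worst = get_dressed(sample_tweets, sample_kb)
    print('Best dressed:', best)
    print('Worst dressed:', worst)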
Example #49
0
import sys
import re
import matplotlib.pyplot as plt
import nltk
from utilities import cleanText
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sentiment_analyzer = SentimentIntensityAnalyzer()


def analyze(name):
    linesList = cleanText(name)
    neutral, negative, positive = 0, 0, 0

    for index, sentence in enumerate(linesList):
        # print("Processing {0}%".format(str((index * 100) / len(linesList))))

        if re.match(r'^[\w]', sentence) is None:
            continue

        scores = sentiment_analyzer.polarity_scores(sentence)
        scores.pop('compound', None)

        maxAttribute = max(scores, key=lambda k: scores[k])

        if maxAttribute == "neu":
            neutral += 1
        elif maxAttribute == "neg":
            negative += 1
        else:
            positive += 1
Example #50
0
#print(len(word_and_adjectives_hash_synset))
#print(len(lem_nouns))

new_most_common_nouns = updated_lem_nouns.most_common(20)
# print("------------------------------------------------------------------------")

for mcn in new_most_common_nouns:
    d = word_and_adjectives_hash_synset[mcn[0]]
    d = d.most_common(10)
    print(mcn[0], mcn[1])
    # for i in d:
    #     print ("             ",i)

#sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

#calculate average score for each feature
for mcn in new_most_common_nouns:
    d = word_and_adjectives_hash_synset[mcn[0]]
    d = d.most_common(100)
    average_score = 0
    total_adj = 1
    print(mcn[0])
    for i in d:
        cmp_score = sid.polarity_scores(i[0])["compound"]
        # Ignore a neutral compound score (0); count a negative score as 0 and a positive score as 1
        if cmp_score != 0:
            if cmp_score < 0:
                cmp_score = 0
            else:
                cmp_score = 1
            # accumulate the binarised scores so an average per feature can be reported
            average_score += cmp_score
            total_adj += 1
    print(average_score / total_adj)
Example #51
def get_vader_sentiment(textstr):
    # Instantiate the sentiment intensity analyzer
    vader = SentimentIntensityAnalyzer()

    return vader.polarity_scores(textstr)
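# Quick check on a made-up sentence; polarity_scores returns a dict with 'neg',
# 'neu', 'pos' and 'compound' keys. The import is repeated here only because the
# fragment above does not show it.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

print(get_vader_sentiment('This library is easy to use'))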
Example #52
0
File: analyzer.py Project: WimFlorijn/NLP
import os
import json
import shutil
import time

from collections import Counter
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.util import ngrams

from .dataset import TwitterDataSet
from .tokenizer import Tokenizer

default_sid = SentimentIntensityAnalyzer()

DEFAULT_CONFIG_DIR = "config"
DEFAULT_INPUT_DATA_DIR = "data"
DEFAULT_OUTPUT_DATA_DIR = "processed_data"
DEFAULT_RESULT_DIR = "results"
DEFAULT_EXCEPTION_LIST = ['HASH_TAG', 'URL', 'I', 'AMP']
MAX_N_GRAM = 4
TOP_N_VALUES = 5


class Analyzer:

    tokenizer = Tokenizer()

    def __init__(self,
                 config_dir=DEFAULT_CONFIG_DIR,
                 data_dir=DEFAULT_INPUT_DATA_DIR,
Example #53
0
def manageRequest():

    # some useful initialisation
    print('--->', 'TR' in request.form.values())
    theInputForm = InputTextForm()
    userText = "and not leave this empty!"
    typeText = "You should write something ..."
    language = "EN"
    language = request.form['lang']  # which language?

    # POST - retrieve all user submitted data
    if theInputForm.validate_on_submit():
        userText = theInputForm.inputText.data
        typeText = "Your own text"

    # DEBUG flash('read:  %s' % typeText)

    stemmingEnabled = request.form.get("stemming")
    stemmingType = TextAnalyser.NO_STEMMING

    if stemmingEnabled:
        if request.form.get("engine"):
            stemmingType = TextAnalyser.STEM
        else:
            stemmingType = TextAnalyser.LEMMA
    else:
        stemmingType = TextAnalyser.NO_STEMMING

    #flash('read:  %s' % stemmingEnabled)

    # Which kind of user action ?
    if 'TR' in request.form.values():
        print("888888888888888")

        blob = TextBlob(userText, analyzer=NaiveBayesAnalyzer())

        translate_to = language
        print("translate_response:--------->", translate_to)

        print(blob, translate_to)
        if (translate_to.lower() == 'french'):
            resp = str(blob.translate(to='fr'))
        elif (translate_to.lower() == 'telugu'):
            resp = str(blob.translate(to='te'))
        elif (translate_to.lower() == 'hindi'):
            resp = str(blob.translate(to='hi'))
        elif (translate_to.lower() == 'german'):
            resp = str(blob.translate(to='de'))
        elif (translate_to.lower() == 'greek'):
            resp = str(blob.translate(to='el'))
        elif (translate_to.lower() == 'spanish'):
            resp = str(blob.translate(to='es'))
        else:
            # unrecognised target language: fall back to the untranslated text
            resp = str(blob)

        return render_template('translation_results.html',
                               title='Translation',
                               translation=resp)

    elif 'BA' in request.form.values():
        # GO Text Analysis

        print("------------")
        myText = TextAnalyser(userText, language)  # new object

        myText.preprocessText(
            lowercase=theInputForm.ignoreCase.data,
            removeStopWords=theInputForm.ignoreStopWords.data,
            stemming=stemmingType)

        # display all user text if short otherwise the first fragment of it
        if len(userText) > 99:
            fragment = userText[:99] + " ..."
        else:
            fragment = userText

        # check that there is at least one unique token to avoid division by 0
        if myText.uniqueTokens() == 0:
            uniqueTokensText = 1
        else:
            uniqueTokensText = myText.uniqueTokens()

        # render the html page
        return render_template('results.html',
                               title='Text Analysis',
                               inputTypeText=typeText,
                               originalText=fragment,
                               numChars=myText.length(),
                               numSentences=myText.getSentences(),
                               numTokens=myText.getTokens(),
                               uniqueTokens=uniqueTokensText,
                               commonWords=myText.getMostCommonWords(10))
    else:
        print("11111111111111")
        sid = SentimentIntensityAnalyzer()
        scores = sid.polarity_scores(userText)
        if scores['compound'] > 0:
            sentiment = 'Positive'
        elif scores['compound'] < 0:
            sentiment = 'Negative'
        else:
            sentiment = 'Neutral'

        return render_template('sentiment.html',
                               title='Text Analysis',
                               sentiment_scores=scores,
                               sentiment=sentiment)
Example #54
0
                                                            full_df=features_df,
                                                            previous=True)
        print("Done process thread 2")
        features_df['following_staff_post_id'] = features_df.apply(process_thread,
                                                                   axis=1,
                                                                   full_df=features_df,
                                                                   filter_for_staff=True)
        print("Done process thread 3")
        features_df['previous_staff_post_id'] = features_df.apply(process_thread,
                                                                  axis=1,
                                                                  full_df=features_df,
                                                                  previous=True,
                                                                  filter_for_staff=True)
        print("Done process thread 4")

        # drop unneeded columns for model
        # 'root_post' and 'thread' are the same
        drop_features = ['body', 'root_post']
        features_df = features_df.drop(drop_features, axis=1)

        print('-- Writing data to {} -- '.format(features_location))
        features_df.to_csv(features_location, index=False)

    return features_df


if __name__ == "__main__":
    vader = SentimentIntensityAnalyzer()
    lexicon = Empath()
    main()
Example #55
def get_sentiments(text):
    analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(text)
Example #56
0
import json

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np

nltk.download('vader_lexicon')

pointMachine = SentimentIntensityAnalyzer()


def sentiment_analyzer_scores(x):
    score = pointMachine.polarity_scores(x)
    return score


def create_dataframe():
    my_dataframe = pd.read_json('data/results.json', orient='records')
    my_dataframe['sentiment'] = my_dataframe['tweet'].apply(
        lambda x: None if pd.isnull(x) else sentiment_analyzer_scores(x))
    my_dataframe['num'] = my_dataframe['sentiment'].apply(
        lambda score_dict: np.nan if score_dict is None else score_dict['compound'])
    my_dataframe['num_results'] = my_dataframe['num'].apply(
        lambda c: 'positive' if c > 0.05 else ('negative'
                                               if c < -0.05 else 'neutral'))
    process_json = my_dataframe.to_json(orient='index')
    parsed = json.loads(process_json)
    parsed = json.dumps(parsed)
    return parsed
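# Standalone check of the +/-0.05 compound cut-offs used above, on a made-up
# sentence, so the labelling rule can be seen without needing data/results.json.
example_text = 'The new release is genuinely impressive'
compound = sentiment_analyzer_scores(example_text)['compound']
label = ('positive' if compound > 0.05
         else 'negative' if compound < -0.05 else 'neutral')
print(example_text, '->', compound, label)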
Example #57
0
class Application(Frame):
    def __init__(self, master=None):
        """This class creates a window with two frames."""
        Frame.__init__(self, master)
        self.grid()
        self.master.title("Reddit conversation sentiment.")

        # Configuring the window with rows and columns.
        for r in range(6):
            self.master.rowconfigure(r, weight=1)
        for c in range(6):
            self.master.columnconfigure(c, weight=1)

        # Creates two frames
        self.frame1 = Frame(master, height=100, width=300, bg="white")
        self.frame1.grid(row=0,
                         column=0,
                         rowspan=6,
                         columnspan=3,
                         sticky=W + E + N + S)
        self.frame2 = Frame(master, height=100, width=150)
        self.frame2.grid(row=0,
                         column=3,
                         rowspan=6,
                         columnspan=3,
                         sticky=W + E + N + S)

        # Creates labels that explain the different filters
        Label(self.frame2,
              text="Min. number of participants:").grid(column=0,
                                                        row=0,
                                                        sticky='w')
        Label(self.frame2,
              text="Max. number of participants:").grid(column=0,
                                                        row=1,
                                                        sticky='w')
        Label(self.frame2,
              text="Min. length of conversation:").grid(column=0,
                                                        row=2,
                                                        sticky='w')
        Label(self.frame2,
              text="Max. length of conversation:").grid(column=0,
                                                        row=3,
                                                        sticky='w')
        Label(self.frame2,
              text="Min. response time in minutes:").grid(column=0,
                                                          row=4,
                                                          sticky='w')
        Label(self.frame2,
              text="Max. response time in minutes:").grid(column=0,
                                                          row=5,
                                                          sticky='w')
        Label(self.frame2, text="Sentiment:").grid(column=0, row=6, sticky='w')

        # Create update button
        Button(self.frame2, text="Update", command=self.start).grid(column=1,
                                                                    row=12)

        # Create filter buttons
        self.min_participants = StringVar(self.frame2)
        self.min_participants.set("2")
        self.opt = OptionMenu(self.frame2, self.min_participants, "2", "3",
                              "4", "5", "6", "7", "8", "9",
                              "10").grid(column=1, row=0)

        self.max_participants = StringVar(self.frame2)
        self.max_participants.set("10")
        self.opt2 = OptionMenu(self.frame2, self.max_participants, "2", "3",
                               "4", "5", "6", "7", "8", "9",
                               "10").grid(column=1, row=1)

        # Create entry for min comments
        self.min_comments = Entry(self.frame2, width=6)
        self.min_comments.insert(0, "2")
        self.min_comments.grid(column=1, row=2)

        # Create entry for max comments
        self.max_comments = Entry(self.frame2, width=6)
        self.max_comments.insert(10, "10")
        self.max_comments.grid(column=1, row=3)

        # Creates spinbox for min time
        self.time_value = StringVar(self.frame2)
        self.time_value.set("0")
        self.min_time = Spinbox(self.frame2,
                                from_=0,
                                to=2880,
                                increment=1,
                                textvariable=self.time_value,
                                width=4).grid(column=1, row=4)

        # Creates spinbox for max time
        self.time_value2 = IntVar(self.frame2)
        self.time_value2.set("2880")
        self.max_time = Spinbox(self.frame2,
                                from_=-0,
                                to=2880,
                                increment=1,
                                textvariable=self.time_value2,
                                width=4).grid(column=1, row=5)

        # Creates an option menu where you can pick the sentiment you want
        self.sentiment = StringVar(self.frame2)
        self.sentiment.set("Neutral")
        self.opt3 = OptionMenu(self.frame2, self.sentiment, "Neutral",
                               "Positive", "Negative").grid(column=1, row=6)

        # Create a menu instance
        menu = Menu(self.master)
        self.master.config(menu=menu)

        # Create the file objects
        menu_file = Menu(menu)
        menu.add_cascade(menu=menu_file, label='File')
        menu_file.add_command(label='Open', command=self.load)
        menu_file.add_command(label='Exit', command=self.master.quit)

        menu_edit = Menu(menu)
        menu.add_cascade(menu=menu_edit, label='Search')
        menu_edit.add_command(label='Search username',
                              command=self.search_username)

        menu_3 = Menu(menu)
        menu.add_cascade(menu=menu_3, label='Credentials')
        menu_3.add_command(label='Reset', command=self.reset)
        menu_3.add_command(label='Change', command=self.newcredentials)

        # Adds a separator between menu files.
        menu_file.add_separator()

        # Creates a text widget in the window, with a scrollbar, which can be edited.
        self.S = Scrollbar(self.frame1)
        self.T = Text(self.frame1)
        self.S.pack(side=RIGHT, fill=Y)
        self.T.pack(side=LEFT, fill=Y)
        self.S.config(command=self.T.yview)
        self.T.config(yscrollcommand=self.S.set)

        # Logging in on Reddit.
        self.username = "******"
        self.password = "******"
        self.client_id = "CLIENT_ID"
        self.client_secret = "CLIENT_SECRET"
        self.bot_login()
        self.file = None
        self.sid = SentimentIntensityAnalyzer()

    def newcredentials(self):
        """This function asks username, password, client id and clientsecret. It then writes the answers to credentials.txt."""
        self.username = simpledialog.askstring("Input",
                                               "What is the username?")
        self.password = simpledialog.askstring("Input",
                                               "What is the password?")
        self.client_id = simpledialog.askstring("Input",
                                                "What is the client id?")
        self.client_secret = simpledialog.askstring(
            "Input", "What is the client secret?")
        self.bot_login()

    def reset(self):
        """This function rewrites credentials.txt with the lines of the back-up.txt."""
        self.username = "******"
        self.password = "******"
        self.client_id = "_J4D2EctRUBJvg"
        self.client_secret = "Xx0CvhFNIHhA_d5AfmDhx8sEYOw"
        self.bot_login()

    def bot_login(self):
        """Reads every line from credentials.txt and logs in reddit praw."""
        self.r = praw.Reddit(username=self.username,
                             password=self.password,
                             client_id=self.client_id,
                             client_secret=self.client_secret,
                             user_agent="Human Interaction IK bot")

        try:
            self.r.user.me()
        except:
            self.T.delete(1.0, END)
            print(
                "Your credentials are invalid. Please fill in the correct credentials."
            )
            self.T.insert(
                END,
                "Your credentials are invalid. Please fill in the correct credentials."
            )

    def filterinput(self, data):
        """Checks which filters are set and passes
        the remaining conversations to the print function.
        It checks for number of participants, length of the
        conversation, amount of time in between and sentiment."""
        try:
            min_participants = self.min_participants.get()
            max_participants = self.max_participants.get()
            min_comments = self.min_comments.get()
            max_comments = self.max_comments.get()
            sentiment = self.sentiment.get()
            min_time = int(self.time_value.get()) * 60
            max_time = int(self.time_value2.get()) * 60
            count = 1
            conv_lst = []
            new_conv = []
            for user_dic in self.file:
                conversation = user_dic["Conversation"]
                participants = user_dic["Participants"]
                if (len(conversation) >= int(min_comments)
                        and len(conversation) <= int(max_comments)
                        and participants >= int(min_participants)
                        and participants <= int(max_participants)):
                    sent_analyze = self.sentimentfinder(conversation)
                    if sent_analyze == sentiment:
                        prev_timestamp = False
                        for dic in conversation:
                            datetimeFormat = '%Y-%m-%d %H:%M:%S'
                            conv = [dic]
                            conv.sort(key=lambda x: x["Timestamp"])
                            comment = self.r.comment(id=dic["Comment"])
                            timestamp = str(
                                datetime.datetime.fromtimestamp(
                                    comment.created_utc))
                            count += 1
                            if prev_timestamp:
                                diff = datetime.datetime.strptime(
                                    timestamp, datetimeFormat
                                ) - datetime.datetime.strptime(
                                    prev_timestamp, datetimeFormat)
                                if min_time < diff.seconds and diff.seconds < max_time:
                                    new_conv.append(dic)
                            prev_timestamp = timestamp
                        conv_lst.append({
                            "User": self.user,
                            "Conversation": new_conv,
                            "Participants": participants
                        })
                        new_conv = []
            self.print_by_order(conv_lst)
            self.frame2.config(state='normal', text="Filter")
        except TclError:
            pass

    def sentimentfinder(self, lst):
        """Finds the sentiment of a conversation and returns it."""
        oldsentiment = 0
        message = []
        sentimentset = set()
        for dic in lst:
            comment = self.r.comment(id=dic["Comment"])
            scores = self.sid.polarity_scores(comment.body)
            newsentiment = scores['compound']
            if oldsentiment > newsentiment:
                sentimentset.add('Negative')
            elif newsentiment > oldsentiment:
                sentimentset.add('Positive')
            else:
                sentimentset.add('Neutral')
            oldsentiment = newsentiment
            if len(sentimentset) == 1:
                return list(sentimentset)[0]

    def load(self):
        """This function allows the user to load in json files."""
        file_path = filedialog.askopenfilename(title="Select file",
                                               filetypes=[('JSON file',
                                                           '.json')])
        if file_path:
            self.T.delete(1.0, END)
            with open(file_path, encoding='utf-8') as f:
                data = json.loads(f.read())
                self.print_by_order(data)
            self.file = data

    def start(self):
        """This function allows threading so the program is going to freeze."""
        threading.Thread(target=self.get_comments, daemon=True).start()

    def search_username(self):
        """This function allows the user to look for a different username."""
        check = True
        while check:
            try:
                self.user = simpledialog.askstring(
                    "Input", "Please insert a username?")
                check = False
            except:
                print("The window glitched. Please try again.")
                pass
        self.start()

    def get_comments(self):
        """This function searches for all the comments made by the given username.
        It makes dictionaries as following:
        {User, Participants, Conversation: {Timestamp, Comment, User}}"""
        check = True
        while check:
            if not self.file:
                try:
                    self.user = simpledialog.askstring(
                        "Input", "Please insert a username?")
                    if self.user:
                        check = False
                except:
                    print("The window glitched. Please try again.")
                    pass
            else:
                break
        conv_lst = []
        count = 0
        try:
            self.r.redditor(self.user).fullname
            author_lst = []
            for comment in self.r.redditor(self.user).comments.new(limit=100):
                count += 1
                conversation = []
                ancestor = comment
                refresh_counter = 0
                while not ancestor.is_root:
                    ancestor = ancestor.parent()
                    if refresh_counter % 9 == 0:
                        ancestor.refresh()
                    refresh_counter += 1
                if 2 < len(ancestor.replies) < 10:
                    author_lst.append(ancestor.author)
                    if ancestor.author:
                        conversation.append({
                            "Timestamp": ancestor.created_utc,
                            "Comment": str(ancestor.id),
                            "User": ancestor.author.name
                        })
                    else:
                        conversation.append({
                            "Timestamp": ancestor.created_utc,
                            "Comment": str(ancestor.id),
                            "User": "******"
                        })
                    for reply in ancestor.replies:
                        try:
                            author_lst.append(reply.author)
                            conversation.append({
                                "Timestamp": reply.created_utc,
                                "Comment": str(reply.id),
                                "User": reply.author.name
                            })
                        except:
                            pass
                    conversation = [
                        dict(tupleized) for tupleized in set(
                            tuple(sorted(item.items()))
                            for item in conversation)
                    ]
                    author_lst = list(set(author_lst))
                    conv_dic = {
                        "User": self.user,
                        "Conversation": conversation,
                        "Participants": len(author_lst)
                    }
                    conv_lst.append(conv_dic)
                    conversation = []
                    author_lst = []
            self.file = conv_lst
            self.filterinput(conv_lst)
        except:
            print("This username is not available.")

    def print_by_order(self, data):
        """This function inserts the conversations in the correct order into the left frame.
        The correct order is determined by the timestamp."""
        self.T.delete(1.0, END)
        try:
            dic = data[0]
            self.user = dic["User"]
            conv_file = open(self.user + '.json', 'w')
            json.dump(data, conv_file)
            self.file = data
            for user_dic in data:
                count = 0
                conversation = user_dic["Conversation"]
                prev_timestamp = False
                for dic in conversation:
                    comment = self.r.comment(id=dic["Comment"])
                    timestamp = str(
                        datetime.datetime.fromtimestamp(comment.created_utc))
                    count += 1
                    datetimeFormat = '%Y-%m-%d %H:%M:%S'
                    if prev_timestamp:
                        diff = abs(
                            datetime.datetime.strptime(timestamp,
                                                       datetimeFormat) -
                            datetime.datetime.strptime(prev_timestamp,
                                                       datetimeFormat))
                        print("({0}) {1} {2}:\t{3}\n".format(
                            count, diff, comment.author,
                            comment.body.split('\n', 1)[0]))
                        self.T.insert(
                            END, "({0}) {1} {2}:\t{3}\n".format(
                                count, diff, comment.author,
                                comment.body.split('\n', 1)[0]))
                    else:
                        print("({0}{1:<9} {2}:\t{3}\n".format(
                            count, ")", comment.author,
                            comment.body.split('\n', 1)[0]))
                        self.T.insert(
                            END, "({0}{1:<9} {2}:\t{3}\n".format(
                                count, ")", comment.author,
                                comment.body.split('\n', 1)[0]))
                    prev_timestamp = timestamp
                self.T.insert(END, "\n")
            conv_file.close()
        except:
            self.T.insert(END, "No conversations were found.")
            print("No conversations were found")
Example #58
0
data_all = json_data['data']['children']
num_of_posts = 0
while len(data_all) <= 100:
    time.sleep(2)
    last = data_all[-1]['data']['name']
    url = 'https://www.reddit.com/r/politics/.json?after=' + str(last)
    req = requests.get(url, headers=hdr)
    data = json.loads(req.text)
    data_all += data['data']['children']
    if num_of_posts == len(data_all):
        break
    else:
        num_of_posts = len(data_all)

sia = SIA()
pos_list = []
neg_list = []
for post in data_all:
    res = sia.polarity_scores(post['data']['title'])
    # Uncomment the following line to get the polarity results
    # for each post as shown in article's image
    # print(res)
    if res['compound'] > 0.2:
        pos_list.append(post['data']['title'])
    elif res['compound'] < -0.2:
        neg_list.append(post['data']['title'])

with open("pos_news_titles.txt", "w", encoding='utf-8',
          errors='ignore') as f_pos:
    for post in pos_list:
Example #59
0
def search_user(user, count):

    try:
        user_timeline = twitter.get_user_timeline(screen_name=user,
                                                  count=count,
                                                  tweet_mode='extended')
        print('Tweets successfully retrieved!')
    except:
        raise ValueError('Tweets could not be retrieved!')

    ################################################################################
    # tweets = tw.Cursor(api.search,
    #                    q=search_words,
    #                    lang="en",
    #                    since=date_since).items(50)

    # Create JSON-file from tweets
    print('Creating JSON file...')
    filename = user + '.json'

    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(user_timeline, file, sort_keys=True, indent=4)

    print('{} created'.format(filename))

    print(type(filename))
    tweet_file = filename

    # Read JSON-file
    print("Reading the file...")
    tweets_2 = read_json(tweet_file)
    print(type(tweets_2))
    # tweets_2.items()
    print(type(tweets_2))

    # initialize tweeter tokenizer
    tTokenizer = TweetTokenizer()

    # initialize NLTK sentiment analyzer
    sid = SentimentIntensityAnalyzer()

    # initialize accumulators for the averages
    tss1 = 0.0
    tss2 = 0.0

    # Sentiment analysis
    print("Analyzing...")

    print(
        """--------------------------------------------------------------------------------------------------------------------------------
    TextBlob | NLTK  |                                    Tweet text                            | Sentences | Words  | Unique Words
    --------------------------------------------------------------------------------------------------------------------------------"""
    )

    for tweet in tweets_2:
        # get text from tweets
        if tweet['truncated']:
            line = tweet['extended_tweet']['full_text']
        elif 'text' in tweet:
            line = tweet['text']
        else:
            line = tweet['full_text']

        # strip leading & trailing whitespace
        line = line.strip()
        # Remove links
        line = remove_links(line)

        # Remove new line
        line = line.replace('\n', ' ')

        # Remove Non-ascii
        line = NormalizeText.remove_nonascii(line)

        # Tokenize text
        tweet_sent = sent_tokenize(line)  # Tokenize sentences
        tweet_word = tTokenizer.tokenize(line)
        tweet_unique = list(set(tweet_word))  # Eliminate duplicated words

        # Analyse sentiment
        ss1 = TextBlob(line)
        ss2 = sid.polarity_scores(line)

        # Update Cumulators
        tss1 += ss1.sentiment.polarity
        tss2 += ss2['compound']

        # Add updates to the JSON-file
        tweet.update({
            'sentiment': {
                'textblob': ss1.sentiment.polarity,
                'nltk': ss2['compound']
            }
        })

        # Show results
        print('{:8.2f} | {:5.2f} | {:72.72} | {:9d} | {:6d} | {:12d}'.format(
            ss1.sentiment.polarity, ss2['compound'], line, len(tweet_sent),
            len(tweet_word), len(tweet_unique)))

    print('\n-------------------------')
    print('Sentiment analysis summary:')
    print('TextBlob Average {:.2f}'.format(tss1 / len(tweets_2)))
    print('NLTK Average {:.2f}'.format(tss2 / len(tweets_2)))

    # Update JSON file
    print('\n')
    print('*****************')
    print('Updating JSON file...')
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(tweets_2, file, sort_keys=True, indent=4)
    print('File {} updated. Process complete!'.format(filename))
    return tweets_2
Example #60
0
File: nlp.py Project: dctisdal/482-Chatbot
 def __init__(self):
     self.analyzer = SentimentIntensityAnalyzer()