import json

from flask import request
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm.exc import NoResultFound

# db_session and the User, Item, Word, Hashtag, Url models are assumed to
# come from this app's own database/models module.


def input():  # note: shadows the built-in input(); kept to match the registered route
    req = json.loads(request.data.decode())
    try:
        # fetch the user, or create it on first sight
        try:
            u = db_session.query(User).filter_by(uid=str(req['uid'])).one()
        except NoResultFound:
            u = User(uid=req['uid'],
                     screen_name=req['screen_name'],
                     followers_count=req['followers_count'],
                     friends_count=req['friends_count'],
                     statuses_count=req['statuses_count'],
                     rank=req['rank'])
            db_session.add(u)
            db_session.commit()
        tw = Item(message=req['message'],
                  contestant=req['contestant'],
                  item_id=req['item_id'],
                  group_item_id=req['group_item_id'],  # for expanded url
                  item_type=req['item_type'],
                  item_url=req['item_url'],
                  location=req['location'],
                  date=req['date'],  # all times are stored as UTC
                  source=req['source'],
                  sentiment=req['sentiment'],
                  sentiment_textblob=req['sentiment_textblob'],
                  sentiment_bayes=req['sentiment_bayes'],
                  polarity=req['polarity'],  # tbc
                  subjectivity=req['subjectivity'],  # tbc
                  favorite_count=req['favorite_count'],
                  share_count=req['share_count'],
                  user_id=u.id,
                  verified_user=req['verified_user'],
                  team=req['team'],
                  data=req['data'])
        db_session.add(tw)
        db_session.commit()
        # words (skip any longer than the 100-char column limit)
        for w in req['words']:
            if len(w) > 100:
                continue
            w_obj = Word(word=w)
            db_session.add(w_obj)
            db_session.commit()
            tw.words.append(w_obj)
            u.words.append(w_obj)
        # hashtags (same length guard)
        for t in req['hashtags']:
            if len(t) > 100:
                continue
            t_obj = Hashtag(hashtag=t)
            db_session.add(t_obj)
            db_session.commit()
            tw.hashtags.append(t_obj)
            u.hashtags.append(t_obj)
        # url
        if req['expanded_url'] and len(req['expanded_url']) < 200:
            url = Url(item_id=req['item_id'], url=req['expanded_url'])
            db_session.add(url)
            db_session.commit()
    except OperationalError:
        db_session.rollback()
        # tw may not exist yet if the failure happened before the first
        # commit, so bail out here rather than hit a NameError below
        return 'error'
    print(tw.id)
    return str(tw.id)
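# --- A minimal sketch of how input() might be wired up. This is an
# assumption, not shown in the original source: a standard Flask app with a
# scoped SQLAlchemy session. The route path is inferred from the client's
# POST to URL + 'api/input_data/' below; the sqlite DSN is a placeholder.
from flask import Flask
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

app = Flask(__name__)
engine = create_engine('sqlite:///items.db')  # placeholder DSN
db_session = scoped_session(sessionmaker(bind=engine))

# registered via add_url_rule because input() shadows the built-in name
app.add_url_rule('/api/input_data/', 'input_data', input, methods=['POST'])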
import datetime
import json
import math
import time

import requests
from sqlalchemy.orm.exc import NoResultFound

# db_session, the Item and Url models, the pubnub client, queue (a
# collections.deque), the analyse_tweet / clean_words / accumulate_prob /
# td_to_minutes helpers, and the MIN_FOLLOWERS, MIN_FRIENDS,
# CANDIDATE_USERNAMES, hash_wordcount and URL constants are assumed to be
# defined elsewhere in this module.


# on_data is a StreamListener callback; its enclosing class is not shown in
# this excerpt (see the wiring sketch after the function).
def on_data(self, data):
    # sometimes tweepy sends NoneType objects
    if data is None:
        return

    # decode json
    dict_data = json.loads(data)
    if "user" not in dict_data:
        print("invalid format: no user found, skip." + "\n")
        return

    # check if retweet / quote / retweet-of-quote
    retweeted_quoted_status = False
    retweeted_status = "retweeted_status" in dict_data
    if retweeted_status:
        retweeted_quoted_status = "quoted_status" in dict_data["retweeted_status"]
    quoted_status = "quoted_status" in dict_data

    # pick the innermost status, and skip tweets whose user falls below
    # the follower/friend storage thresholds
    if retweeted_quoted_status:
        user = dict_data["retweeted_status"]["quoted_status"]["user"]
        if user["followers_count"] < MIN_FOLLOWERS or user["friends_count"] < MIN_FRIENDS:
            print("less than user metric threshold for storage, skip." + "\n")
            return
        print('retweeted_quoted_status')
        key_fields = dict_data["retweeted_status"]["quoted_status"]
    elif retweeted_status:
        user = dict_data["retweeted_status"]["user"]
        if user["followers_count"] < MIN_FOLLOWERS or user["friends_count"] < MIN_FRIENDS:
            print("less than user metric threshold for storage, skip." + "\n")
            return
        print('retweeted_status')
        key_fields = dict_data["retweeted_status"]
    elif quoted_status:
        user = dict_data["quoted_status"]["user"]
        if user["followers_count"] < MIN_FOLLOWERS or user["friends_count"] < MIN_FRIENDS:
            print("less than user metric threshold for storage, skip." + "\n")
            return
        print('quoted_status')
        key_fields = dict_data["quoted_status"]
    else:
        user = dict_data["user"]
        if user["followers_count"] < MIN_FOLLOWERS or user["friends_count"] < MIN_FRIENDS:
            print("less than user metric threshold for storage, skip." + "\n")
            return
        print('normal_status')
        key_fields = dict_data

    id_str = key_fields["id_str"]
    tweet = key_fields["text"]
    if retweeted_quoted_status or retweeted_status:
        favorite_count = dict_data["retweeted_status"]["favorite_count"]
        share_count = dict_data["retweeted_status"]["retweet_count"]
    else:
        favorite_count = key_fields["favorite_count"]
        share_count = key_fields["retweet_count"]
    user_id = key_fields["user"]["id"]
    screen_name = key_fields["user"]["screen_name"]
    location = key_fields["user"]["location"]
    followers_count = key_fields["user"]["followers_count"]
    friends_count = key_fields["user"]["friends_count"]
    statuses_count = key_fields["user"]["statuses_count"]
    date = time.strftime('%Y-%m-%dT%H:%M:%S',
                         time.strptime(key_fields["created_at"], '%a %b %d %H:%M:%S +0000 %Y'))
    item_url = 'https://twitter.com/' + str(screen_name) + '/status/' + str(id_str)

    # minimum retweet threshold by elapsed time: linear for the first
    # 20 minutes, logarithmic up to 2 hours, then a constant ceiling
    timedelta = datetime.datetime.utcnow() - datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%S')
    minutes_elapsed = td_to_minutes(timedelta)
    if minutes_elapsed < 20:
        minimum_rt = 3 * minutes_elapsed
    elif minutes_elapsed < 120:
        minimum_rt = 80 * math.log(minutes_elapsed) - 180
    else:
        minimum_rt = 233
    important = 'f'
    if share_count > minimum_rt:
        important = 't'

    # check whether the item is already stored
    try:
        record = db_session.query(Item).filter(Item.item_id == id_str).one()
        if quoted_status or retweeted_quoted_status:
            # quoted tweets contain no new data
            print('Quoted / Retweeted_quoted tweet caught. No update. ID: ' + str(id_str) + '\n')
            return
        record.favorite_count = favorite_count
        record.share_count = share_count
        db_session.commit()
        # publish to pubnub
        pubnub_object = {'sentiment': record.sentiment,
                         'group_item_id': record.group_item_id,
                         'item_id': id_str,
                         'source': 'twitter',
                         'favorite_count': favorite_count,
                         'share_count': share_count,
                         'contestant': record.contestant,
                         'item_url': item_url,
                         'date': date,
                         'important': important}
        pubnub.publish(channel='pubnub-sng', message=pubnub_object)
        print('Retweet caught. Updated favorite and share count record. ID: ' + str(id_str) + '\n')
        return
    except NoResultFound:
        pass

    # deque storing the previous 500 tweet prefixes for duplication checks
    if tweet[:20] in queue:
        print('Tweet already processed, repeat found in queue, skip.' + '\n')
        return
    queue.append(tweet[:20])
    if len(queue) > 500:
        queue.popleft()

    # preprocess the tweet in nlp_textblob
    try:
        expanded_url = str(dict_data["entities"]["urls"][0]["expanded_url"]).lower()
    except IndexError:
        expanded_url = None
    tweet_dict = analyse_tweet(tweet, expanded_url)
    if tweet_dict['contestant'] is None:
        print('CONTESTANT NOT FOUND' + '\n')
        return
    words = clean_words(tweet_dict['tb_words'], tweet_dict['hashtags'], tweet_dict['shoutouts'])

    # distant-supervised tweet classification: Bayes probability
    sentiment_bayes = accumulate_prob(hash_wordcount, words)

    # group_item_id: reuse the item that first shared this url, if any
    url_record = db_session.query(Url).filter(Url.url == expanded_url).first()
    try:
        group_item_id = url_record.item_id
    except AttributeError:
        group_item_id = id_str

    # check whether the tweet is from a verified candidate twitter account
    if str(screen_name) in CANDIDATE_USERNAMES[tweet_dict['contestant']]['UserName']:
        print('VERIFIED USER')
        verified = True
        important = 't'
    else:
        verified = False

    # publish to pubnub
    pubnub_object = {'sentiment': tweet_dict['sentiment'],
                     'group_item_id': group_item_id,
                     'item_id': id_str,
                     'source': 'twitter',
                     'favorite_count': favorite_count,
                     'share_count': share_count,
                     'contestant': tweet_dict['contestant'],
                     'item_url': item_url,
                     'date': date,
                     'important': important}
    pubnub.publish(channel='pubnub-sng', message=pubnub_object)

    # select the correct media type
    if "media" not in dict_data["entities"]:
        item_type = "text"
    elif dict_data["entities"]["media"][0]["type"] == "photo":
        item_type = "image"
    else:
        item_type = dict_data["entities"]["media"][0]["type"]

    # output key fields
    print(str(screen_name) + ' My score: ' + str(tweet_dict['sentiment']) +
          ' TB score: ' + tweet_dict['tb_sentiment'] +
          ' Bayes score: ' + sentiment_bayes)
    print('Tweet ID: ' + str(id_str) + ' ' + str(minutes_elapsed) + ' minutes ago')
    print('Friends Count: ' + str(friends_count) + ' Followers Count: ' + str(followers_count))
    print('Retweet Count: ' + str(share_count) + ' Favorite Count: ' + str(favorite_count))
    print(str(tweet))
    print(item_url)

    # send to server
    item_data = {'uid': user_id,
                 'screen_name': screen_name,
                 'followers_count': followers_count,
                 'friends_count': friends_count,
                 'statuses_count': statuses_count,
                 'rank': 1,
                 'message': tweet,
                 'contestant': tweet_dict['contestant'],
                 'item_id': id_str,
                 'group_item_id': group_item_id,
                 'item_type': item_type,
                 'item_url': item_url,
                 'location': location,
                 'date': date,
                 'source': "twitter",
                 'sentiment': tweet_dict['sentiment'],
                 'sentiment_textblob': tweet_dict['tb_sentiment'],
                 'sentiment_bayes': sentiment_bayes,
                 'polarity': tweet_dict['tb_polarity'],
                 'subjectivity': tweet_dict['tb_subjectivity'],
                 'favorite_count': favorite_count,
                 'share_count': share_count,
                 'verified_user': verified,
                 'team': tweet_dict['team'],
                 'data': json.dumps(data),
                 'words': words,
                 'hashtags': tweet_dict['hashtags'],
                 'expanded_url': expanded_url}
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    r = requests.post(URL + 'api/input_data/', data=json.dumps(item_data), headers=headers)
    print(r)
    print(r.text + '\n')
    return True
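# --- A minimal sketch of how on_data() might be attached to the stream.
# Assumptions, not shown in the original source: tweepy 3.x, with the four
# OAuth credentials and TRACK_TERMS defined elsewhere in this module. The
# td_to_minutes() helper is reconstructed from its call site above, where it
# converts a datetime.timedelta into elapsed minutes.
import tweepy


def td_to_minutes(td):
    # whole elapsed minutes from a datetime.timedelta
    return td.days * 24 * 60 + td.seconds // 60


class TweetListener(tweepy.StreamListener):
    pass


TweetListener.on_data = on_data  # attach the handler defined above

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
stream = tweepy.Stream(auth=auth, listener=TweetListener())
stream.filter(track=TRACK_TERMS)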