def on_data(self, raw_data):
    """Called when raw data is received from connection.

    Override this method if you wish to manually handle the stream data.
    The payload is HTML-unescaped, decoded as JSON and handed to the
    ``on_*`` handler selected by the first recognised top-level key.

    :param raw_data: one JSON message as text.
    :return: False to stop stream and close connection (i.e. when the
        selected handler returned False); otherwise None.
    """
    # HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9;
    # html.unescape() is its replacement. Fall back to the old method only
    # where the html module does not offer it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    # NOTE(review): unescaping *before* JSON decoding turns an entity such as
    # &quot; into a bare quote inside a string value - confirm the upstream
    # feed never sends those.
    data = json.loads(unescape(raw_data))

    if 'in_reply_to_status_id' in data:
        verdict = self.on_status(Status.parse(self.api, data))
    elif 'delete' in data:
        delete = data['delete']['status']
        verdict = self.on_delete(delete['id'], delete['user_id'])
    elif 'event' in data:
        verdict = self.on_event(Status.parse(self.api, data))
    elif 'direct_message' in data:
        verdict = self.on_direct_message(Status.parse(self.api, data))
    elif 'limit' in data:
        verdict = self.on_limit(data['limit']['track'])
    elif 'disconnect' in data:
        verdict = self.on_disconnect(data['disconnect'])
    else:
        logging.error("Unknown message type: " + str(raw_data))
        verdict = None

    # Only an explicit False stops the stream; any other result falls
    # through and yields None.
    if verdict is False:
        return False
def process(self, tweet):
    """Print a one-line summary of a tweet.

    The tweet text has every separator listed in ``UNICODE_LINES`` replaced
    by a space before it is printed together with the author's screen name,
    language and status/friend/follower counts.

    :param tweet: the tweet as a JSON string.
    """
    status = Status.parse(api, json.loads(tweet))
    # Accumulate the replacements on one string. Restarting from
    # status.text on every pass kept only the last separator's replacement
    # and left ``text`` unbound when UNICODE_LINES was empty.
    text = status.text
    for lf in UNICODE_LINES:
        text = text.replace(lf, ' ')
    # A single parenthesised argument prints the same on Python 2 and 3.
    print("@%s (%s, %s, %s, %s): %s" % (
        status.user.screen_name,
        status.user.lang,
        status.user.statuses_count,
        status.user.friends_count,
        status.user.followers_count,
        text,
    ))
def on_data(self, data):
    """Called when raw data is received from connection.

    Override this method if you wish to manually handle the stream data.
    The message type is recognised by looking for a marker substring in the
    raw text; the JSON is decoded only for the branch that matches.

    :param data: one raw JSON message as text.
    :return: False to stop stream and close connection (when the selected
        handler returned False); otherwise None.
    """
    if '{"delete"' in data:
        deleted = json.loads(data)['delete']
        # Choose the handler from the payload shape. Wrapping the handler
        # call in a bare ``except`` used to swallow errors raised inside
        # on_delete and retry the message as a direct-message deletion.
        if 'status' in deleted:
            target = deleted['status']
            if self.on_delete(target['id'], target['user_id']) is False:
                return False
        else:
            target = deleted['direct_message']
            if self.on_direct_message_delete(target['id'], target['user_id']) is False:
                return False
    elif '{"direct_message"' in data:
        message = DirectMessage.parse(self.api, json.loads(data)['direct_message'])
        if self.on_direct_message(message) is False:
            return False
    elif '{"target"' in data:
        event = json.loads(data)
        if self.on_event(event) is False:
            return False
    elif '{"limit"' in data:
        if self.on_limit(json.loads(data)['limit']['track']) is False:
            return False
    elif '"in_reply_to_user_id_str"' in data:
        status = Status.parse(self.api, json.loads(data))
        if self.on_status(status) is False:
            return False
def test_end_to_end(filename, connections, expected, tmpdir):
    """Feed one stored status through a LessListener and check the outcome.

    The same status is delivered three times (twice unchanged, once with a
    bumped id); at most one update may be posted, and its text must equal
    ``expected``. With ``expected`` of None no update may be posted at all.
    """
    api = MockAPI(connections=connections)
    with open(filename, 'r') as handle:
        status = Status.parse(api, json.load(fp=handle))

    listener = LessListener(
        api=api,
        post_replies=True,
        gather='tweets',
        state_dir=str(tmpdir),
    )

    # 100% festivity for all of December
    listener.december_greetings = ('It is cold outside.',)
    listener.festive_probability = 1.
    assert listener.get_festive_probability(dt.date(2016, 12, 5)) == 1.

    listener.on_status(status)

    # Never reply to the same toot twice
    listener.on_status(status)

    # Rate-limit replies for same word
    status.id = status.id + 1
    listener.on_status(status)

    if expected is not None:
        assert len(api._updates) == 1
        update = api._updates[0]
        assert update['status'] == expected
    else:
        assert api._updates == []

    # An account ends up followed exactly when it was following us before.
    for key, before in connections.items():
        after = api._connections[key]
        assert ('following' in after) == ('followed_by' in before), \
            (key, before, after)
def on_data(self, data):
    """Sort incoming tweets by detected language until the sample ends.

    Once ``self.duration`` seconds have passed since ``self.started`` a
    statistics report is written to ``<started>-sample.stats`` and the
    stream is stopped. Before that, every status message is classified with
    ``langid`` and its raw JSON is written to the above-threshold,
    below-threshold or excluded output, updating ``self.counter``.

    :param data: one raw JSON message as text.
    :return: False when the sampling period is over, True otherwise.
    """
    if time.time() >= self.started + self.duration:
        stamp = '%Y-%m-%d %H:%M:%S'
        report = [
            "================= STATISTICS =================",
            "Start time: " + time.strftime(stamp, time.localtime(self.started)),
            "End time: " + time.strftime(stamp, time.localtime(time.time())),
            "First Tweet ID: " + self.first_tweet_id,
            "Last Tweet ID: " + self.last_tweet_id,
            "Language: " + self.lang,
            "Language classification threshold: " + str(self.lang_threshold),
            "Above threshold: " + str(self.counter[self.lang + '-above']),
            "Below threshold: " + str(self.counter[self.lang + '-below']),
            "Exluded: " + str(self.counter['excluded']),
        ]
        # The context manager flushes and closes the report; the handle was
        # previously left open.
        with open('{0}-sample.stats'.format(int(self.started)), 'w+') as stats:
            stats.write("\n".join(report) + "\n")
        return False
    elif 'in_reply_to_status_id' in data:
        status = Status.parse(self.api, json.loads(data))
        langclass = langid.classify(status.text)
        # All counters still at zero means this is the first tweet seen.
        if self.counter == {self.lang + '-above': 0,
                            self.lang + '-below': 0,
                            'excluded': 0}:
            self.first_tweet_id = str(status.id)
        self.last_tweet_id = str(status.id)
        if langclass[0] == self.lang:
            if langclass[1] >= self.lang_threshold:
                self.above_output.write(data)
                self.counter[self.lang + '-above'] += 1
            else:
                self.below_output.write(data)
                self.counter[self.lang + '-below'] += 1
        else:
            self.excl_output.write(data)
            self.counter['excluded'] += 1
    return True
def on_data(self, data):
    """Forward tweets that mention ``testeMagazine`` to ``on_mention``.

    Messages whose raw text does not contain ``entities`` are ignored
    without being decoded. Always returns None.
    """
    if "entities" not in data:
        return

    data = json.loads(data)
    mentioned = [
        mention["screen_name"]
        for mention in data["entities"]["user_mentions"]
    ]
    if "testeMagazine" not in mentioned:
        return

    self.on_mention(Tweet.parse(self.api, data))
def on_data(self, raw_data):
    """Called when raw data is received from connection.

    This is where all the data comes first. Normally we could use (inherit)
    the on_data() in tweepy.StreamListener, but it unnecessarily and naively
    reports unknown event types as errors (to simple log); also, we might
    want to tweak it further later on. But for now, this is basically taken
    from tweepy's on_data().

    ``self.processing_data`` is True for the duration of the call and is
    always reset to False on the way out.

    Return False to stop stream and close connection.
    """
    self.processing_data = True
    try:
        data = json.loads(raw_data)

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status) is False:
                return False
        elif 'delete' in data:
            delete = data['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False
        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False
        elif 'direct_message' in data:
            status = Status.parse(self.api, data)
            if self.on_direct_message(status) is False:
                return False
        elif 'limit' in data:
            if self.on_limit(data['limit']['track']) is False:
                return False
        elif 'disconnect' in data:
            if self.on_disconnect(data['disconnect']) is False:
                return False
        else:
            log.debug('TwitterBotStreamListener::on_data(): got event/stream data of'
                      ' unknown type. Raw data follows:\n%s', data)
    finally:
        # Reset the flag on every exit path. The early ``return False``
        # branches and any exception used to leave it stuck at True.
        self.processing_data = False
def test_sanitize(filename, expected):
    """Sanitized text of a stored status has no '&', no 'http', and matches ``expected``."""
    api = NonCallableMock()
    fixture_path = os.path.join('tests', filename)
    with open(fixture_path, 'r') as fixture:
        status = Status.parse(api, json.load(fixture))

    sanitized = get_sanitized_text(status)

    for forbidden in ('&', 'http'):
        assert forbidden not in sanitized
    assert sanitized == expected
def _read_from_table(self):
    """Replay every row of ``self.table_name`` through the listener.

    Each row is parsed with ``Status.parse`` and passed to
    ``self.listener.on_status``. Iteration stops early as soon as
    ``self.running`` has been cleared.
    """
    self.running = True
    conn = StatusSource.engine.connect()
    try:
        meta = MetaData()
        table = Table(self.table_name, meta, autoload=True,
                      autoload_with=StatusSource.engine)
        results = conn.execute(select([table]))
        for result in results:
            status = Status.parse(None, result)
            self.listener.on_status(status)
            if not self.running:
                break
    finally:
        # Close the connection even when the listener raises or the loop
        # is stopped early; it was previously never closed.
        conn.close()
def on_data(self, raw_data):
    """Decode a raw stream message and hand it to the matching handler.

    Recognised top-level keys, in priority order: ``in_reply_to_status_id``,
    ``delete``, ``event`` and ``direct_message``. Anything else is ignored.

    Returns False when the handler returned False, otherwise None.
    """
    payload = json.loads(raw_data)
    outcome = None

    if 'in_reply_to_status_id' in payload:
        outcome = self.on_status(Status.parse(self.api, payload))
    elif 'delete' in payload:
        removed = payload['delete']['status']
        outcome = self.on_delete(removed['id'], removed['user_id'])
    elif 'event' in payload:
        outcome = self.on_event(Status.parse(self.api, payload))
    elif 'direct_message' in payload:
        outcome = self.on_direct_message(Status.parse(self.api, payload))

    return False if outcome is False else None
def test_save_tweet(tmpdir, id_, expected_filename):
    """Saving a status creates ``expected_filename`` inside the gather directory."""
    api = MockAPI(connections={})
    gather_dir = tmpdir.join('foo')
    listener = LessListener(api=api, gather=str(gather_dir), state_dir=str(tmpdir))

    payload = {'id': int(id_), 'id_str': id_}
    listener.save_tweet(Status.parse(api=api, json=payload))

    saved = tmpdir.join('foo', expected_filename)
    assert saved.check()
def test_patched_status(self):
    """After ``patch()``, a parsed Status exposes its JSON text as ``_raw``.

    The parsed fields must still be available as attributes.
    """
    from tweepy.models import Status
    from crawler.tweepy_patch import patch

    patch()
    parsed = Status.parse('test_api', {'a': 1, 'b': 2})

    # pylint: disable=E1101,W0212
    self.assertEqual(parsed._raw, '{"a": 1, "b": 2}')
    self.assertEqual(parsed.a, 1)
    self.assertEqual(parsed.b, 2)
def on_data(self, raw_data):
    """Route a raw stream message to ``on_status`` or ``on_event``.

    ``friends``, ``delete`` and ``user_suspend`` messages are deliberately
    ignored; any other unrecognised message is logged as an error. With
    ``self.verbose`` set, the decoded message and a separator are printed.

    Returns False when the handler returned False, otherwise None.
    """
    message = json.loads(raw_data)

    if self.verbose:
        print(message)
        print('-' * 60)

    verdict = None
    if 'in_reply_to_status_id' in message:
        verdict = self.on_status(Status.parse(self.api, message))
    elif 'event' in message:
        verdict = self.on_event(Status.parse(self.api, message))
    elif 'friends' in message or 'delete' in message or 'user_suspend' in message:
        pass  # ignore
    else:
        logging.error("Unknown message type: " + str(raw_data))

    if verdict is False:
        return False
def on_data(self, data):
    """Dispatch a raw stream message to the status, delete or limit handler.

    The message type is detected by substring search on the raw text, and
    the JSON is decoded only inside the matching branch. ``on_status``
    receives both the parsed status and the raw text.

    Returns False when the handler returned False, otherwise None.
    """
    if 'in_reply_to_status_id' in data:
        parsed = Status.parse(self.api, json.loads(data))
        verdict = self.on_status(parsed, data)
    elif 'delete' in data:
        removed = json.loads(data)['delete']['status']
        verdict = self.on_delete(removed['id'], removed['user_id'])
    elif 'limit' in data:
        verdict = self.on_limit(json.loads(data)['limit']['track'])
    else:
        return None

    return False if verdict is False else None
def save_status(self, data):
    """Store a geotagged tweet as a Post, creating its Author when needed.

    Nothing is stored when the tweet carries no geo information, when its
    author already has a linked user profile, or when a Post with the same
    external id already exists.

    :param data: the tweet as a JSON string; kept verbatim as the Post's
        ``external_data``.
    """
    status = Status.parse(self.api, json.loads(data))

    if not status.geo:
        # _datafile.write(data+'\n')
        return

    linked_profiles = Author.objects.filter(
        owner__userprofile__twitter_id=status.user.id_str)
    if linked_profiles.exists():
        # this tweet's author is on stargazer
        return

    try:
        author = Author.objects.filter(
            source=Author.T_TWITTER,
            external_id=status.user.id_str,
        ).get()
    except Author.DoesNotExist:
        author = Author(
            name=status.user.screen_name,
            avatar_uri=status.user.profile_image_url,
            source=Author.T_TWITTER,
            external_id=status.user.id_str,
        )
        author.save()

    try:
        Post.objects.filter(
            source=Post.T_TWITTER,
            external_id=status.id_str,
        ).get()
    except Post.DoesNotExist:
        latitude = float(status.geo["coordinates"][0])
        longitude = float(status.geo["coordinates"][1])

        # Reverse geocoding is best effort: fall back to an empty address.
        try:
            address = self._latlng2addr.get(latitude, longitude)
        except (LatLng2Addr.ConnectionFailed, LatLng2Addr.GeocodingFailed):
            address = ""

        # twitter api response in UTC; shifted by eight hours before
        # storing (presumably to a UTC+8 local time - confirm).
        created_at = status.created_at + timedelta(hours=8)

        Post(
            content=status.text,
            author=author,
            latitude=latitude,
            longitude=longitude,
            address=address,
            source=Post.T_TWITTER,
            external_id=status.id_str,
            external_data=data,
            created=created_at,
        ).save()
    return
def on_data(self, raw_data):
    """Called when raw data is received from connection.

    The decoded message is matched against a prioritised list of top-level
    keys and the first match selects the ``on_*`` handler. Unmatched
    messages are logged as errors.

    Override this method if you wish to manually handle the stream data.
    Return False to stop stream and close connection.
    """
    data = json.loads(raw_data)

    # Each entry is evaluated lazily so that only the matching handler
    # (and its payload lookups) ever runs.
    routes = (
        ("in_reply_to_status_id",
         lambda: self.on_status(Status.parse(self.api, data))),
        ("delete",
         lambda: self.on_delete(data["delete"]["status"]["id"],
                                data["delete"]["status"]["user_id"])),
        ("event",
         lambda: self.on_event(Status.parse(self.api, data))),
        ("direct_message",
         lambda: self.on_direct_message(Status.parse(self.api, data))),
        ("friends", lambda: self.on_friends(data["friends"])),
        ("limit", lambda: self.on_limit(data["limit"]["track"])),
        ("disconnect", lambda: self.on_disconnect(data["disconnect"])),
        ("warning", lambda: self.on_warning(data["warning"])),
    )

    for key, handle in routes:
        if key in data:
            if handle() is False:
                return False
            return None

    logging.error("Unknown message type: " + str(raw_data))
def get_place(status: Status) -> dict:
    """Extract place information from a status.

    :param status: object that may carry a ``place`` attribute whose own
        attributes include ``bounding_box``, ``country``, ``country_code``
        and ``full_name``.
    :return: dict with keys ``coordinates``, ``country``, ``country_code``
        and ``full_name``. Missing values default to ``[]`` for the
        coordinates and ``"N/A"`` for the text fields.
    """
    place: dict = {
        "coordinates": [],
        "country": "N/A",
        "country_code": "N/A",
        "full_name": "N/A",
    }

    status_place = getattr(status, "place", None)
    if status_place is None:
        return place

    try:
        fields: dict = vars(status_place)
    except TypeError:
        # The place object exposes no attribute dict: nothing to extract.
        return place

    # A place without a usable bounding box still carries country and name
    # data. The blanket ``except Exception: pass`` used to discard all of
    # it in that case.
    bounding_box = fields.get("bounding_box")
    try:
        coordinates: list = vars(bounding_box).get("coordinates", [])
    except TypeError:
        coordinates = []

    return {
        "coordinates": coordinates,
        "country": fields.get("country", "N/A"),
        "country_code": fields.get("country_code", "N/A"),
        "full_name": fields.get("full_name", "N/A"),
    }
def on_data(self, raw_data):
    """Called when raw data is received from connection.

    Decodes the message and calls the ``on_*`` handler selected by the
    first recognised top-level key. Unrecognised messages are reported
    through ``self.bot._log``.

    Override this method if you wish to manually handle the stream data.
    Return False to stop stream and close connection.
    """
    from tweepy.models import Status

    message = json.loads(raw_data)

    if 'in_reply_to_status_id' in message:
        keep_going = self.on_status(Status.parse(self.api, message))
    elif 'delete' in message:
        removed = message['delete']['status']
        keep_going = self.on_delete(removed['id'], removed['user_id'])
    elif 'event' in message:
        keep_going = self.on_event(Status.parse(self.api, message))
    elif 'direct_message' in message:
        keep_going = self.on_direct_message(Status.parse(self.api, message))
    elif 'friends' in message:
        keep_going = self.on_friends(message['friends'])
    elif 'limit' in message:
        keep_going = self.on_limit(message['limit']['track'])
    elif 'disconnect' in message:
        keep_going = self.on_disconnect(message['disconnect'])
    else:
        self.bot._log("Unknown message type: " + str(raw_data))
        keep_going = None

    if keep_going is False:
        return False
def on_data(self, data):
    """Called when raw data is received from connection.

    The message type is detected by substring search on the raw text and
    the JSON is decoded only for the branch that matches.

    Override this method if you wish to manually handle the stream data.
    Return False to stop stream and close connection.
    """
    if 'in_reply_to_status_id' in data:
        tweet = Status.parse(self.api, json.loads(data))
        return False if self.on_status(tweet) is False else None

    if 'delete' in data:
        gone = json.loads(data)['delete']['status']
        return False if self.on_delete(gone['id'], gone['user_id']) is False else None

    if 'limit' in data:
        track = json.loads(data)['limit']['track']
        return False if self.on_limit(track) is False else None

    return None
def __init__(self, tweetDict):
    """Build the object from a stored tweet dictionary.

    ``tweetDict["tweet"]`` is parsed into a Status and every entry of
    ``tweetDict["tokens"]`` becomes a Token. Tokens for which
    ``filter_token()`` is falsy are also collected in ``filt_tokens``.
    The ``keywords`` and ``groups`` attributes are set only when the
    corresponding key is present.
    """
    self.tweet = Status.parse(API(), tweetDict["tweet"])

    if "keywords" in tweetDict:
        self.keywords = tweetDict["keywords"]
    if "groups" in tweetDict:
        self.groups = tweetDict["groups"]

    self.tokens = []
    self.filt_tokens = []
    for raw_token in tweetDict["tokens"]:
        token = Token(raw_token)
        self.tokens.append(token)
        if t_is_kept(token) if False else not token.filter_token():
            self.filt_tokens.append(token)
def on_data(self, data):
    """Called when raw data is received from connection.

    Status, delete and limit messages are recognised by substring search on
    the raw text and passed to ``on_status``, ``on_delete`` or ``on_limit``.
    Other messages are ignored.

    Override this method if you wish to manually handle the stream data.
    Return False to stop stream and close connection.
    """
    stop = False

    if "in_reply_to_status_id" in data:
        parsed = Status.parse(self.api, json.loads(data))
        stop = self.on_status(parsed) is False
    elif "delete" in data:
        removed = json.loads(data)["delete"]["status"]
        stop = self.on_delete(removed["id"], removed["user_id"]) is False
    elif "limit" in data:
        stop = self.on_limit(json.loads(data)["limit"]["track"]) is False

    if stop:
        return False
def on_data(self, data):
    """Classify sampled tweets by language and stop after ``self.duration``.

    When the sampling window (``self.started`` + ``self.duration``) has
    elapsed, a statistics report is written to ``<started>-sample.stats``
    and False is returned to stop the stream. Until then each status
    message is classified with ``langid`` and its raw JSON is appended to
    the above-threshold, below-threshold or excluded output while
    ``self.counter`` is updated.

    :param data: one raw JSON message as text.
    :return: False once the sampling window is over, True otherwise.
    """
    if time.time() >= self.started + self.duration:
        # ``with`` guarantees the report is flushed and the handle closed;
        # the file object was previously never closed.
        with open('{0}-sample.stats'.format(int(self.started)), 'w+') as stats:
            stats.write("================= STATISTICS =================" + "\n")
            stats.write("Start time: " + time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(self.started)) + "\n")
            stats.write("End time: " + time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + "\n")
            stats.write("First Tweet ID: " + self.first_tweet_id + "\n")
            stats.write("Last Tweet ID: " + self.last_tweet_id + "\n")
            stats.write("Language: " + self.lang + "\n")
            stats.write("Language classification threshold: " +
                        str(self.lang_threshold) + "\n")
            stats.write("Above threshold: " +
                        str(self.counter[self.lang + '-above']) + "\n")
            stats.write("Below threshold: " +
                        str(self.counter[self.lang + '-below']) + "\n")
            stats.write("Exluded: " + str(self.counter['excluded']) + "\n")
        return False
    elif 'in_reply_to_status_id' in data:
        status = Status.parse(self.api, json.loads(data))
        langclass = langid.classify(status.text)
        # Counters all at zero: this is the first tweet of the sample.
        if self.counter == {
                self.lang + '-above': 0,
                self.lang + '-below': 0,
                'excluded': 0
        }:
            self.first_tweet_id = str(status.id)
        self.last_tweet_id = str(status.id)
        if langclass[0] == self.lang:
            if langclass[1] >= self.lang_threshold:
                self.above_output.write(data)
                self.counter[self.lang + '-above'] += 1
            else:
                self.below_output.write(data)
                self.counter[self.lang + '-below'] += 1
        else:
            self.excl_output.write(data)
            self.counter['excluded'] += 1
    return True
def process(self, tweet):
    """Append a trimmed JSON record of a tweet to the current gzip file.

    A new output file named ``<base>-<time>.txt.gz`` under ``self.path`` is
    opened (in append mode) whenever ``time.strftime(self.fmt)`` differs
    from the stored ``self.time``.

    :param tweet: the tweet as a JSON string.
    """
    status = Status.parse(api, json.loads(tweet))
    author = status.user

    record = {
        "screen_name": author.screen_name,
        "id": status.id,
        "lang": author.lang,
        "statuses_count": author.statuses_count,
        "friend_count": author.friends_count,
        "followers_count": author.followers_count,
        "profile_image_url": author.profile_image_url,
        "text": status.text.encode('utf8'),
        "entities": status.entities,
        "created_at": status.created_at.strftime("%Y-%m-%d %H:%M:%S"),
        "geo": status.geo,
        "location": author.location,
        "timezone": author.time_zone,
    }

    current = time.strftime(self.fmt)
    if current != self.time:
        # The time bucket changed: switch to a fresh output file.
        self.time = str(current)
        self.fid.close()
        target = os.path.join(self.path, self.base + '-' + self.time + '.txt.gz')
        self.fid = gzip.open(target, 'ab')

    self.fid.write(json.dumps(record) + '\n')
def test_sending_images(self):
    """send_image downloads from S3, tweets the file and cleans it up.

    Both the S3 download and the Twitter upload are replaced by mocks, so
    only the call arguments are verified.
    """
    # ensure there is an image as the mock object will not do anything
    shutil.copy('./image.jpg', '/tmp/image.jpg')

    s3_client = boto3.client('s3')
    s3_client.download_file = MagicMock(return_value=None)

    twitter_api = tweepy.API(tweepy.OAuthHandler('foo', 'bar'))
    twitter_api.update_with_media = MagicMock(return_value=Status())

    tweet_images = TweetS3Images(twitter_api, s3_client)
    tweet_images.send_image('test_bucket', 'image.jpg', cleanup=True)

    s3_client.download_file.assert_called_with(
        'test_bucket', 'image.jpg', '/tmp/image.jpg')
    twitter_api.update_with_media.assert_called_with(
        filename='image.jpg',
        status='New image image.jpg brought to you by lambda-tweet',
        file=tweet_images.get_file())

    # NOTE(review): the file copied above is /tmp/image.jpg, yet this checks
    # /tmp/image-test.jpg - confirm which path cleanup is meant to remove.
    self.assertFalse(os.path.exists('/tmp/image-test.jpg'),
                     'The image was not cleaned up correctly.')
def _get_status(self, data):
    """Parse a raw message and resolve it to the status that should be used.

    A retweet resolves to the retweeted status; a quote resolves to the
    quoted status when ``self.quoted`` is set.

    :raises TweepError: when the author is in ``self.block_users``, when an
        original (non-retweet) status arrives while ``self.original`` is
        falsy, or when a quote arrives while ``self.quoted`` is falsy.
    """
    status = Status.parse(self.api, self.json.loads(data))

    if status.user.screen_name in self.block_users:
        raise TweepError(">> User ignored: @%s" % status.user.screen_name)

    try:
        retweeted = status.retweeted_status
    except AttributeError:
        # Not a retweet: keep the status itself, if originals are allowed.
        if not self.original:
            text = self._proccess_status(status.text)
            if len(text) > 75:
                trunc_text = text[:72] + '...'
            else:
                trunc_text = text
            raise TweepError(">> Original tweet ignored: %s" % trunc_text)
    else:
        status = retweeted

    if status.is_quote_status:
        if not self.quoted:
            text = self._proccess_status(status.text)
            raise TweepError(">> Quoted tweet ignored: %s" % text)
        status = status.quoted_status

    return status
async def on_data(self, raw_data):
    """|coroutine|

    Entry point for every raw message received from the stream. The
    message is decoded and forwarded to the handler coroutine matching its
    type; the handler's result is returned. Unknown message types are
    logged as a warning.

    Parameters
    ----------
    raw_data : JSON
        The raw data from the stream

    References
    ----------
    https://developer.twitter.com/en/docs/twitter-api/v1/tweets/filter-realtime/guides/streaming-message-types
    """
    payload = json.loads(raw_data)

    if "in_reply_to_status_id" in payload:
        return await self.on_status(Status.parse(None, payload))

    if "delete" in payload:
        removed = payload["delete"]["status"]
        return await self.on_delete(removed["id"], removed["user_id"])

    # Remaining message types, in priority order: (top-level key, handler).
    routes = (
        ("disconnect", "on_disconnect_message"),
        ("limit", "on_limit"),
        ("scrub_geo", "on_scrub_geo"),
        ("status_withheld", "on_status_withheld"),
        ("user_withheld", "on_user_withheld"),
        ("warning", "on_warning"),
    )
    for key, handler_name in routes:
        if key not in payload:
            continue
        argument = payload[key]
        if key == "limit":
            # Limit notices pass only their "track" value on.
            argument = argument["track"]
        return await getattr(self, handler_name)(argument)

    log.warning("Received unknown message type: %s", raw_data)
def on_data(self, data):
    """Called when raw data is received from connection.

    Message types are recognised by substring search on the raw text.
    Status, direct-message and follow handlers have their result returned
    unchanged; delete and limit handlers only propagate an explicit False.

    Override this method if you wish to manually handle the stream data.
    Return False to stop stream and close connection.
    """
    if 'in_reply_to_status_id' in data:
        return self.on_status(Status.parse(self.api, json.loads(data)))

    if 'delete' in data:
        removed = json.loads(data)['delete']['status']
        if self.on_delete(removed['id'], removed['user_id']) is False:
            return False
        return None

    if 'limit' in data:
        if self.on_limit(json.loads(data)['limit']['track']) is False:
            return False
        return None

    if 'sender_id' in data and 'recipient_id' in data:
        return self.on_dm(DirectMessage.parse(self.api, json.loads(data)))

    if 'event' in data and 'follow' in data:
        content = json.loads(data)
        # The substring test above is only a hint; confirm on parsed data.
        if 'event' in content and content['event'] == 'follow':
            return self.on_follow(content)

    return None
hashtag = 0 url = 0 question = 0 exclamation = 0 pos_term = 0 neg_term = 0 pos_emoticon = 0 neg_emoticon = 0 reply = 0 moment_morning = 0 moment_afternoon = 0 moment_evening = 0 moment_night = 0 retweeted = 0 status = Status.parse(api, json.loads(tweet[0])) if status.id in error_list_tweet_ids: tweets_discarded_error += 1 elif status.text.startswith("RT @"): tweets_discarded_retweet += 1 else: tweets_considered += 1 if regex_username.search(status.text) != None: tweets_username += 1 username = 1 if regex_hashtag.search(status.text) != None: tweets_hashtag += 1 hashtag = 1 if regex_url.search(status.text) != None: tweets_url += 1
def make_mock_statuses(json_text):
    """Parse a JSON array of tweets into a list of Status objects (no API attached)."""
    return Status.parse_list(None, json.loads(json_text))
def on_data(self, data):
    """Decode a raw stream message and dispatch it to the matching handler.

    Before dispatching, the untruncated tweet text is looked up in
    ``extended_tweet.full_text`` and then in ``retweeted_status.full_text``
    (the latter wins when both exist). It is printed and passed to
    ``on_status`` as second argument; it is an empty string when neither
    field is present.

    :param data: one raw JSON message as text.
    :return: False when the selected handler returned False, else None.
    """
    # Keep the raw text: the unknown-message log below needs it, and the
    # original referenced an undefined ``raw_data`` there (NameError).
    raw_data = data
    # Decode once instead of twice.
    data = json.loads(raw_data)

    full_text = ""
    if 'extended_tweet' in data:
        if 'full_text' in data["extended_tweet"]:
            # The former encode()/decode() round trip was a no-op.
            full_text = str(data["extended_tweet"]["full_text"])
            print(
                'FUL TEXT *******************************************************************************'
            )
            print(full_text)
    if "retweeted_status" in data:
        if 'full_text' in data["retweeted_status"]:
            full_text = str(data["retweeted_status"]["full_text"])
            print(
                'FUL TEXT *******************************************************************************'
            )
            print(full_text)

    if 'in_reply_to_status_id' in data:
        status = Status.parse(self.api, data)
        if self.on_status(status, full_text) is False:
            return False
    elif 'delete' in data:
        delete = data['delete']['status']
        if self.on_delete(delete['id'], delete['user_id']) is False:
            return False
    elif 'event' in data:
        status = Status.parse(self.api, data)
        if self.on_event(status) is False:
            return False
    elif 'direct_message' in data:
        status = Status.parse(self.api, data)
        if self.on_direct_message(status) is False:
            return False
    elif 'friends' in data:
        if self.on_friends(data['friends']) is False:
            return False
    elif 'limit' in data:
        if self.on_limit(data['limit']['track']) is False:
            return False
    elif 'disconnect' in data:
        if self.on_disconnect(data['disconnect']) is False:
            return False
    elif 'warning' in data:
        if self.on_warning(data['warning']) is False:
            return False
    else:
        logging.error("Unknown message type: " + str(raw_data))
def on_data(self, data):
    """Dispatch one site-stream message to the matching per-user handler.

    Generic class for site streams that just print each action that comes
    in - override these methods to actually process them.

    Only messages whose raw text contains both ``for_user`` and ``message``
    are dispatched; ``for_user`` is passed to every handler as ``user_id``.
    Messages that match none of the known shapes are printed.

    Returns False as soon as a handler returns False, otherwise None.
    """
    # Substring tests on the raw text come first; JSON is decoded only when
    # the envelope looks like a site-stream message.
    if 'for_user' in data:
        parsed_data = json.loads(data)
        user_id = parsed_data['for_user']
        if 'message' in data:
            message = parsed_data['message']
            if u'friends' in message:
                if self.on_friends(user_id, message['friends']) is False:
                    return False
            elif u'event' in message:
                # Only these four event names are handled; any other event
                # is silently dropped.
                if message[u'event'] == u'follow':
                    if self.on_follow(
                        user_id=user_id,
                        source=message[u'source'],
                        target=message[u'target'],
                        time=message[u'created_at']
                    ) is False:
                        return False
                elif message[u'event'] == u'unfollow':
                    if self.on_unfollow(
                        user_id,
                        source=message[u'source'],
                        target=message[u'target'],
                        time=message[u'created_at']
                    ) is False:
                        return False
                elif message[u'event'] == u'favorite':
                    if self.on_favorite(
                        user_id,
                        source=message[u'source'],
                        favorited=message[u'target_object'],
                        time=message[u'created_at']
                    ) is False:
                        return False
                elif message[u'event'] == u'unfavorite':
                    if self.on_unfavorite(
                        user_id,
                        source=message[u'source'],
                        favorited=message[u'target_object']
                    ) is False:
                        return False
            # Need this second check - could be a retweet of
            # a tweet mentioning the user of interest
            elif (u'retweeted_status' in message and
                  int(message[u'retweeted_status'][u'user'][u'id']) == int(user_id)
                  ):
                if self.on_retweet(user_id, message) is False:
                    return False
            elif u'text' in message:
                status = Status.parse(self.api, message)
                # tweet from the user of interest
                # NOTE(review): unlike the retweet check above, this compares
                # without int() conversion - confirm both ids share a type.
                if status.author.id == user_id:
                    if self.on_user_status(user_id, status) is False:
                        return False
                else:
                    # tweet mentioning the user of interest
                    if self.on_user_mention(user_id, status) is False:
                        return False
            elif u'direct_message' in message:
                if self.on_direct_message(
                    user_id,
                    message[u'direct_message']
                ) is False:
                    return False
            else:
                # Unrecognised message shape: dump the whole envelope.
                print parsed_data
def __init__(self, status: Status):
    """Copy the relevant fields of a tweepy ``Status`` onto this object.

    Besides plain copies, the constructor derives: the preprocessed
    creation date, the text (via ``get_text``), its detected language, the
    geolocation and place (via ``get_geolocation`` / ``get_place``), the
    tweet URL and the UUIDs of the tweet id and user id
    (via ``get_128_uuid``).

    :param status: the status to copy from. ``possibly_sensitive`` is
        optional and defaults to False; every other attribute read here
        must exist.
    """
    # Plain attribute access replaces the explicit
    # ``status.__getattribute__("...")`` calls: same lookups, idiomatic form.
    self.created_at: str = preprocess_date(status.created_at)
    self.id: int = status.id
    self.hashtags: list = status.entities.get("hashtags", [])
    self.user_mentions: list = status.entities.get("user_mentions", [])
    # self.urls: list = status.entities.get("urls", [])
    # self.media: list = status.entities.get("media", [])
    self.text: str = self.get_text(status=status)
    self.retweet_count: int = status.retweet_count
    self.retweeted: bool = status.retweeted

    user = status.user
    self.user_id: int = user.id
    self.profile_image_url: str = user.profile_image_url
    self.screen_name: str = user.screen_name

    self.possibly_sensitive: bool = getattr(status, "possibly_sensitive", False)
    self.favorite_count: int = status.favorite_count
    self.favorited: bool = status.favorited
    self.lang: str = detect(self.text)
    self.sentiment_analysis: dict = {}
    self.source: str = status.source
    self.geolocation: str = self.get_geolocation(status=status)
    self.place: dict = self.get_place(status=status)
    self.url: str = f"https://twitter.com/user/status/{status.id}"
    self.uuid: str = self.get_128_uuid(data_str=str(self.id))
    self.user_uuid: str = self.get_128_uuid(data_str=str(self.user_id))
def load_status():
    """Return the sample tweet cassette parsed into a Status (no API attached)."""
    cassette = './tests/cassettes/sample-tweet.json'
    with open(cassette) as infile:
        return Status.parse(api=None, json=load(infile))
def update_tweets(self):
    """Drain the incoming queue, store the tweets and build a broadcast.

    Every queued status message is parsed, inserted into the ``tweets``
    table and linked to its hashtags. For hashtags listed in
    ``campboard['sessions']`` the tweet is also attached to that channel
    and a "+1" / "-1" in the text is counted as a session vote.

    Returns a dict with two keys: ``general`` (``recent_tweets`` holds all
    tweets of this batch) and ``channels`` (per-tag ``recent_tweets``).
    """
    print "Updating tweets"
    statuses = []
    try:
        # Pop until the queue is empty; the IndexError raised by pop() on
        # an empty container is what ends the loop.
        while True:
            item = self.incoming.pop() # It's gonna throw up someday!
            if "in_reply_to_status_id" in item:
                statuses.append(Status.parse(self.stream.api, json.loads(item)))
            # Ignore anything other than status updates for now
            #else:
            #    statuses.append(json.loads(item))
    except IndexError:
        pass
    broadcast = {}
    broadcast['general'] = {}
    broadcast['channels'] = {}
    for s in statuses:
        tags = re.findall("#([\w]+)(?iu)", s.text) # Case-insensitive, Unicode matching
        print "Tags: "
        print tags
        self.db.execute("INSERT INTO tweets (id, user_id, screen_name, profile_image_url, created_at, text) VALUES (%s,%s,%s,%s,%s,%s)", s.id, s.user.id, s.user.screen_name, s.user.profile_image_url, s.created_at, s.text)
        # Establish HABTM relationships, tweets with tags
        for t in tags:
            t = t.lower() # Force all to lowercase
            print "Inserting tag: %s" % t
            # Upsert the tag, then link it to the tweet; LAST_INSERT_ID()
            # carries the tag id from the first statement to the second.
            self.db.execute('''INSERT INTO hashtags (tag) VALUES (%s) ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), tag=%s; INSERT INTO hashtags_tweets (hash_id, tweet_id) VALUES (LAST_INSERT_ID(), %s)''', t, t, s.id)
            # Count the votes while we're at it
            if t in campboard['sessions']:
                # Attach the tweet to the broadcast channel
                if not broadcast['channels'].has_key(t):
                    broadcast['channels'][t] = {}
                    broadcast['channels'][t]['recent_tweets'] = []
                broadcast['channels'][t]['recent_tweets'].append( { 'text': s.text, 'created_at': unicode(s.created_at), 'id': s.id, 'user': { 'id': s.user.id, 'screen_name': s.user.screen_name, 'profile_image_url': s.user.profile_image_url } } )
                # NOTE(review): vote_type is assigned but never read here.
                vote_type = None
                if re.search('\+1', s.text):
                    #vote_type = "positive"
                    self.db.execute("INSERT INTO session_votes (`session`, positive) VALUES (%s, 1) ON DUPLICATE KEY UPDATE positive=positive+1", t)
                elif re.search('\-1', s.text):
                    #vote_type = "negative"
                    self.db.execute("INSERT INTO session_votes (`session`, negative) VALUES (%s, 1) ON DUPLICATE KEY UPDATE negative=negative+1", t)
    # The general channel receives every tweet of this batch.
    broadcast['general']['recent_tweets'] = [ {
        'text': s.text, 'created_at': unicode(s.created_at), 'id': s.id, 'user': { 'id': s.user.id, 'screen_name': s.user.screen_name, 'profile_image_url': s.user.profile_image_url } } for s in statuses ]
    return broadcast
def bulk_load(listkey, tweets):
    """Write the text of each JSON tweet, one per line, to the file named after ``listkey``.

    The file path is printed before writing starts.
    """
    target = "/home/marcua/data/tweets/%s" % (listkey)
    with open(target, "w") as tmpfile:
        print("file %s" % (tmpfile.name))
        for jsontweet in tweets:
            parsed = Status.parse(api, json.loads(jsontweet))
            line = convert_to_utf8_str(parsed.text) + "\n"
            tmpfile.write(line)
import unittest import logging import sys from tweepy.models import Status from TwitterWatcher.tweet_tracker import TweetTracker from tests.database.mock_database import MockDatabase dummy_status = Status() dummy_status._json = { 'id': 1, 'id_str': '1', 'text': 'test', 'user': { 'screen_name': 'test_user' } } dummy_reply = Status() dummy_reply._json = { 'id': 2, 'id_str': '2', 'text': 'test reply', 'user': { 'screen_name': 'test_reply_user' }, 'in_reply_to_status_id': 1 } class TwitterWatcherDatabaseTests(unittest.TestCase): def setUp(self):
def on_data(self, raw_data):
    """Called when raw data is received from connection.

    Counts every message and, once more than 50000 have been seen, closes
    the current status/user/delete CSV files, resets the counter and opens
    a fresh timestamped set under ``./data/`` with header rows written.
    The message is then dispatched to the matching ``on_*`` handler.

    Override this method if you wish to manually handle the stream data.
    Return False to stop stream and close connection: this happens when the
    handler returned False or the message type is unknown. Otherwise True
    is returned.
    """
    self.count += 1
    data = json.loads(raw_data)

    if self.count > 50000:
        # Rotate the three output files.
        self.statusf.close()
        self.userf.close()
        self.deletef.close()
        self.count = 0
        prefix = time.strftime("./data/%Y%m%d%H%M")

        # Status attributes deliberately left out of the CSV:
        # withheld_copyright, withheld_in_countries, withheld_scope,
        # truncated, retweeted, retweet_count, scopes, possibly_sensitive,
        # lang, filter_level, favorited, favorite_count,
        # current_user_retweet, contributors, annotations.
        status_header = [
            'id', 'created_at', 'coordinates',
            'hashtags', 'user_mentions', 'symbols', 'urls',
            'media',
            'in_reply_to_screen_name',
            'in_reply_to_user_id_str',
            'in_reply_to_status_id_str',
            'place', 'retweeted_status_id', 'source',
            'text', 'user id',
        ]
        self.statusf = open(prefix + '_status.csv', 'w', newline='')
        self.statusw = csv.writer(self.statusf)
        self.statusw.writerow(status_header)

        # User attributes deliberately left out of the CSV: the profile_*
        # styling fields, default_profile_image, entities,
        # follow_request_sent, following, notifications,
        # show_all_inline_media, status, utc_offset,
        # withheld_in_countries, withheld_scope.
        user_header = [
            'created_at', 'default_profile',
            'description',
            'favourites_count',
            'followers_count', 'friends_count',
            'geo_enabled', 'id_str', 'is_translator',
            'lang', 'listed_count', 'location',
            'name',
            'protected', 'screen_name',
            'statuses_count', 'time_zone', 'user.url',
            'verified',
        ]
        self.userf = open(prefix + '_user.csv', 'w', newline='')
        self.userw = csv.writer(self.userf)
        self.userw.writerow(user_header)

        self.deletef = open(prefix + '_delete.csv', 'w', newline='')
        self.deletew = csv.writer(self.deletef)
        self.deletew.writerow(['status_id', 'user_id'])

    if 'in_reply_to_status_id' in data:
        verdict = self.on_status(Status.parse(self.api, data))
    elif 'delete' in data:
        removed = data['delete']['status']
        verdict = self.on_delete(removed['id'], removed['user_id'])
    elif 'event' in data:
        verdict = self.on_event(Status.parse(self.api, data))
    elif 'limit' in data:
        verdict = self.on_limit(data['limit']['track'])
    elif 'disconnect' in data:
        verdict = self.on_disconnect(data['disconnect'])
    elif 'warning' in data:
        verdict = self.on_warning(data['warning'])
    else:
        logging.error("Unknown message type: " + str(raw_data))
        verdict = False

    # Only an explicit False (or an unknown message) stops the stream.
    return verdict is not False
# NOTE(review): this fragment was recovered from whitespace-collapsed source;
# the nesting under `else:` is reconstructed - confirm against the original.

# Feature flags and time-of-day markers, all cleared to 0 here.
# Presumably this happens once per tweet inside an enclosing loop - confirm.
hashtag = 0
url = 0
question = 0
exclamation = 0
pos_term = 0
neg_term = 0
pos_emoticon = 0
neg_emoticon = 0
reply = 0
moment_morning = 0
moment_afternoon = 0
moment_evening = 0
moment_night = 0
retweeted = 0

status = Status.parse(api, tweet)
if tweet['id'] in error_list_tweet_ids:
    # Tweet id is on the error list: count it as discarded.
    tweets_discarded_error += 1
elif tweet['text'].startswith("RT @"):
    # Text starts with "RT @": count it as a discarded retweet.
    tweets_discarded_retweet += 1
else:
    tweets_considered += 1
    # Each regex hit bumps the matching counter and raises the matching flag.
    if regex_username.search(tweet['text']) != None:
        tweets_username += 1
        username = 1
    if regex_hashtag.search(tweet['text']) != None:
        tweets_hashtag += 1
        hashtag = 1
    if regex_url.search(tweet['text']) != None:
        tweets_url += 1
def setUp(self):
    """Build the Status fixture used by the tests and keep a pristine copy.

    ``self._status`` is parsed (with ``api=None``) from an inline payload
    describing a reply that quotes another tweet; the quoted tweet carries
    ``extended_tweet`` data with two photo media entries.
    ``self._status_backup`` is a deep copy of the parsed object.
    """
    self._status = Status.parse(
        api=None,
        json={
            'created_at': 'Fri Dec 01 01:53:45 +0000 2017',
            'id': 936412976520876032,
            'id_str': '936412976520876032',
            'text': '@realDonaldTrump https://t.co/0BW86RBIRH',
            'display_text_range': [17, 40],
            'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
            'truncated': False,
            'in_reply_to_status_id': 936395008139198464,
            'in_reply_to_status_id_str': '936395008139198464',
            'in_reply_to_user_id': 25073877,
            'in_reply_to_user_id_str': '25073877',
            'in_reply_to_screen_name': 'realDonaldTrump',
            'user': {
                'id': 29363354,
                'id_str': '29363354',
                'name': 'Kate',
                'screen_name': 'k8_doo',
                'location': 'United States',
                'url': None,
                'description': 'Follow me if you want to know how far I walked, hiked or ran today for #charitymiles',
                'translator_type': 'none',
                'protected': False,
                'verified': False,
                'followers_count': 322,
                'friends_count': 943,
                'listed_count': 3,
                'favourites_count': 26916,
                'statuses_count': 3334,
                'created_at': 'Tue Apr 07 02:56:52 +0000 2009',
                'utc_offset': -18000,
                'time_zone': 'Eastern Time (US & Canada)',
                'geo_enabled': True,
                'lang': 'en',
                'contributors_enabled': False,
                'is_translator': False,
                'profile_background_color': 'EBEBEB',
                'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme7/bg.gif',
                'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme7/bg.gif',
                'profile_background_tile': False,
                'profile_link_color': '990000',
                'profile_sidebar_border_color': 'DFDFDF',
                'profile_sidebar_fill_color': 'F3F3F3',
                'profile_text_color': '333333',
                'profile_use_background_image': True,
                'profile_image_url': 'http://pbs.twimg.com/profile_images/823305825297006593/LhjPdILK_normal.jpg',
                'profile_image_url_https': 'https://pbs.twimg.com/profile_images/823305825297006593/LhjPdILK_normal.jpg',
                'profile_banner_url': 'https://pbs.twimg.com/profile_banners/29363354/1485126381',
                'default_profile': False,
                'default_profile_image': False,
                'following': None,
                'follow_request_sent': None,
                'notifications': None
            },
            'geo': None,
            'coordinates': None,
            'place': {
                'bounding_box': {
                    'coordinates': [[1, 2], [3, 2, 1]]
                }
            },
            'contributors': None,
            'quoted_status_id': 936379603651883008,
            'quoted_status_id_str': '936379603651883008',
            'quoted_status': {
                'created_at': 'Thu Nov 30 23:41:09 +0000 2017',
                'id': 936379603651883008,
                'id_str': '936379603651883008',
                'text': 'On the left: @BarackObama’s National Tree Lighting\nOn the right: @realDonaldTrump’s National Tree Lighting… https://t.co/PcsatAL7Lu',
                'display_text_range': [0, 140],
                'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
                'truncated': True,
                'in_reply_to_status_id': None,
                'in_reply_to_status_id_str': None,
                'in_reply_to_user_id': None,
                'in_reply_to_user_id_str': None,
                'in_reply_to_screen_name': None,
                'user': {
                    'id': 329433192,
                    'id_str': '329433192',
                    'name': 'Jeremy Dickey',
                    'screen_name': 'JeremyDDickey',
                    'location': 'Washington, D.C.',
                    'url': 'https://medium.com/@JeremyDDickey',
                    'description': 'City Government Media Specialist. Aspiring CJ Cregg. Graduate of @MercyhurstU & @LCCLondon. RTs = you got my attention. Tweets are my own. Sarcasm also my own.',
                    'translator_type': 'none',
                    'protected': False,
                    'verified': False,
                    'followers_count': 1860,
                    'friends_count': 2452,
                    'listed_count': 129,
                    'favourites_count': 5864,
                    'statuses_count': 64253,
                    'created_at': 'Tue Jul 05 02:20:11 +0000 2011',
                    'utc_offset': -18000,
                    'time_zone': 'Eastern Time (US & Canada)',
                    'geo_enabled': True,
                    'lang': 'en',
                    'contributors_enabled': False,
                    'is_translator': False,
                    'profile_background_color': '1A1B1F',
                    'profile_background_image_url': 'http://pbs.twimg.com/profile_background_images/474534472373649408/gaee5mbF.png',
                    'profile_background_image_url_https': 'https://pbs.twimg.com/profile_background_images/474534472373649408/gaee5mbF.png',
                    'profile_background_tile': False,
                    'profile_link_color': '3B94D9',
                    'profile_sidebar_border_color': 'FFFFFF',
                    'profile_sidebar_fill_color': '252429',
                    'profile_text_color': '666666',
                    'profile_use_background_image': False,
                    'profile_image_url': 'http://pbs.twimg.com/profile_images/932429063280627713/HnHFID4p_normal.jpg',
                    'profile_image_url_https': 'https://pbs.twimg.com/profile_images/932429063280627713/HnHFID4p_normal.jpg',
                    'profile_banner_url': 'https://pbs.twimg.com/profile_banners/329433192/1443752276',
                    'default_profile': False,
                    'default_profile_image': False,
                    'following': None,
                    'follow_request_sent': None,
                    'notifications': None
                },
                'geo': None,
                'coordinates': None,
                'place': {
                    'id': '6417871953fa5e86',
                    'url': 'https://api.twitter.com/1.1/geo/id/6417871953fa5e86.json',
                    'place_type': 'city',
                    'name': 'Silver Spring',
                    'full_name': 'Silver Spring, MD',
                    'country_code': 'US',
                    'country': 'United States',
                    'bounding_box': {
                        'type': 'Polygon',
                        'coordinates': [[[-77.064086, 38.979735], [-77.064086, 39.036964], [-76.97162, 39.036964], [-76.97162, 38.979735]]]
                    },
                    'attributes': {}
                },
                'contributors': None,
                'is_quote_status': False,
                'extended_tweet': {
                    'full_text': 'On the left: @BarackObama’s National Tree Lighting\nOn the right: @realDonaldTrump’s National Tree Lighting #Christmas https://t.co/wYoLJRO2r6',
                    'display_text_range': [0, 117],
                    'entities': {
                        'hashtags': [{
                            'text': 'Christmas',
                            'indices': [107, 117]
                        }],
                        'urls': [],
                        'user_mentions': [{
                            'screen_name': 'BarackObama',
                            'name': 'Barack Obama',
                            'id': 813286,
                            'id_str': '813286',
                            'indices': [13, 25]
                        }, {
                            'screen_name': 'realDonaldTrump',
                            'name': 'Donald J. Trump',
                            'id': 25073877,
                            'id_str': '25073877',
                            'indices': [65, 81]
                        }],
                        'symbols': [],
                        'media': [{
                            'id': 936379576682450944,
                            'id_str': '936379576682450944',
                            'indices': [118, 141],
                            'media_url': 'http://pbs.twimg.com/media/DP6wQ4sWkAAvTDD.jpg',
                            'media_url_https': 'https://pbs.twimg.com/media/DP6wQ4sWkAAvTDD.jpg',
                            'url': 'https://t.co/wYoLJRO2r6',
                            'display_url': 'pic.twitter.com/wYoLJRO2r6',
                            'expanded_url': 'https://twitter.com/JeremyDDickey/status/936379603651883008/photo/1',
                            'type': 'photo',
                            'sizes': {
                                'medium': {'w': 1200, 'h': 800, 'resize': 'fit'},
                                'small': {'w': 680, 'h': 453, 'resize': 'fit'},
                                'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
                                'large': {'w': 1752, 'h': 1168, 'resize': 'fit'}
                            }
                        }, {
                            'id': 936379575839358977,
                            'id_str': '936379575839358977',
                            'indices': [118, 141],
                            'media_url': 'http://pbs.twimg.com/media/DP6wQ1jWAAE7CdA.jpg',
                            'media_url_https': 'https://pbs.twimg.com/media/DP6wQ1jWAAE7CdA.jpg',
                            'url': 'https://t.co/wYoLJRO2r6',
                            'display_url': 'pic.twitter.com/wYoLJRO2r6',
                            'expanded_url': 'https://twitter.com/JeremyDDickey/status/936379603651883008/photo/1',
                            'type': 'photo',
                            'sizes': {
                                'small': {'w': 680, 'h': 680, 'resize': 'fit'},
                                'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
                                'medium': {'w': 1200, 'h': 1200, 'resize': 'fit'},
                                'large': {'w': 2048, 'h': 2048, 'resize': 'fit'}
                            }
                        }]
                    },
                    'extended_entities': {
                        'media': [{
                            'id': 936379576682450944,
                            'id_str': '936379576682450944',
                            'indices': [118, 141],
                            'media_url': 'http://pbs.twimg.com/media/DP6wQ4sWkAAvTDD.jpg',
                            'media_url_https': 'https://pbs.twimg.com/media/DP6wQ4sWkAAvTDD.jpg',
                            'url': 'https://t.co/wYoLJRO2r6',
                            'display_url': 'pic.twitter.com/wYoLJRO2r6',
                            'expanded_url': 'https://twitter.com/JeremyDDickey/status/936379603651883008/photo/1',
                            'type': 'photo',
                            'sizes': {
                                'medium': {'w': 1200, 'h': 800, 'resize': 'fit'},
                                'small': {'w': 680, 'h': 453, 'resize': 'fit'},
                                'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
                                'large': {'w': 1752, 'h': 1168, 'resize': 'fit'}
                            }
                        }, {
                            'id': 936379575839358977,
                            'id_str': '936379575839358977',
                            'indices': [118, 141],
                            'media_url': 'http://pbs.twimg.com/media/DP6wQ1jWAAE7CdA.jpg',
                            'media_url_https': 'https://pbs.twimg.com/media/DP6wQ1jWAAE7CdA.jpg',
                            'url': 'https://t.co/wYoLJRO2r6',
                            'display_url': 'pic.twitter.com/wYoLJRO2r6',
                            'expanded_url': 'https://twitter.com/JeremyDDickey/status/936379603651883008/photo/1',
                            'type': 'photo',
                            'sizes': {
                                'small': {'w': 680, 'h': 680, 'resize': 'fit'},
                                'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
                                'medium': {'w': 1200, 'h': 1200, 'resize': 'fit'},
                                'large': {'w': 2048, 'h': 2048, 'resize': 'fit'}
                            }
                        }]
                    }
                },
                'quote_count': 56,
                'reply_count': 44,
                'retweet_count': 326,
                'favorite_count': 385,
                'entities': {
                    'hashtags': [],
                    'urls': [{
                        'url': 'https://t.co/PcsatAL7Lu',
                        'expanded_url': 'https://twitter.com/i/web/status/936379603651883008',
                        'display_url': 'twitter.com/i/web/status/9…',
                        'indices': [108, 131]
                    }],
                    'user_mentions': [{
                        'screen_name': 'BarackObama',
                        'name': 'Barack Obama',
                        'id': 813286,
                        'id_str': '813286',
                        'indices': [13, 25]
                    }, {
                        'screen_name': 'realDonaldTrump',
                        'name': 'Donald J. Trump',
                        'id': 25073877,
                        'id_str': '25073877',
                        'indices': [65, 81]
                    }],
                    'symbols': []
                },
                'favorited': False,
                'retweeted': False,
                'possibly_sensitive': False,
                'filter_level': 'low',
                'lang': 'en'
            },
            'is_quote_status': True,
            'quote_count': 0,
            'reply_count': 0,
            'retweet_count': 0,
            'favorite_count': 0,
            'entities': {
                'hashtags': [],
                'urls': [{
                    'url': 'https://t.co/0BW86RBIRH',
                    'expanded_url': 'https://twitter.com/jeremyddickey/status/936379603651883008',
                    'display_url': 'twitter.com/jeremyddickey/…',
                    'indices': [17, 40]
                }],
                'user_mentions': [{
                    'screen_name': 'realDonaldTrump',
                    'name': 'Donald J. Trump',
                    'id': 25073877,
                    'id_str': '25073877',
                    'indices': [0, 16]
                }],
                'symbols': []
            },
            'favorited': False,
            'retweeted': False,
            'possibly_sensitive': False,
            'filter_level': 'low',
            'lang': 'und',
            'timestamp_ms': '1512093225971'
        })
    # Pristine deep copy of the parsed fixture, kept alongside the original.
    self._status_backup = deepcopy(self._status)
from tweepy.models import Status
from teebr.text.utils import normalize_text
from teebr.features import filter_status

# NOTE(review): not referenced in the visible part of the script; presumably
# consumed by clustering code further down - confirm.
CLUSTERS = 40
DIMS = 100

# Normalized text of every tweet accepted by filter_status, in file order.
tweets = []
#tw_count = 0
with open("raw_tweets.jsons") as f:
    # Each line is decoded on its own with loads(), i.e. one JSON document
    # per line.
    for line in f:
        j = loads(line)
        # No API handle is passed to Status.parse here.
        t = Status.parse(None, j)
        if filter_status(t):
            tweet = normalize_text(t.text)
            tweets.append(tweet)
            #tw_count += 1
            #if tw_count >= 2000:
            #    break

# less tweets for the tests
#tweets = tweets[:10000]
print "tweets: %d" % len(tweets)

#hasher = HashingVectorizer(stop_words='english', non_negative=True, norm=None)
#vectorizer = make_pipeline(hasher, TfidfTransformer())
def gen_tuple(jsontweet):
    """Convert one JSON-encoded tweet into a flat summary tuple.

    Args:
        jsontweet: JSON text of a single tweet.

    Returns:
        Tuple ``(author_id, created_at, text, retweeted)``.  ``text`` is
        the tweet text passed through ``convert_to_utf8_str`` and
        ``retweeted`` is True when the parsed tweet has a non-None
        ``retweeted_status`` attribute.
    """
    tweet = Status.parse(api, json.loads(jsontweet))
    # Identity comparison with None (PEP 8) instead of ``!= None``, so the
    # result never depends on how the attribute's type implements ``!=``.
    retweeted = getattr(tweet, 'retweeted_status', None) is not None
    return (tweet.author.id, tweet.created_at,
            convert_to_utf8_str(tweet.text), retweeted)
def on_data(self, data):
    """Parse raw data from Twitter and pass the status object to on_status().

    Called whenever raw data arrives from Twitter.  Returning False stops
    the streaming listener.

    When the module-level flag ``gSave_raw_json`` is true, the raw JSON
    text is also written to ``../json/<id % 1000>/<id>.json``.  Enable it
    only for debugging.

    ``self.on_data_running`` is True while a message is being processed
    and is always reset to False before this method returns or logs an
    error.

    Args:
        data: Raw JSON text received from the stream.

    Returns:
        False when ``self.running`` is False after the message has been
        handled; True otherwise, including when an exception was caught
        and logged.
    """
    try:
        self.on_data_running = True
        try:
            self.log("Get raw data from Twitter", screen_only=True)

            if gSave_raw_json:
                # Save the raw JSON to disk.
                parsed_data = tweepy.utils.import_simplejson().loads(data)
                if "id" not in parsed_data:
                    # e.g. {"limit": {"track": 73}} or a delete notice:
                    # nothing to save, ignore this message.
                    return True
                folder_name = "../json/" + str(parsed_data["id"] % 1000)
                try:
                    if not os.path.exists(folder_name):
                        os.makedirs(folder_name)
                except OSError:
                    self.log("OS ERROR")
                filename = folder_name + "/" + str(parsed_data["id"]) + ".json"
                # ``with`` closes the file even if a write fails.
                with open(filename, "w") as output:
                    output.write(data)
                    output.write('\n')

            # The stock tweepy.StreamListener.on_data is deliberately not
            # called: it may return False and stop the listener.
            # Delete and limit notices are not dispatched here; only
            # status messages are handled.
            if 'in_reply_to_status_id' in data:
                status = Status.parse(self.api, json.loads(data))
                if self.on_status(status) is False:
                    self.log('in_reply_to_status_id in data: on_status() returns False. (this line should never be reached)')

            if self.running == False:  # see: StreamingCrawler.stop_listner()
                # Stop the listener while catching a SIGTERM.
                return False
        finally:
            # Runs on every exit path, including the early ``return True``
            # above that previously left the flag stuck at True, and
            # before the error handler below logs anything.
            self.on_data_running = False
    except Exception as e:
        # sys.exc_info()[2] is the current traceback on Python 2 and 3
        # (sys.exc_traceback exists on Python 2 only).
        self.log("Error:" + str(e), sys.exc_info()[2])
    return True
def __init__(self, status: Status):
    """Copy the fields of interest from a tweepy ``Status`` onto this object.

    Optional fields fall back to a default when ``status`` lacks them:
    ``quote_count`` and ``reply_count`` to 0, ``possibly_sensitive`` to
    False and ``coordinates`` to ``{}``.  Missing entity lists
    (hashtags, user mentions, urls, media) default to ``[]``.

    Args:
        status: Parsed tweet to read from.

    Raises:
        AttributeError: if ``status`` lacks one of the mandatory
            attributes read below (e.g. ``created_at``, ``id``,
            ``entities``, ``user``).
    """
    self.created_at: datetime = get_datetime_from_date(status.created_at)
    self.id: int = status.id

    entities = status.entities
    self.hashtags: list = entities.get("hashtags", [])
    self.user_mentions: list = entities.get("user_mentions", [])
    self.urls: list = entities.get("urls", [])
    self.media: list = entities.get("media", [])

    self.is_quote_status: bool = status.is_quote_status
    self.quote_count: int = getattr(status, "quote_count", 0)
    self.text: str = self.get_text(status=status)
    self.retweet_count: int = status.retweet_count
    self.retweeted: bool = status.retweeted
    self.user_id: int = status.user.id
    self.possibly_sensitive: bool = getattr(
        status, "possibly_sensitive", False)
    self.favorite_count: int = status.favorite_count
    self.favorited: bool = status.favorited
    # Language is detected from the extracted text.
    self.lang: str = detect(self.text)
    self.url: str = f"https://twitter.com/user/status/{status.id}"
    self.sentiment_analysis: dict = {}
    self.source: str = status.source
    self.coordinates: dict = getattr(status, "coordinates", {})
    self.place: dict = self.get_place(status=status)
    # Guarded by its own presence; this was previously keyed on
    # 'quote_count', which raised or returned 0 for the wrong field.
    self.reply_count: int = getattr(status, "reply_count", 0)
    self.uuid: str = self.get_128_uuid(data_str=str(self.id))
    self.user_uuid: str = self.get_128_uuid(data_str=str(self.user_id))
def test_skip_check():
    """A filter built from an empty list returns True for a #nowplaying tweet."""
    accept = skip_check([])
    status = Status()
    status.text = 'This is a test #nowplaying'
    outcome = accept(status)
    assert outcome is True
def test_skip_check_custom(text, passed):
    """The filter built from '#nowplaying' and '@boring' returns ``passed`` for ``text``."""
    accept = skip_check(['#nowplaying', '@boring'])
    status = Status()
    status.text = text
    outcome = accept(status)
    assert outcome is passed
def bulk_load(listkey, tweets): with open('/home/marcua/data/tweets/%s' % (listkey), 'w') as tmpfile: print "file %s" % (tmpfile.name) for jsontweet in tweets: tweet = Status.parse(api, json.loads(jsontweet)) tmpfile.write(convert_to_utf8_str(tweet.text) + "\n")
def save_tweets(self):
    """Consume raw stream messages from ``self.q`` forever and store matches.

    For every status message, the text is taken from the retweeted status
    when there is one, otherwise from the status itself, preferring the
    ``extended_tweet`` full text.  The quoted status text is resolved the
    same way.  For each entry in ``self.keyword_obj_list`` whose keyword
    occurs (case-insensitively) in either text, one ``Tweet`` row is
    created, followed by one ``Knowledge`` row per extracted triple.

    Non-status messages are ignored.  ``self.q.task_done()`` is called
    once per message.  This method never returns normally.
    """

    def full_text(tweet):
        # Prefer the untruncated text when the tweet carries one.
        if hasattr(tweet, 'extended_tweet'):
            return tweet.extended_tweet['full_text']
        return tweet.text

    while True:
        raw_data = self.q.get()
        data = json.loads(raw_data)
        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)

            retweeted_id = 0
            text_source = status
            if hasattr(status, 'retweeted_status'):
                text_source = status.retweeted_status
                retweeted_id = text_source.id
            text = full_text(text_source)

            quoted_id = 0
            quoted_text = ""
            if hasattr(status, "quoted_status"):
                quoted_id = status.quoted_status.id
                quoted_text = full_text(status.quoted_status)

            # Lower-case once instead of once per keyword.
            text_lower = text.lower()
            quoted_lower = quoted_text.lower()

            for keyword_obj in self.keyword_obj_list:
                keyword = keyword_obj.keyword
                needle = keyword.lower()
                if needle not in text_lower and needle not in quoted_lower:
                    continue

                tweet_obj = Tweet.objects.create(
                    keyword=keyword_obj,
                    tweet_id=status.id,
                    created_at=make_aware(status.created_at),
                    user_id=status.user.id,
                    retweeted_id=retweeted_id,
                    quoted_id=quoted_id,
                    text=text,
                    quoted_text=quoted_text)

                lang = detect(keyword)
                # Pre-process into a separate variable.  Overwriting
                # ``text`` here used to leak the processed form into the
                # matching and stored text of every later keyword.
                extraction_text = text
                if lang == 'en':
                    extraction_text = text_utils.pre_process(text)
                triple_list = knowledge_graph_extract.extract_entity(
                    extraction_text, lang=lang)
                for triple in triple_list:
                    Knowledge.objects.create(tweet=tweet_obj,
                                             k_subject=triple[0],
                                             k_predicate=triple[1],
                                             k_object=triple[2],
                                             subject_type=triple[3],
                                             object_type=triple[4])
        self.q.task_done()
def on_data(self, data):
    """Decode the raw JSON payload into a Status and hand it to ``self.handler``."""
    payload = json.loads(data)
    parsed = Status.parse(tweepy_api, payload)
    self.handler(parsed)
def parse_tweet(tweet):
    """Build a Status from ``tweet`` via ``Status.parse`` and set its author.

    The author is always set to ``current_user``.
    """
    parsed = Status.parse(self.api, tweet)
    parsed.author = current_user
    return parsed
def on_data(self, raw_data):
    """Called when raw data is received from connection.

    Override this method if you wish to manually handle the stream data.
    Return False to stop stream and close connection.

    The decoded message is first dispatched to the matching ``on_*``
    handler; a handler returning False, or an unrecognised message type,
    makes this method return False.  Messages that contain a "user" key
    are then echoed to stdout (unless the text starts with "RT"), written
    to ``self.buffer`` and counted.

    NOTE(review): the nesting below was reconstructed from
    whitespace-collapsed source and the snippet may be truncated at the
    end - confirm against the original file.
    """
    data = json.loads(raw_data)
    if 'in_reply_to_status_id' in data:
        status = Status.parse(self.api, data)
        if self.on_status(status) is False:
            return False
    elif 'delete' in data:
        delete = data['delete']['status']
        if self.on_delete(delete['id'], delete['user_id']) is False:
            return False
    elif 'event' in data:
        status = Status.parse(self.api, data)
        if self.on_event(status) is False:
            return False
    elif 'direct_message' in data:
        status = Status.parse(self.api, data)
        if self.on_direct_message(status) is False:
            return False
    elif 'friends' in data:
        if self.on_friends(data['friends']) is False:
            return False
    elif 'limit' in data:
        if self.on_limit(data['limit']['track']) is False:
            return False
    elif 'disconnect' in data:
        if self.on_disconnect(data['disconnect']) is False:
            return False
    elif 'warning' in data:
        if self.on_warning(data['warning']) is False:
            return False
    else:
        # Unrecognised message type: stop the stream.
        return False
    # If this tweet contains text.
    if "user" in list(data.keys()):
        # --------------------------------------------------------------- #
        # Console echo, just for fun: screen name padded towards 20
        # columns (at least one space), then the text with newlines removed.
        uname = data["user"]["screen_name"]
        umsg = data["text"]
        nspc = (20 - len(uname))
        if nspc < 1:
            nspc = 1
        spc = " " * nspc
        if not umsg.startswith("RT"):
            print("<tweet>", uname, spc, umsg.replace("\n", ""))
        # --------------------------------------------------------------- #
        # Write the tweet to the buffer.
        self.buffer.write(raw_data)
        # Running counter.
        self.count += 1
        # If the buffer is full, then cycle the buffer.
        if self.count % self.save_interval == 0:
            self.swap_buffer()
        # If the counter is a check-in interval, do all the check-in tasks.
        if self.count % check_in_interval == 0:
            # Shutdown if the `runtime` `run` value is False.
            if checkin_killstream():
                return False
            # pause if there are too many files in the new tweet directory.
            if not checkin_pausestream():
                return False