def parse(cls, api, json):
    """Build a model instance from a decoded JSON mapping.

    Every key of ``json`` becomes an attribute on the new instance.  A few
    keys are converted on the way in:

    * ``user`` -> parsed user model, stored as both ``author`` and ``user``
    * ``created_at`` -> result of ``parse_datetime``
    * ``source`` -> display text plus a companion ``source_url`` attribute
    * ``retweeted_status`` -> nested ``Status``
    * ``place`` -> ``Place`` model, or ``None`` when the value is ``None``

    The untouched mapping is kept on the instance as ``_json``.
    """
    tweet = cls(api)
    tweet._json = json

    for field, raw in json.items():
        if field == 'user':
            # Use the model factory of the API object when one is supplied.
            author_cls = api.parser.model_factory.user if api else User
            author = author_cls.parse(api, raw)
            tweet.author = author
            tweet.user = author  # DEPRECIATED
            continue

        if field == 'source':
            # A '<' means the source is an HTML anchor rather than plain text.
            is_markup = '<' in raw
            tweet.source = parse_html_value(raw) if is_markup else raw
            tweet.source_url = parse_a_href(raw) if is_markup else None
            continue

        if field == 'created_at':
            converted = parse_datetime(raw)
        elif field == 'retweeted_status':
            converted = Status.parse(api, raw)
        elif field == 'place' and raw is not None:
            converted = Place.parse(api, raw)
        else:
            # Includes a ``place`` of None, which is stored unchanged.
            converted = raw
        setattr(tweet, field, converted)

    return tweet
def parse(cls, api, json):
    """Create a model instance from a decoded JSON mapping.

    Each key in ``json`` is copied onto the new instance as an attribute.
    ``user``, ``created_at``, ``source``, ``retweeted_status``,
    ``quoted_status`` and ``place`` are converted to richer values first;
    the raw mapping itself is stored as ``_json``.
    """
    status = cls(api)
    status._json = json

    # Keys whose value is itself a tweet and is parsed recursively.
    embedded_tweets = ('retweeted_status', 'quoted_status')

    for key, value in json.items():
        if key in embedded_tweets:
            setattr(status, key, Status.parse(api, value))
        elif key == 'created_at':
            status.created_at = parse_datetime(value)
        elif key == 'place':
            status.place = None if value is None else Place.parse(api, value)
        elif key == 'user':
            if api:
                user_model = getattr(api.parser.model_factory, 'user')
            else:
                user_model = User
            parsed_user = user_model.parse(api, value)
            status.author = parsed_user
            status.user = parsed_user  # DEPRECIATED
        elif key == 'source':
            if '<' not in value:
                # Plain-text source: nothing to extract, so no URL.
                status.source = value
                status.source_url = None
            else:
                status.source = parse_html_value(value)
                status.source_url = parse_a_href(value)
        else:
            setattr(status, key, value)

    return status
def parse(cls, api, json):
    """Turn a decoded JSON mapping into a model instance.

    All keys are mirrored as attributes.  The ``user`` entry is exposed as
    ``author`` (and, for backwards compatibility, ``user``); ``created_at``
    goes through ``parse_datetime``; ``source`` is split into ``source`` and
    ``source_url``; ``retweeted_status`` and ``place`` are parsed into their
    own models.  The original mapping is retained as ``_json``.
    """
    instance = cls(api)
    setattr(instance, "_json", json)

    for name, payload in json.items():
        if name == "source":
            if "<" not in payload:
                # Not an HTML anchor: keep the text and record no URL.
                instance.source = payload
                instance.source_url = None
            else:
                instance.source = parse_html_value(payload)
                instance.source_url = parse_a_href(payload)
        elif name == "user":
            model = api.parser.model_factory.user if api else User
            owner = model.parse(api, payload)
            instance.author = owner
            instance.user = owner  # DEPRECIATED
        elif name == "place":
            if payload is None:
                instance.place = None
            else:
                instance.place = Place.parse(api, payload)
        elif name == "retweeted_status":
            instance.retweeted_status = Status.parse(api, payload)
        elif name == "created_at":
            instance.created_at = parse_datetime(payload)
        else:
            setattr(instance, name, payload)

    return instance
def parse(cls, api, json):
    """Build a result object from a decoded JSON mapping.

    Every key becomes an attribute.  ``created_at`` is converted with
    ``parse_search_datetime`` and ``source`` is HTML-unescaped and then
    reduced with ``parse_html_value``; all other values are stored as-is.
    ``api`` is accepted for signature compatibility and is not used.
    """
    entry = cls()
    for name, raw in json.items():
        if name == "created_at":
            raw = parse_search_datetime(raw)
        elif name == "source":
            raw = parse_html_value(unescape_html(raw))
        setattr(entry, name, raw)
    return entry
def parse(cls, api, json):
    """Populate a fresh instance from a decoded JSON mapping.

    Keys are copied to attributes one by one.  The ``created_at`` value is
    passed through ``parse_search_datetime`` and the ``source`` value through
    ``unescape_html`` followed by ``parse_html_value``.  ``api`` is not used.
    """
    parsed = cls()
    for key, value in json.items():
        if key == 'source':
            plain = unescape_html(value)
            setattr(parsed, key, parse_html_value(plain))
            continue
        if key == 'created_at':
            setattr(parsed, key, parse_search_datetime(value))
            continue
        setattr(parsed, key, value)
    return parsed
def parse(cls, api, json):
    """Build a status model from a decoded JSON mapping.

    Every key of ``json`` becomes an attribute on the new instance, with
    these conversions:

    * ``user`` -> ``User.parse`` result, stored as ``author`` and ``user``
    * ``created_at`` -> ``parse_datetime`` result
    * ``source`` -> when it contains ``'<'`` the display text is extracted
      and the link is stored as ``source_url``; otherwise stored unchanged
    * ``retweeted_status`` -> a nested status parsed with this same method

    Args:
        cls: the model class to instantiate (called as ``cls(api)``).
        api: passed to the constructor and to nested ``parse`` calls.
        json: mapping of field names to decoded JSON values.

    Returns:
        The populated ``cls`` instance.
    """
    status = cls(api)
    for k, v in json.items():
        if k == 'user':
            user = User.parse(api, v)
            setattr(status, 'author', user)
            setattr(status, 'user', user)  # DEPRECIATED
        elif k == 'created_at':
            setattr(status, k, parse_datetime(v))
        elif k == 'source':
            if '<' in v:
                setattr(status, k, parse_html_value(v))
                setattr(status, 'source_url', parse_a_href(v))
            else:
                setattr(status, k, v)
        elif k == 'retweeted_status':
            # A retweeted status is itself a tweet, not a user.  It used to
            # be handed to User.parse, which produced a user-typed object
            # carrying tweet fields.  Parse it as a status instead.
            setattr(status, k, cls.parse(api, v))
        else:
            setattr(status, k, v)
    return status
def parse(cls, api, json):
    """Create a status model from a decoded JSON mapping.

    Keys are mirrored as attributes.  ``user`` is parsed with ``User.parse``
    and exposed as ``author`` and ``user``; ``created_at`` is converted with
    ``parse_datetime``; an HTML ``source`` is split into ``source`` and
    ``source_url``; ``retweeted_status`` is parsed with ``Status.parse``.
    """
    tweet = cls(api)
    for field, raw in json.items():
        if field == 'user':
            author = User.parse(api, raw)
            tweet.author = author
            tweet.user = author  # DEPRECIATED
            continue

        if field == 'source' and '<' in raw:
            # HTML anchor: keep the link text and remember the href.
            tweet.source = parse_html_value(raw)
            tweet.source_url = parse_a_href(raw)
            continue

        if field == 'created_at':
            converted = parse_datetime(raw)
        elif field == 'retweeted_status':
            converted = Status.parse(api, raw)
        else:
            # Includes a plain-text ``source``, stored unchanged.
            converted = raw
        setattr(tweet, field, converted)
    return tweet
def on_data(self, data):
    """Handle one raw stream message and append it to the output as a TSV row.

    ``data`` is decoded with ``json.loads``.  A message containing a
    ``delete`` key is ignored.  A message without an ``id`` key is reported
    to ``self.f_log``.  Any other message is treated as a tweet: interaction
    ids, coordinates, first URL / media / hashtag, text, quoted text and user
    profile fields are extracted on a best-effort basis (each extraction
    failure is logged to ``self.f_log`` and leaves the field as ``None``),
    then one 30-column tab-separated line is written to ``self.f_out``.

    Args:
        data: the raw JSON text of one stream message.

    Returns:
        Always ``True`` (marked "keep stream alive" in the original code).

    Raises:
        Whatever ``json.loads`` raises on invalid JSON, and ``KeyError`` when
        a tweet lacks ``created_at`` or ``source`` -- those lookups are not
        guarded, exactly as before.
    """
    def flatten(value):
        # Collapse CR/LF/TAB runs so a value cannot break the TSV layout.
        # ``flags`` must be a keyword: the fourth positional argument of
        # re.sub is ``count``, so passing re.UNICODE there capped the number
        # of replacements instead of setting a flag.
        return re.sub('[\r\n\t]+', ' ', value, flags=re.UNICODE)

    def log(message):
        self.f_log.write(message)

    statuse = json.loads(data)
    if 'delete' in statuse:
        return True  # keep stream alive
    if 'id' not in statuse:
        log('---------------> message no expected %s, %s\n' % (
            datetime.datetime.now(), data))
        return True  # keep stream alive

    statuse_quoted_text = None
    geoloc = None
    url_expanded = None
    url_media = None
    type_media = None
    text = None
    location = None
    description = None
    name = None
    date = parse_datetime(statuse['created_at'])
    app = parse_html_value(statuse['source'])
    entities = None
    relation = None
    quoted_id = None
    replied_id = None
    retweeted_id = None
    user_replied = None
    user_quoted = None
    user_retweeted = None
    first_HT = None

    # Bind the tweet id before any handler needs it.  It used to be assigned
    # inside the first ``try``, so a failure there made every error handler
    # raise NameError instead of logging.
    id_tweet = statuse.get('id_str')
    if id_tweet is None:
        id_tweet = '%s' % statuse['id']

    # get interaction ids
    try:
        if statuse['in_reply_to_status_id_str'] is not None:
            relation = 'reply'
            replied_id = statuse['in_reply_to_status_id_str']
            user_replied = statuse['in_reply_to_screen_name']
        if 'quoted_status' in statuse:
            relation = 'quote'
            quoted_id = statuse['quoted_status_id_str']
            user_quoted = statuse['quoted_status']['user']['screen_name']
        elif 'retweeted_status' in statuse:
            retweeted = statuse['retweeted_status']
            relation = 'RT'
            retweeted_id = retweeted['id_str']
            user_retweeted = retweeted['user']['screen_name']
            if 'quoted_status' in retweeted:
                quoted_id = retweeted['quoted_status']['id_str']
                user_quoted = retweeted['quoted_status']['user']['screen_name']
    except Exception:
        log('---------------->bad interactions ids, id tweet %s at %s\n' % (
            id_tweet, time.asctime()))

    # get geolocation
    if 'coordinates' in statuse:
        coordinates = statuse['coordinates']
        if coordinates is not None:
            try:
                if 'coordinates' in coordinates:
                    list_geoloc = coordinates['coordinates']
                    print(list_geoloc)
                    geoloc = '%s, %s' % (list_geoloc[0], list_geoloc[1])
            except Exception:
                log('---------------->bad coordinates, id tweet %s at %s\n' % (
                    id_tweet, datetime.datetime.now()))

    # get entities; later sources override earlier ones
    if 'entities' in statuse:
        entities = statuse['entities']
    if 'extended_tweet' in statuse:
        entities = statuse['extended_tweet']['entities']
    if 'retweeted_status' in statuse:
        if 'entities' in statuse['retweeted_status']:
            entities = statuse['retweeted_status']['entities']
        if 'extended_tweet' in statuse['retweeted_status']:
            entities = statuse['retweeted_status']['extended_tweet']['entities']
    if entities is not None:
        try:
            urls = entities['urls']
            if len(urls) > 0:
                url_expanded = urls[0]['expanded_url']
        except Exception:
            log('---------------->bad enttity urls, id tweet %s at %s\n' % (
                id_tweet, datetime.datetime.now()))
        try:
            if 'media' in entities:
                list_media = entities['media']
                if len(list_media) > 0:
                    url_media = list_media[0]['media_url']
                    type_media = list_media[0]['type']
        except Exception:
            log('---------------->bad entity media, at %s id tweet %s \n' % (
                datetime.datetime.now(), id_tweet))
        try:
            if 'hashtags' in entities:
                HTs = entities['hashtags']
                if len(HTs) > 0:
                    first_HT = HTs[0]['text']
        except Exception:
            log('---------------->bad entity HT, id tweet %s at %s\n' % (
                id_tweet, time.asctime()))

    # get text
    try:
        if 'text' in statuse:
            text = flatten(statuse['text'])
        if 'extended_tweet' in statuse:
            text = flatten(statuse['extended_tweet']['full_text'])
        if 'retweeted_status' in statuse:
            statuse_RT = statuse['retweeted_status']
            RT_expand = None
            if 'text' in statuse_RT:
                RT_expand = flatten(statuse_RT['text'])
            if 'extended_tweet' in statuse_RT:
                RT_expand = flatten(statuse_RT['extended_tweet']['full_text'])
            # Keep the "RT @user: " prefix and append the retweeted text.
            RT = re.match(r'(^RT @\w+: )', text)
            if RT:
                text = RT.group(1) + RT_expand
    except Exception:
        log('---------------->bad tweet text, at %s id tweet %s \n' % (
            datetime.datetime.now(), id_tweet))

    # get quoted if exist
    try:
        quoted = None
        if 'quoted_status' in statuse:
            quoted = statuse['quoted_status']
        elif 'retweeted_status' in statuse:
            if 'quoted_status' in statuse['retweeted_status']:
                quoted = statuse['retweeted_status']['quoted_status']
        if quoted is not None:
            if 'text' in quoted:
                statuse_quoted_text = quoted['text']
            if 'extended_tweet' in quoted:
                statuse_quoted_text = quoted['extended_tweet']['full_text']
            statuse_quoted_text = flatten(statuse_quoted_text)
    except Exception:
        log('---------------->bad quoted, at %s id tweet %s \n' % (
            datetime.datetime.now(), id_tweet))

    # get user profile
    if 'user' in statuse:
        raw_location = None
        try:
            if 'location' in statuse['user']:
                raw_location = statuse['user']['location']
                if raw_location is not None:
                    location = flatten(raw_location)
        except Exception:
            # Arguments now follow the order of the placeholders
            # (location, time, id); they used to be (time, location, id).
            log('---------------->bad user location:%s , at %s id tweet %s \n' % (
                raw_location, datetime.datetime.now(), id_tweet))
        try:
            if 'description' in statuse['user']:
                if statuse['user']['description'] is not None:
                    description = flatten(statuse['user']['description'])
        except Exception:
            log('---------------->bad user description, at %s id tweet %s \n' % (
                datetime.datetime.now(), id_tweet))
        try:
            if 'name' in statuse['user']:
                if statuse['user']['name'] is not None:
                    name = flatten(statuse['user']['name'])
        except Exception:
            log('---------------->bad user name, at %s id tweet %s \n' % (
                datetime.datetime.now(), id_tweet))

    # NOTE(review): the collapsed source does not show whether this final
    # step sat inside ``if 'user' in statuse``.  It is kept outside so that a
    # tweet with no ``user`` is logged as a format error, not silently lost.
    try:
        user = statuse['user']
        link_tweet = 'https://twitter.com/%s/status/%s' % (
            user['screen_name'], id_tweet)
        # 30 tab-separated columns; the third one carries an '@' prefix.
        row_format = '\t'.join(['%s', '%s', '@%s'] + ['%s'] * 27) + '\n'
        tweet = row_format % (
            id_tweet, date, user['screen_name'], text, app, user['id'],
            user['followers_count'], user['friends_count'],
            user['statuses_count'], location, url_expanded, geoloc, name,
            description, url_media, type_media, statuse_quoted_text,
            relation, replied_id, user_replied, retweeted_id,
            user_retweeted, quoted_id, user_quoted, first_HT,
            statuse['lang'], parse_datetime(user['created_at']),
            user['verified'], user['profile_image_url_https'], link_tweet)
        self.f_out.write(tweet)
        print('---->collected tweet %s' % id_tweet)
    except Exception:
        log('---------------> format error at %s, id-tweet %s\n' % (
            datetime.datetime.now(), id_tweet))
    return True  # keep stream alive
def on_data(self, data):
    """Handle one raw stream message and append it to the output as a TSV row.

    ``data`` is decoded with ``json.loads``.  A message containing a
    ``delete`` key is ignored; a message without an ``id`` key is reported to
    ``self.f_log``.  Otherwise the tweet's quoted text, coordinates, first
    expanded URL, text and user profile fields are extracted and one
    17-column tab-separated line is written to ``self.f_out``.  Any failure
    while extracting or writing is logged to ``self.f_log`` together with
    the whole decoded message, and the tweet is skipped.

    Args:
        data: the raw JSON text of one stream message.

    Returns:
        Always ``True`` (marked "keep stream alive" in the original code).

    Raises:
        Whatever ``json.loads`` raises on invalid JSON.
    """
    def flatten(value):
        # Collapse CR/LF/TAB runs so a value cannot break the TSV layout.
        # ``flags`` must be a keyword: the fourth positional argument of
        # re.sub is ``count``, so passing re.UNICODE there capped the number
        # of replacements instead of setting a flag.
        return re.sub('[\r\n\t]+', ' ', value, flags=re.UNICODE)

    statuse = json.loads(data)
    if 'delete' in statuse:
        return True  # keep stream alive
    if 'id' not in statuse:
        text_error = '---------------> message no expected %s, %s\n' % (
            datetime.datetime.now(), data)
        self.f_log.write(text_error)
        return True  # keep stream alive

    statuse_quoted_text = None
    geoloc = None
    url_expanded = None
    url_media = None   # never filled in by this version; written as None
    type_media = None  # never filled in by this version; written as None
    text = None
    location = None
    description = None
    name = None
    date = None
    app = None
    try:
        id_tweet = statuse['id']
        profile_user = statuse['user']

        # NOTE(review): the collapsed source does not show whether the
        # ``quoted_status`` check was nested under this one; they are kept
        # as independent checks here.
        if 'quoted_status_id' in statuse:
            print(statuse['quoted_status_id'])
        if 'quoted_status' in statuse:
            statuse_quoted = statuse['quoted_status']
            if 'text' in statuse_quoted:
                statuse_quoted_text = flatten(statuse_quoted['text'])
                print('tweet nested %s' % statuse_quoted_text)

        if 'coordinates' in statuse:
            coordinates = statuse['coordinates']
            if coordinates is not None:
                list_geoloc = coordinates['coordinates']
                geoloc = '%s, %s' % (list_geoloc[0], list_geoloc[1])

        if 'entities' in statuse:
            urls = statuse['entities']['urls']
            if len(urls) > 0:
                url_expanded = urls[0]['expanded_url']

        text = flatten(statuse['text'])
        if profile_user['location'] is not None:
            location = flatten(profile_user['location'])
        if profile_user['description'] is not None:
            description = flatten(profile_user['description'])
        if profile_user['name'] is not None:
            name = flatten(profile_user['name'])
        date = parse_datetime(statuse['created_at'])
        app = parse_html_value(statuse['source'])

        # 17 tab-separated columns; the third one carries an '@' prefix.
        row_format = '\t'.join(['%s', '%s', '@%s'] + ['%s'] * 14) + '\n'
        tweet = row_format % (
            id_tweet, date, profile_user['screen_name'], text, app,
            profile_user['id'], profile_user['followers_count'],
            profile_user['friends_count'], profile_user['statuses_count'],
            location, url_expanded, geoloc, name, description, url_media,
            type_media, statuse_quoted_text)
        self.f_out.write(tweet)
        print('---->collected tweet %s' % id_tweet)
    except Exception:
        # ``except Exception`` keeps the best-effort behaviour but no longer
        # swallows KeyboardInterrupt / SystemExit as the bare ``except`` did.
        text_error = '---------------> parser error at %s, id-tweet %s\n' % (
            datetime.datetime.now(), statuse)
        self.f_log.write(text_error)
    return True  # keep stream alive