Пример #1
0
 def parse(cls, api, json):
     """Build a status instance from a decoded JSON mapping.

     Every key of ``json`` becomes an attribute of the new object, and the
     mapping itself is kept on ``_json``.  Some keys are converted first:

     * ``user`` is parsed with the user model (taken from
       ``api.parser.model_factory`` when ``api`` is truthy, else ``User``)
       and stored under both ``author`` and ``user``.
     * ``created_at`` goes through ``parse_datetime``.
     * ``source`` containing ``<`` goes through ``parse_html_value`` and
       ``source_url`` is filled from ``parse_a_href``; otherwise the raw
       value is stored and ``source_url`` is ``None``.
     * ``retweeted_status`` is parsed with ``Status.parse``.
     * a non-``None`` ``place`` is parsed with ``Place.parse``.
     """
     status = cls(api)
     status._json = json
     for key, raw in json.items():
         if key == 'user':
             if api:
                 author_cls = api.parser.model_factory.user
             else:
                 author_cls = User
             author = author_cls.parse(api, raw)
             status.author = author
             status.user = author  # deprecated alias of ``author``
             continue
         if key == 'source':
             has_markup = '<' in raw
             status.source = parse_html_value(raw) if has_markup else raw
             status.source_url = parse_a_href(raw) if has_markup else None
             continue
         # Remaining keys map to exactly one attribute of the same name.
         if key == 'created_at':
             value = parse_datetime(raw)
         elif key == 'retweeted_status':
             value = Status.parse(api, raw)
         elif key == 'place' and raw is not None:
             value = Place.parse(api, raw)
         else:
             value = raw
         setattr(status, key, value)
     return status
Пример #2
0
 def parse(cls, api, json):
     """Create a status object populated from the mapping ``json``.

     The raw mapping is stored on ``_json`` and each key is copied onto the
     object as an attribute.  ``user`` is parsed into a user model (from
     ``api.parser.model_factory`` when ``api`` is truthy, otherwise
     ``User``) and exposed as ``author`` and ``user``; ``created_at`` is
     converted with ``parse_datetime``; ``source`` is split into ``source``
     and ``source_url``; ``retweeted_status`` and ``quoted_status`` are
     parsed with ``Status.parse``; a non-``None`` ``place`` is parsed with
     ``Place.parse``.
     """
     nested_statuses = ('retweeted_status', 'quoted_status')
     status = cls(api)
     status._json = json
     for key, raw in json.items():
         if key == 'user':
             author_cls = api.parser.model_factory.user if api else User
             author = author_cls.parse(api, raw)
             status.author = author
             status.user = author  # deprecated alias of ``author``
         elif key == 'source':
             if '<' not in raw:
                 # Plain-text source: nothing to extract a link from.
                 status.source = raw
                 status.source_url = None
             else:
                 status.source = parse_html_value(raw)
                 status.source_url = parse_a_href(raw)
         else:
             if key == 'created_at':
                 raw = parse_datetime(raw)
             elif key in nested_statuses:
                 raw = Status.parse(api, raw)
             elif key == 'place' and raw is not None:
                 raw = Place.parse(api, raw)
             setattr(status, key, raw)
     return status
Пример #3
0
 def parse(cls, api, json):
     """Return a new status whose attributes mirror the keys of ``json``.

     ``json`` itself is retained on ``_json``.  Special handling:
     ``user`` becomes a parsed user model stored as ``author`` and
     ``user``; ``created_at`` is converted by ``parse_datetime``;
     ``source`` yields both ``source`` and ``source_url``;
     ``retweeted_status`` is parsed by ``Status.parse``; ``place`` is parsed
     by ``Place.parse`` unless it is ``None``.
     """
     status = cls(api)
     status._json = json
     for attr, payload in json.items():
         if attr == "user":
             # Use the API's registered user model when an API is supplied.
             model = User if not api else api.parser.model_factory.user
             parsed_user = model.parse(api, payload)
             status.author = parsed_user
             status.user = parsed_user  # deprecated alias of ``author``
         elif attr == "source":
             if "<" in payload:
                 status.source = parse_html_value(payload)
                 status.source_url = parse_a_href(payload)
             else:
                 status.source = payload
                 status.source_url = None
         else:
             if attr == "created_at":
                 payload = parse_datetime(payload)
             elif attr == "retweeted_status":
                 payload = Status.parse(api, payload)
             elif attr == "place" and payload is not None:
                 payload = Place.parse(api, payload)
             setattr(status, attr, payload)
     return status
Пример #4
0
 def parse(cls, api, json):
     """Create a result object whose attributes mirror the keys of ``json``.

     ``created_at`` is converted with ``parse_search_datetime``; ``source``
     is passed through ``unescape_html`` and then ``parse_html_value``;
     every other value is stored unchanged.  ``api`` is not used here.
     """
     result = cls()
     for field, raw in json.items():
         if field == "created_at":
             value = parse_search_datetime(raw)
         elif field == "source":
             unescaped = unescape_html(raw)
             value = parse_html_value(unescaped)
         else:
             value = raw
         setattr(result, field, value)
     return result
Пример #5
0
 def parse(cls, api, json):
     """Populate a fresh ``cls()`` instance from the mapping ``json``.

     Each key is set as an attribute.  The ``source`` value is first run
     through ``unescape_html`` and ``parse_html_value``, and ``created_at``
     through ``parse_search_datetime``; other values are copied as they
     are.  The ``api`` argument is accepted but not used.
     """
     result = cls()
     for name, raw in json.items():
         if name == 'source':
             raw = parse_html_value(unescape_html(raw))
         elif name == 'created_at':
             raw = parse_search_datetime(raw)
         setattr(result, name, raw)
     return result
Пример #6
0
 def parse(cls, api, json):
     """Build a status instance from a decoded JSON mapping.

     Every key of ``json`` becomes an attribute of the new object, with
     these conversions:

     * ``user`` is parsed with ``User.parse`` and stored under both
       ``author`` and ``user``.
     * ``created_at`` goes through ``parse_datetime``.
     * ``source`` containing ``<`` goes through ``parse_html_value`` and
       ``source_url`` is filled from ``parse_a_href``; otherwise the raw
       value is kept and ``source_url`` is ``None``.
     * ``retweeted_status`` is parsed recursively as a status.
     """
     status = cls(api)
     for k, v in json.items():
         if k == 'user':
             user = User.parse(api, v)
             setattr(status, 'author', user)
             setattr(status, 'user', user)  # deprecated alias of 'author'
         elif k == 'created_at':
             setattr(status, k, parse_datetime(v))
         elif k == 'source':
             if '<' in v:
                 setattr(status, k, parse_html_value(v))
                 setattr(status, 'source_url', parse_a_href(v))
             else:
                 setattr(status, k, v)
                 # Always define source_url so callers can read it without
                 # guarding against AttributeError.
                 setattr(status, 'source_url', None)
         elif k == 'retweeted_status':
             # A retweeted status is itself a status, not a user.
             setattr(status, k, cls.parse(api, v))
         else:
             setattr(status, k, v)
     return status
Пример #7
0
 def parse(cls, api, json):
     """Build a status instance from a decoded JSON mapping.

     Every key of ``json`` becomes an attribute of the new object, with
     these conversions:

     * ``user`` is parsed with ``User.parse`` and stored under both
       ``author`` and ``user``.
     * ``created_at`` goes through ``parse_datetime``.
     * ``source`` containing ``<`` goes through ``parse_html_value`` and
       ``source_url`` is filled from ``parse_a_href``; otherwise the raw
       value is kept and ``source_url`` is ``None``.
     * ``retweeted_status`` is parsed with ``Status.parse``.
     """
     status = cls(api)
     for k, v in json.items():
         if k == 'user':
             user = User.parse(api, v)
             setattr(status, 'author', user)
             setattr(status, 'user', user)  # deprecated alias of 'author'
         elif k == 'created_at':
             setattr(status, k, parse_datetime(v))
         elif k == 'source':
             if '<' in v:
                 setattr(status, k, parse_html_value(v))
                 setattr(status, 'source_url', parse_a_href(v))
             else:
                 setattr(status, k, v)
                 # Always define source_url so callers can read it without
                 # guarding against AttributeError.
                 setattr(status, 'source_url', None)
         elif k == 'retweeted_status':
             setattr(status, k, Status.parse(api, v))
         else:
             setattr(status, k, v)
     return status
Пример #8
0
    def on_data(self, data):
        """Handle one raw stream message and append it to the output file.

        ``data`` is a JSON-encoded string.  Messages containing a ``delete``
        key are ignored.  Messages containing an ``id`` key are treated as
        tweets: interaction ids, coordinates, the first URL / media /
        hashtag entity, the text, any quoted text and the user profile
        fields are extracted and written to ``self.f_out`` as a single
        tab-separated line.  Any other message is reported to
        ``self.f_log``.

        Each extraction step is best-effort: a failure is logged to
        ``self.f_log`` and the fields of that step keep their ``None``
        default.  Returns ``True`` on every handled path so the stream is
        kept alive.
        """
        statuse = json.loads(data)
        if 'delete' in statuse:
            return True  # keep stream alive
        if 'id' in statuse:
            statuse_quoted_text = None
            geoloc = None
            url_expanded = None
            url_media = None
            type_media = None
            text = None
            location = None
            description = None
            name = None
            date = parse_datetime(statuse['created_at'])
            app = parse_html_value(statuse['source'])
            entities = None
            relation = None
            quoted_id = None
            replied_id = None
            retweeted_id = None
            user_replied = None
            user_quoted = None
            user_retweeted = None
            first_HT = None
            # Bind the id before any try block so every error handler below
            # can reference it; fall back to the numeric id when 'id_str'
            # is absent.
            id_tweet = statuse.get('id_str', statuse['id'])
            # get interaction ids
            try:
                if statuse['in_reply_to_status_id_str'] is not None:
                    relation = 'reply'
                    replied_id = statuse['in_reply_to_status_id_str']
                    user_replied = statuse['in_reply_to_screen_name']
                # A quote or retweet takes precedence over 'reply' as the
                # reported relation.
                if 'quoted_status' in statuse:
                    relation = 'quote'
                    quoted_id = statuse['quoted_status_id_str']
                    user_quoted = statuse['quoted_status']['user'][
                        'screen_name']
                elif 'retweeted_status' in statuse:
                    relation = 'RT'
                    retweeted_id = statuse['retweeted_status']['id_str']
                    user_retweeted = statuse['retweeted_status']['user'][
                        'screen_name']
                    if 'quoted_status' in statuse['retweeted_status']:
                        quoted_id = statuse['retweeted_status'][
                            'quoted_status']['id_str']
                        user_quoted = statuse['retweeted_status'][
                            'quoted_status']['user']['screen_name']
            except Exception:
                text_error = '---------------->bad interactions ids, id tweet %s at %s\n' % (
                    id_tweet, time.asctime())
                self.f_log.write(text_error)
            # get geolocation
            if 'coordinates' in statuse:
                coordinates = statuse['coordinates']
                if coordinates is not None:
                    try:
                        if 'coordinates' in coordinates:
                            list_geoloc = coordinates['coordinates']
                            print(list_geoloc)
                            geoloc = '%s, %s' % (list_geoloc[0],
                                                 list_geoloc[1])
                    except Exception:
                        text_error = '---------------->bad coordinates, id tweet %s at %s\n' % (
                            id_tweet, datetime.datetime.now())
                        self.f_log.write(text_error)
            # get entities; later sources override earlier ones, so the
            # retweeted status' extended entities win when present
            if 'entities' in statuse:
                entities = statuse['entities']
            if 'extended_tweet' in statuse:
                entities = statuse['extended_tweet']['entities']
            if 'retweeted_status' in statuse:
                if 'entities' in statuse['retweeted_status']:
                    entities = statuse['retweeted_status']['entities']
                if 'extended_tweet' in statuse['retweeted_status']:
                    entities = statuse['retweeted_status']['extended_tweet'][
                        'entities']
            if entities is not None:
                try:
                    urls = entities['urls']
                    if len(urls) > 0:
                        url_expanded = urls[0]['expanded_url']
                except Exception:
                    text_error = '---------------->bad enttity urls, id tweet %s at %s\n' % (
                        id_tweet, datetime.datetime.now())
                    self.f_log.write(text_error)
                try:
                    if 'media' in entities:
                        list_media = entities['media']
                        if len(list_media) > 0:
                            url_media = list_media[0]['media_url']
                            type_media = list_media[0]['type']
                except Exception:
                    text_error = '---------------->bad entity media, at %s id tweet %s \n' % (
                        datetime.datetime.now(), id_tweet)
                    self.f_log.write(text_error)
                try:
                    if 'hashtags' in entities:
                        HTs = entities['hashtags']
                        if len(HTs) > 0:
                            first_HT = HTs[0]['text']
                except Exception:
                    text_error = '---------------->bad entity HT, id tweet %s at %s\n' % (
                        id_tweet, time.asctime())
                    self.f_log.write(text_error)
            # get text
            try:
                if 'text' in statuse:
                    text = re.sub('[\r\n\t]+', ' ', statuse['text'])
                if 'extended_tweet' in statuse:
                    text = re.sub('[\r\n\t]+', ' ',
                                  statuse['extended_tweet']['full_text'])
                if 'retweeted_status' in statuse:
                    statuse_RT = statuse['retweeted_status']
                    if 'text' in statuse_RT:
                        RT_expand = re.sub('[\r\n\t]+', ' ',
                                           statuse_RT['text'])
                    if 'extended_tweet' in statuse_RT:
                        extended_RT = statuse_RT['extended_tweet']
                        RT_expand = re.sub('[\r\n\t]+', ' ',
                                           extended_RT['full_text'])
                    # Keep the "RT @user: " prefix and replace the rest
                    # with the retweeted status' own text.
                    RT = re.match(r'(^RT @\w+: )', text)
                    if RT:
                        text = RT.group(1) + RT_expand
            except Exception:
                text_error = '---------------->bad tweet text,  at %s id tweet %s \n' % (
                    datetime.datetime.now(), id_tweet)
                self.f_log.write(text_error)
            # get quoted if exist
            try:
                if 'quoted_status' in statuse:
                    if 'text' in statuse['quoted_status']:
                        statuse_quoted_text = statuse['quoted_status']['text']
                    if 'extended_tweet' in statuse['quoted_status']:
                        statuse_quoted_text = statuse['quoted_status'][
                            'extended_tweet']['full_text']
                    statuse_quoted_text = re.sub('[\r\n\t]+', ' ',
                                                 statuse_quoted_text)
                elif 'retweeted_status' in statuse:
                    if 'quoted_status' in statuse['retweeted_status']:
                        if 'text' in statuse['retweeted_status'][
                                'quoted_status']:
                            statuse_quoted_text = statuse['retweeted_status'][
                                'quoted_status']['text']
                        if 'extended_tweet' in statuse['retweeted_status'][
                                'quoted_status']:
                            statuse_quoted_text = statuse['retweeted_status'][
                                'quoted_status']['extended_tweet']['full_text']
                        statuse_quoted_text = re.sub('[\r\n\t]+', ' ',
                                                     statuse_quoted_text)
            except Exception:
                text_error = '---------------->bad quoted,  at %s id tweet %s \n' % (
                    datetime.datetime.now(), id_tweet)
                self.f_log.write(text_error)

            # get user profile
            # NOTE: re.UNICODE must be passed as ``flags=``; the fourth
            # positional argument of re.sub is ``count``.
            if 'user' in statuse:
                try:
                    if 'location' in statuse['user']:
                        if statuse['user']['location'] is not None:
                            location = re.sub('[\r\n\t]+', ' ',
                                              statuse['user']['location'],
                                              flags=re.UNICODE)
                except Exception:
                    text_error = '---------------->bad user location:%s ,  at %s id tweet %s \n' % (
                        statuse['user']['location'], datetime.datetime.now(),
                        id_tweet)
                    self.f_log.write(text_error)
                try:
                    if 'description' in statuse['user']:
                        if statuse['user']['description'] is not None:
                            description = re.sub(
                                '[\r\n\t]+', ' ',
                                statuse['user']['description'],
                                flags=re.UNICODE)
                except Exception:
                    text_error = '---------------->bad user description,  at %s id tweet %s \n' % (
                        datetime.datetime.now(), id_tweet)
                    self.f_log.write(text_error)
                try:
                    if 'name' in statuse['user']:
                        if statuse['user']['name'] is not None:
                            name = re.sub('[\r\n\t]+', ' ',
                                          statuse['user']['name'],
                                          flags=re.UNICODE)
                except Exception:
                    text_error = '---------------->bad user name,  at %s id tweet %s \n' % (
                        datetime.datetime.now(), id_tweet)
                    self.f_log.write(text_error)
            # assemble and write the output row
            try:
                link_tweet = 'https://twitter.com/%s/status/%s' % (
                    statuse['user']['screen_name'], id_tweet)
                tweet = '%s\t%s\t@%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    id_tweet, date, statuse['user']['screen_name'], text, app,
                    statuse['user']['id'], statuse['user']['followers_count'],
                    statuse['user']['friends_count'],
                    statuse['user']['statuses_count'], location, url_expanded,
                    geoloc, name, description, url_media, type_media,
                    statuse_quoted_text, relation, replied_id, user_replied,
                    retweeted_id, user_retweeted, quoted_id, user_quoted,
                    first_HT, statuse['lang'],
                    parse_datetime(statuse['user']['created_at']),
                    statuse['user']['verified'],
                    statuse['user']['profile_image_url_https'], link_tweet)
                self.f_out.write(tweet)
                print('---->collected tweet %s' % id_tweet)
            except Exception:
                text_error = '---------------> format error  at %s, id-tweet %s\n' % (
                    datetime.datetime.now(), id_tweet)
                self.f_log.write(text_error)
        else:
            text_error = '---------------> message no expected  %s,  %s\n' % (
                datetime.datetime.now(), data)
            self.f_log.write(text_error)
        return True  # keep stream alive
Пример #9
0
 def on_data(self, data):
     """Handle one raw stream message and append it to the output file.

     ``data`` is a JSON-encoded string.  Messages containing a ``delete``
     key are ignored.  Messages containing an ``id`` key are treated as
     tweets: the quoted text, coordinates, first expanded URL, text and
     user profile fields are extracted and written to ``self.f_out`` as a
     single tab-separated line.  Any failure while handling a tweet is
     logged to ``self.f_log`` together with the decoded message.  Messages
     with neither key are also reported to ``self.f_log``.

     Returns ``True`` on every handled path so the stream is kept alive.
     """
     statuse = json.loads(data)
     if 'delete' in statuse:
         return True  # keep stream alive
     if 'id' in statuse:
         statuse_quoted_text = None
         geoloc = None
         url_expanded = None
         # The media columns are never populated by this handler; they are
         # still written so the output keeps its column layout.
         url_media = None
         type_media = None
         text = None
         location = None
         description = None
         name = None
         date = None
         app = None
         try:
             id_tweet = statuse['id']
             profile_user = statuse['user']
             if 'quoted_status_id' in statuse:
                 print(statuse['quoted_status_id'])
                 if 'quoted_status' in statuse:
                     statuse_quoted = statuse['quoted_status']
                     if 'text' in statuse_quoted:
                         statuse_quoted_text = statuse_quoted['text']
                         statuse_quoted_text = re.sub(
                             '[\r\n\t]+', ' ', statuse_quoted_text)
                         print('tweet nested %s' % statuse_quoted_text)
             if 'coordinates' in statuse:
                 coordinates = statuse['coordinates']
                 if coordinates is not None:
                     list_geoloc = coordinates['coordinates']
                     geoloc = '%s, %s' % (list_geoloc[0], list_geoloc[1])
             if 'entities' in statuse:
                 entities = statuse['entities']
                 urls = entities['urls']
                 if len(urls) > 0:
                     url = urls[0]
                     url_expanded = url['expanded_url']
             text = re.sub('[\r\n\t]+', ' ', statuse['text'])
             # NOTE: re.UNICODE must be passed as ``flags=``; the fourth
             # positional argument of re.sub is ``count``.
             if profile_user['location'] is not None:
                 location = re.sub('[\r\n\t]+', ' ',
                                   profile_user['location'],
                                   flags=re.UNICODE)
             if profile_user['description'] is not None:
                 description = re.sub('[\r\n\t]+', ' ',
                                      profile_user['description'],
                                      flags=re.UNICODE)
             if profile_user['name'] is not None:
                 name = re.sub('[\r\n\t]+', ' ', profile_user['name'],
                               flags=re.UNICODE)
             date = parse_datetime(statuse['created_at'])
             app = parse_html_value(statuse['source'])
             tweet = '%s\t%s\t@%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                 id_tweet, date, profile_user['screen_name'], text, app,
                 profile_user['id'], profile_user['followers_count'],
                 profile_user['friends_count'],
                 profile_user['statuses_count'], location, url_expanded,
                 geoloc, name, description, url_media, type_media,
                 statuse_quoted_text)
             self.f_out.write(tweet)
             print('---->collected tweet %s' % id_tweet)
         except Exception:
             # Best-effort: record the whole decoded message and carry on.
             text_error = '---------------> parser error  at %s, id-tweet %s\n' % (
                 datetime.datetime.now(), statuse)
             self.f_log.write(text_error)
     else:
         text_error = '---------------> message no expected  %s,  %s\n' % (
             datetime.datetime.now(), data)
         self.f_log.write(text_error)
     return True  # keep stream alive