示例#1
0
    def on_data(self, raw_data):
        """Decode one raw stream message and route it to its handler.

        The payload is HTML-unescaped, parsed as JSON and dispatched on the
        first recognised top-level key. Override this method if you wish to
        handle the stream data manually.

        Returns False as soon as the selected handler returns False (this
        stops the stream and closes the connection); otherwise returns None.
        """
        # NOTE(review): entities are unescaped *before* JSON decoding, so an
        # escaped quote inside a string value would presumably break parsing;
        # HTMLParser.unescape is also unavailable on Python 3.9+ -- confirm.
        data = json.loads(HTMLParser().unescape(raw_data))

        if 'in_reply_to_status_id' in data:
            parsed = Status.parse(self.api, data)
            verdict = self.on_status(parsed)
        elif 'delete' in data:
            removed = data['delete']['status']
            verdict = self.on_delete(removed['id'], removed['user_id'])
        elif 'event' in data:
            parsed = Status.parse(self.api, data)
            verdict = self.on_event(parsed)
        elif 'direct_message' in data:
            parsed = Status.parse(self.api, data)
            verdict = self.on_direct_message(parsed)
        elif 'friends' in data:
            verdict = self.on_friends(data['friends'])
        elif 'limit' in data:
            verdict = self.on_limit(data['limit']['track'])
        elif 'disconnect' in data:
            verdict = self.on_disconnect(data['disconnect'])
        else:
            logging.error("Unknown message type: " + str(raw_data))
            verdict = None

        # Only an explicit False from a handler is propagated to the caller.
        return False if verdict is False else None
示例#2
0
    def on_data(self, raw_data):
        """Decode one raw stream message and route it to its handler.

        The payload is HTML-unescaped, parsed as JSON and dispatched on the
        first recognised top-level key. Override this method if you wish to
        handle the stream data manually.

        Returns False as soon as the selected handler returns False (this
        stops the stream and closes the connection); otherwise returns None.
        """
        # NOTE(review): unescaping precedes JSON decoding; an escaped quote in
        # a string value would presumably corrupt the document -- confirm.
        data = json.loads(HTMLParser().unescape(raw_data))

        if 'in_reply_to_status_id' in data:
            parsed = Status.parse(self.api, data)
            verdict = self.on_status(parsed)
        elif 'delete' in data:
            removed = data['delete']['status']
            verdict = self.on_delete(removed['id'], removed['user_id'])
        elif 'event' in data:
            parsed = Status.parse(self.api, data)
            verdict = self.on_event(parsed)
        elif 'direct_message' in data:
            parsed = Status.parse(self.api, data)
            verdict = self.on_direct_message(parsed)
        elif 'limit' in data:
            verdict = self.on_limit(data['limit']['track'])
        elif 'disconnect' in data:
            verdict = self.on_disconnect(data['disconnect'])
        else:
            logging.error("Unknown message type: " + str(raw_data))
            verdict = None

        # Only an explicit False from a handler is propagated to the caller.
        return False if verdict is False else None
示例#3
0
    def on_data(self, raw_data):
        """Parse a raw stream message and hand it to the matching handler.

        Override this method if you wish to manually handle the stream data.

        Returns False when the selected handler returns False (stop the
        stream and close the connection), True after an IncompleteRead was
        logged and a five second pause was taken, and None otherwise.
        """
        try:
            data = json.loads(raw_data)

            if 'in_reply_to_status_id' in data:
                parsed = Status.parse(self.api, data)
                verdict = self.on_status(parsed)
            elif 'delete' in data:
                removed = data['delete']['status']
                verdict = self.on_delete(removed['id'], removed['user_id'])
            elif 'event' in data:
                parsed = Status.parse(self.api, data)
                verdict = self.on_event(parsed)
            elif 'direct_message' in data:
                parsed = Status.parse(self.api, data)
                verdict = self.on_direct_message(parsed)
            elif 'friends' in data:
                verdict = self.on_friends(data['friends'])
            elif 'limit' in data:
                verdict = self.on_limit(data['limit']['track'])
            elif 'disconnect' in data:
                verdict = self.on_disconnect(data['disconnect'])
            elif 'warning' in data:
                verdict = self.on_warning(data['warning'])
            elif 'scrub_geo' in data:
                verdict = self.on_scrub_geo(data['scrub_geo'])
            elif 'status_withheld' in data:
                verdict = self.on_status_withheld(data['status_withheld'])
            elif 'user_withheld' in data:
                verdict = self.on_user_withheld(data['user_withheld'])
            else:
                insert_logger.error("Unknown message type: %s", raw_data)
                verdict = None

            # Only an explicit False from a handler stops the stream.
            if verdict is False:
                return False
        except IncompleteRead as e:
            # Log the truncated read, back off briefly and keep streaming.
            insert_logger.exception(str(e))
            time.sleep(5)
            return True
示例#4
0
    def on_data(self, raw_data):
        """Parse a raw stream message and dispatch it to the matching handler.

        Messages authored by ``self.my_screen_name`` are skipped (returns
        True). For every other message a ``tweet_text`` entry is added to the
        decoded payload, taken from ``extended_tweet.full_text``, then
        ``text``, then an empty string.

        Returns False when the selected handler returns False, True for the
        account's own tweets, and None otherwise.
        """
        data = json.loads(raw_data)

        # Only tweet payloads carry a 'user' object. Control messages
        # (delete, limit, friends, ...) have none, so the lookup must not
        # raise or the branches below could never be reached.
        author = data.get('user')
        if (isinstance(author, dict) and 'screen_name' in author
                and self.my_screen_name == author['screen_name']):
            return True

        try:
            data['tweet_text'] = data['extended_tweet']['full_text']
        except KeyError:
            data['tweet_text'] = data.get('text', u'')

        if 'retweeted_status' in data:
            self.logger.info('retweet detected')
            status = Status.parse(self.api, data)
            verdict = self.on_status(status, is_retweet=True)
        elif 'in_reply_to_status_id' in data:
            self.logger.info('in_reply_to_status_id')
            status = Status.parse(self.api, data)
            verdict = self.on_status(status)
        elif 'delete' in data:
            delete = data['delete']['status']
            verdict = self.on_delete(delete['id'], delete['user_id'])
        elif 'event' in data:
            status = Status.parse(self.api, data)
            verdict = self.on_event(status)
        elif 'direct_message' in data:
            status = Status.parse(self.api, data)
            verdict = self.on_direct_message(status)
        elif 'friends' in data:
            verdict = self.on_friends(data['friends'])
        elif 'limit' in data:
            verdict = self.on_limit(data['limit']['track'])
        elif 'disconnect' in data:
            verdict = self.on_disconnect(data['disconnect'])
        elif 'warning' in data:
            verdict = self.on_warning(data['warning'])
        else:
            self.logger.error('Unknown message type: %s', str(raw_data))
            verdict = None

        # Only an explicit False from a handler is propagated.
        return False if verdict is False else None
示例#5
0
    def on_data(self, raw_data):
        """Route a raw stream message to the handler for its message type.

        The JSON payload is inspected for the first known top-level key and
        the return value of the corresponding handler is passed back
        unchanged. Unknown message types are logged as errors and yield None.

        https://developer.twitter.com/en/docs/twitter-api/v1/tweets/filter-realtime/guides/streaming-message-types
        """
        data = json.loads(raw_data)

        if "in_reply_to_status_id" in data:
            status = Status.parse(None, data)
            return self.on_status(status)
        if "delete" in data:
            removed = data["delete"]["status"]
            return self.on_delete(removed["id"], removed["user_id"])
        if "disconnect" in data:
            return self.on_disconnect_message(data["disconnect"])
        if "limit" in data:
            return self.on_limit(data["limit"]["track"])

        # Remaining types all pass the value stored under their key straight
        # to the handler; handlers are resolved lazily, in priority order.
        passthrough = (
            ("scrub_geo", "on_scrub_geo"),
            ("status_withheld", "on_status_withheld"),
            ("user_withheld", "on_user_withheld"),
            ("warning", "on_warning"),
        )
        for key, handler_name in passthrough:
            if key in data:
                return getattr(self, handler_name)(data[key])

        log.error("Received unknown message type: %s", raw_data)
示例#6
0
 def on_data(self, data):
     """Classify incoming tweets by language until the sampling window ends.

     Once ``self.started + self.duration`` has passed, a statistics report is
     written to ``<started>-sample.stats`` and False is returned. Before
     that, every raw message containing ``in_reply_to_status_id`` is parsed,
     classified with ``langid`` and written to the above-threshold,
     below-threshold or excluded output, and True is returned. Any other
     message yields None.
     """
     if time.time() >= self.started + self.duration:
         stamp = '%Y-%m-%d %H:%M:%S'
         report = [
             "================= STATISTICS =================",
             "Start time: " + time.strftime(stamp, time.localtime(self.started)),
             "End time: " + time.strftime(stamp, time.localtime(time.time())),
             "First Tweet ID: " + self.first_tweet_id,
             "Last Tweet ID: " + self.last_tweet_id,
             "Language: " + self.lang,
             "Language classification threshold: " + str(self.lang_threshold),
             "Above threshold: " + str(self.counter[self.lang + '-above']),
             "Below threshold: " + str(self.counter[self.lang + '-below']),
             "Exluded: " + str(self.counter['excluded']),
         ]
         # The context manager guarantees the report is flushed and the
         # handle released even if a write fails.
         with open('{0}-sample.stats'.format(int(self.started)), 'w+') as stats:
             stats.writelines(line + "\n" for line in report)
         return False
     elif 'in_reply_to_status_id' in data:
         status = Status.parse(self.api, json.loads(data))
         langclass = langid.classify(status.text)

         # All counters still at zero means this is the first tweet seen.
         if (self.counter == {self.lang + '-above': 0, self.lang + '-below': 0, 'excluded': 0}):
             self.first_tweet_id = str(status.id)
         self.last_tweet_id = str(status.id)

         # langclass[0] is compared with the target language, langclass[1]
         # with the configured threshold.
         if langclass[0] == self.lang:
             if langclass[1] >= self.lang_threshold:
                 self.above_output.write(data)
                 self.counter[self.lang + '-above'] += 1
             else:
                 self.below_output.write(data)
                 self.counter[self.lang + '-below'] += 1
         else:
             self.excl_output.write(data)
             self.counter['excluded'] += 1

         return True
示例#7
0
    def on_data(self, raw_data):
        """Dispatch a raw stream message to the handler for its type.

        Override this method if you wish to manually handle the stream data.
        The handler's return value is passed back unchanged (return False to
        stop the stream and close the connection); unknown message types are
        logged as errors and yield None.
        """
        data = json.loads(raw_data)

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            result = self.on_status(status)
        elif 'delete' in data:
            removed = data['delete']['status']
            result = self.on_delete(removed['id'], removed['user_id'])
        elif 'limit' in data:
            result = self.on_limit(data['limit']['track'])
        elif 'disconnect' in data:
            result = self.on_disconnect(data['disconnect'])
        elif 'warning' in data:
            result = self.on_warning(data['warning'])
        elif 'scrub_geo' in data:
            result = self.on_scrub_geo(data['scrub_geo'])
        elif 'status_withheld' in data:
            result = self.on_status_withheld(data['status_withheld'])
        elif 'user_withheld' in data:
            result = self.on_user_withheld(data['user_withheld'])
        else:
            log.error("Unknown message type: %s", raw_data)
            result = None

        return result
示例#8
0
    def on_data(self, data):
        """Called when raw data is received from connection.

        The message type is detected with substring checks on the raw text
        and the decoded payload is passed to the matching handler. Override
        this method if you wish to manually handle the stream data.

        Returns False when the selected handler returns False (stop the
        stream and close the connection); otherwise returns None.
        """

        if '{"delete"' in data:
            deletion = json.loads(data)['delete']
            # Pick the handler from the payload shape instead of catching
            # every exception: an error raised inside on_delete must reach
            # the caller rather than be retried as a direct-message delete.
            if 'status' in deletion:
                target = deletion['status']
                if self.on_delete(target['id'], target['user_id']) is False:
                    return False
            else:
                target = deletion['direct_message']
                if self.on_direct_message_delete(target['id'], target['user_id']) is False:
                    return False
        elif '{"direct_message"' in data:
            message = DirectMessage.parse(self.api, json.loads(data)['direct_message'])
            if self.on_direct_message(message) is False:
                return False
        elif '{"target"' in data:
            event = json.loads(data)
            if self.on_event(event) is False:
                return False
        elif '{"limit"' in data:
            if self.on_limit(json.loads(data)['limit']['track']) is False:
                return False
        elif '"in_reply_to_user_id_str"' in data:
            status = Status.parse(self.api, json.loads(data))
            if self.on_status(status) is False:
                return False
示例#9
0
    def on_data(self, data):
        """Print one received status, then pause for ``self.delay``.

        When ``self.print_data`` is truthy the raw message is echoed first.
        The message is decoded with ``self.json``, parsed into a Status and
        handed to ``self._print_status``. Returns None.
        """
        if self.print_data:
            print(data)

        payload = self.json.loads(data)
        status = Status.parse(self.api, payload)
        self._print_status(status)
        sleep(self.delay)
示例#10
0
def test_end_to_end(filename, connections, expected, tmpdir):
    """Feed one stored status through LessListener and check its side effects.

    The status loaded from ``filename`` is delivered three times (the last
    time with a bumped id). Exactly one update with text ``expected`` must
    be recorded on the mock API, or none at all when ``expected`` is None.
    Finally each connection must be 'following' exactly when it was
    'followed_by' beforehand.
    """
    api = MockAPI(connections=connections)

    with open(filename, 'r') as handle:
        payload = json.load(fp=handle)
    status = Status.parse(api, payload)

    listener = LessListener(api=api, post_replies=True, gather='tweets', state_dir=str(tmpdir))

    # 100% festivity for all of December
    listener.december_greetings = ('It is cold outside.',)
    listener.festive_probability = 1.
    assert listener.get_festive_probability(dt.date(2016, 12, 5)) == 1.

    listener.on_status(status)

    # Never reply to the same toot twice
    listener.on_status(status)

    # Rate-limit replies for same word
    status.id = status.id + 1
    listener.on_status(status)

    if expected is not None:
        assert len(api._updates) == 1
        update = api._updates[0]
        assert update['status'] == expected
    else:
        assert api._updates == []

    for key, before in connections.items():
        after = api._connections[key]
        assert ('following' in after) == ('followed_by' in before), \
            (key, before, after)
 def process(self, tweet):
     """Print a one-line summary of a raw JSON tweet.

     The line shows the author's screen name, language, status, friend and
     follower counts, followed by the tweet text with every entry of
     ``UNICODE_LINES`` replaced by a space.
     """
     status = Status.parse(api, json.loads(tweet))
     # Apply the replacements cumulatively; restarting from status.text on
     # each pass would keep only the last one and leave ``text`` undefined
     # when UNICODE_LINES is empty.
     text = status.text
     for lf in UNICODE_LINES:
         text = text.replace(lf, ' ')
     # Single-argument print(...) emits the same output on Python 2 and 3.
     print("@%s (%s, %s, %s, %s): %s" % (status.user.screen_name,
         status.user.lang, status.user.statuses_count, status.user.friends_count,
         status.user.followers_count, text))
示例#12
0
文件: listen.py 项目: orygens/bot_crm
 def on_data(self, data):
     """Forward tweets that mention ``testeMagazine`` to ``on_mention``.

     Raw messages that do not contain the text ``entities`` are ignored.
     Otherwise the JSON is decoded, the mentioned screen names collected,
     and the parsed tweet is handed to ``self.on_mention`` when
     ``testeMagazine`` is among them. Always returns None.
     """
     if "entities" not in data:
         return

     payload = json.loads(data)
     mentions = payload["entities"]["user_mentions"]
     mentioned_names = [entry["screen_name"] for entry in mentions]
     if "testeMagazine" not in mentioned_names:
         return

     status = Tweet.parse(self.api, payload)
     self.on_mention(status)
示例#13
0
def get(name, mx=-1):
    """Load statuses from the line-delimited JSON file for ``name``.

    The path is built as ``_prefix + name + _suffix``; each line is decoded
    and parsed into a Status.

    With ``mx <= 0`` every line is read. With a positive ``mx`` reading
    stops once the zero-based line index exceeds ``mx``, i.e. up to
    ``mx + 1`` statuses are returned.
    NOTE(review): if ``mx`` is meant as a maximum count this is off by
    one -- confirm against callers.
    """
    path = "%s%s%s" % (_prefix, name, _suffix)
    limited = mx > 0
    statuses = []
    with open(path) as handle:
        for index, line in enumerate(handle):
            if limited and index > mx:
                break
            statuses.append(Status.parse(None, loads(line)))
    return statuses
示例#14
0
  def on_data(self, raw_data):
    """Called when raw data is received from connection.

    This is where all the data comes first. Normally we could use (inherit)
    the on_data() in tweepy.StreamListener, but it unnecessarily and naively
    reports unknown event types as errors (to simple log); also, we might want
    to tweak it further later on.

    But for now, this is basically taken from tweepy's on_data().

    ``self.processing_data`` is True while the message is being handled and
    is reset to False on every exit path, including an early ``return
    False`` and an exception raised by a handler.

    Return False to stop stream and close connection.
    """

    self.processing_data = True
    try:
      data = json.loads(raw_data)

      if 'in_reply_to_status_id' in data:
        status = Status.parse(self.api, data)
        if self.on_status(status) is False:
          return False
      elif 'delete' in data:
        delete = data['delete']['status']
        if self.on_delete(delete['id'], delete['user_id']) is False:
          return False
      elif 'event' in data:
        status = Status.parse(self.api, data)
        if self.on_event(status) is False:
          return False
      elif 'direct_message' in data:
        status = Status.parse(self.api, data)
        if self.on_direct_message(status) is False:
          return False
      elif 'limit' in data:
        if self.on_limit(data['limit']['track']) is False:
          return False
      elif 'disconnect' in data:
        if self.on_disconnect(data['disconnect']) is False:
          return False
      else:
        log.debug('TwitterBotStreamListener::on_data(): got event/stream data of'
            ' unknown type. Raw data follows:\n%s', data)
    finally:
      # Without this the flag stayed True whenever a handler asked to stop
      # the stream or raised.
      self.processing_data = False
示例#15
0
def test_sanitize(filename, expected):
    """Check that a stored status sanitizes to ``expected``.

    The fixture is read from the ``tests`` directory; the sanitized text
    must contain neither ``&`` nor ``http``.
    """
    api = NonCallableMock()
    fixture_path = os.path.join('tests', filename)

    with open(fixture_path, 'r') as handle:
        status = Status.parse(api, json.load(handle))

    sanitized = get_sanitized_text(status)
    for forbidden in ('&', 'http'):
        assert forbidden not in sanitized
    assert sanitized == expected
示例#16
0
 def _read_from_table(self):
     """Replay every row of ``self.table_name`` through the listener.

     Each selected row is parsed into a Status and passed to
     ``self.listener.on_status``. Iteration stops early once
     ``self.running`` has been cleared (presumably from within the
     listener callback or another thread -- TODO confirm). The connection
     is closed on every exit path.
     """
     self.running = True
     conn = StatusSource.engine.connect()
     try:
         meta = MetaData()
         table = Table(self.table_name, meta, autoload=True, autoload_with=StatusSource.engine)
         results = conn.execute(select([table]))
         for result in results:
             status = Status.parse(None, result)
             self.listener.on_status(status)
             if not self.running:
                 break
     finally:
         # The connection used to be left open when the loop ended or raised.
         conn.close()
示例#17
0
    def on_data(self, raw_data):
        """Decode a raw stream message and dispatch it by message type.

        Handles statuses, deletions, events and direct messages; any other
        message is ignored. Returns False when the selected handler returns
        False, otherwise None.
        """
        # called on receipt of raw data
        data = json.loads(raw_data)

        verdict = None
        if 'in_reply_to_status_id' in data:
            parsed = Status.parse(self.api, data)
            verdict = self.on_status(parsed)
        elif 'delete' in data:
            removed = data['delete']['status']
            verdict = self.on_delete(removed['id'], removed['user_id'])
        elif 'event' in data:
            parsed = Status.parse(self.api, data)
            verdict = self.on_event(parsed)
        elif 'direct_message' in data:
            parsed = Status.parse(self.api, data)
            verdict = self.on_direct_message(parsed)

        # Only an explicit False from a handler is propagated.
        return False if verdict is False else None
示例#18
0
    def test_patched_status(self):
        """Check that the tweepy patch keeps the raw JSON on parsed models.

        After ``patch()`` has been applied, ``Status.parse`` is expected to
        expose the serialized input as ``_raw`` next to the parsed
        attributes.
        """
        from tweepy.models import Status
        from crawler.tweepy_patch import patch
        patch()
        parsed = Status.parse('test_api', {'a': 1, 'b': 2})
        # pylint: disable=E1101,W0212
        self.assertEqual(parsed._raw, '{"a": 1, "b": 2}')
        for attr, value in (('a', 1), ('b', 2)):
            self.assertEqual(getattr(parsed, attr), value)
示例#19
0
def test_save_tweet(tmpdir, id_, expected_filename):
    """Check that ``save_tweet`` creates ``expected_filename`` under ``foo``.

    A minimal status carrying only ``id`` and ``id_str`` is saved by a
    listener whose gather directory is ``tmpdir/foo``.
    """
    api = MockAPI(connections={})
    gather_dir = tmpdir.join('foo')

    listener = LessListener(api=api, gather=str(gather_dir), state_dir=str(tmpdir))
    payload = {'id': int(id_), 'id_str': id_}
    listener.save_tweet(Status.parse(api=api, json=payload))

    saved = tmpdir.join('foo', expected_filename)
    assert saved.check()
示例#20
0
    def on_data(self, data):
        """Dispatch a raw stream message based on substrings of its text.

        Statuses are parsed and passed to ``on_status`` together with the
        raw text; deletions and limit notices go to ``on_delete`` and
        ``on_limit``. Returns False when the selected handler returns
        False, otherwise None.
        """
        verdict = None
        if 'in_reply_to_status_id' in data:
            tweet = Status.parse(self.api, json.loads(data))
            verdict = self.on_status(tweet, data)
        elif 'delete' in data:
            removed = json.loads(data)['delete']['status']
            verdict = self.on_delete(removed['id'], removed['user_id'])
        elif 'limit' in data:
            verdict = self.on_limit(json.loads(data)['limit']['track'])

        # Only an explicit False from a handler is propagated.
        return False if verdict is False else None
示例#21
0
 def on_data(self, data):
     """Dispatch a raw stream message based on substrings of its text.

     Statuses are parsed and passed to ``on_status`` together with the raw
     text; deletions and limit notices go to ``on_delete`` and
     ``on_limit``. Returns False when the selected handler returns False,
     otherwise None.
     """
     verdict = None
     if 'in_reply_to_status_id' in data:
         tweet = Status.parse(self.api, json.loads(data))
         verdict = self.on_status(tweet, data)
     elif 'delete' in data:
         removed = json.loads(data)['delete']['status']
         verdict = self.on_delete(removed['id'], removed['user_id'])
     elif 'limit' in data:
         verdict = self.on_limit(json.loads(data)['limit']['track'])

     # Only an explicit False from a handler is propagated.
     return False if verdict is False else None
示例#22
0
    def on_data(self, raw_data):
        """Decode a raw stream message and dispatch statuses and events.

        When ``self.verbose`` is truthy the decoded payload and a separator
        line are printed first. 'friends', 'delete' and 'user_suspend'
        messages are deliberately ignored; anything else that is not a
        status or event is logged as an error.

        Returns False when the selected handler returns False, otherwise
        None.
        """
        data = json.loads(raw_data)
        if self.verbose:
            # Single-argument print(...) emits the same output on Python 2
            # and 3, unlike the print statement.
            print(data)
            print('-' * 60)

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status) is False:
                return False
        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False
        elif 'friends' in data or 'delete' in data or 'user_suspend' in data:
            pass  # ignore
        else:
            logging.error("Unknown message type: " + str(raw_data))
示例#23
0
    def on_data(self, raw_data):
        """Decode a raw stream message and dispatch statuses and events.

        When ``self.verbose`` is truthy the decoded payload and a separator
        line are printed first. 'friends', 'delete' and 'user_suspend'
        messages are deliberately ignored; anything else that is not a
        status or event is logged as an error.

        Returns False when the selected handler returns False, otherwise
        None.
        """
        data = json.loads(raw_data)
        if self.verbose:
            # Single-argument print(...) emits the same output on Python 2
            # and 3, unlike the print statement.
            print(data)
            print('-' * 60)

        ignored_types = ('friends', 'delete', 'user_suspend')

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status) is False:
                return False
        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False
        elif any(kind in data for kind in ignored_types):
            pass  # ignore
        else:
            logging.error("Unknown message type: " + str(raw_data))
示例#24
0
    def save_status(self, data):
        """Store one geotagged tweet, given as raw JSON, as a Post.

        Nothing is stored when the tweet has no geo information, when its
        author is linked to a local user profile, or when a Post with the
        same external id already exists. A missing twitter Author record is
        created on the fly. The coordinates are reverse-geocoded through
        ``self._latlng2addr``; a failed lookup leaves the address empty.
        Returns None.
        """
        status = Status.parse(self.api, json.loads(data))

        if not status.geo:
            # _datafile.write(data+'\n')
            return

        if Author.objects.filter(owner__userprofile__twitter_id=status.user.id_str).exists():
            # this tweet's author is on stargazer
            return

        author_matches = Author.objects.filter(source=Author.T_TWITTER, external_id=status.user.id_str)
        try:
            author = author_matches.get()
        except Author.DoesNotExist:
            author = Author(
                name=status.user.screen_name,
                avatar_uri=status.user.profile_image_url,
                source=Author.T_TWITTER,
                external_id=status.user.id_str,
            )
            author.save()

        try:
            Post.objects.filter(source=Post.T_TWITTER, external_id=status.id_str).get()
        except Post.DoesNotExist:
            pass
        else:
            # Already stored; nothing more to do.
            return

        coordinates = status.geo["coordinates"]
        lat = float(coordinates[0])
        lng = float(coordinates[1])

        try:
            addr = self._latlng2addr.get(lat, lng)
        except (LatLng2Addr.ConnectionFailed, LatLng2Addr.GeocodingFailed):
            addr = ""

        # twitter api response in UTC
        created = status.created_at + timedelta(hours=8)

        Post(
            content=status.text,
            author=author,
            latitude=lat,
            longitude=lng,
            address=addr,
            source=Post.T_TWITTER,
            external_id=status.id_str,
            external_data=data,
            created=created,
        ).save()
示例#25
0
    def on_data(self, raw_data):
        """Decode one raw stream message and route it to its handler.

        Override this method if you wish to manually handle the stream data.

        Returns False as soon as the selected handler returns False (this
        stops the stream and closes the connection); otherwise returns None.
        Unknown message types are logged as errors.
        """
        data = json.loads(raw_data)

        if "in_reply_to_status_id" in data:
            parsed = Status.parse(self.api, data)
            verdict = self.on_status(parsed)
        elif "delete" in data:
            removed = data["delete"]["status"]
            verdict = self.on_delete(removed["id"], removed["user_id"])
        elif "event" in data:
            parsed = Status.parse(self.api, data)
            verdict = self.on_event(parsed)
        elif "direct_message" in data:
            parsed = Status.parse(self.api, data)
            verdict = self.on_direct_message(parsed)
        elif "friends" in data:
            verdict = self.on_friends(data["friends"])
        elif "limit" in data:
            verdict = self.on_limit(data["limit"]["track"])
        elif "disconnect" in data:
            verdict = self.on_disconnect(data["disconnect"])
        elif "warning" in data:
            verdict = self.on_warning(data["warning"])
        else:
            logging.error("Unknown message type: " + str(raw_data))
            verdict = None

        # Only an explicit False from a handler is propagated to the caller.
        return False if verdict is False else None
示例#26
0
    def post_tweet(self, media_id, status, in_reply_to_status_id):
        """Post a tweet and return the parsed response.

        ``status``, ``media_id`` (sent as ``media_ids``) and
        ``in_reply_to_status_id`` are submitted to ``POST_TWEET_URL``;
        arguments that are None are left out of the request. The JSON
        response is parsed into a Status.
        """
        candidates = (
            ('status', status),
            ('media_ids', media_id),
            ('in_reply_to_status_id', in_reply_to_status_id),
        )
        payload = dict((key, val) for key, val in candidates if val is not None)

        response = self.post(url=POST_TWEET_URL, data=payload)
        return Status.parse(self.api, response.json())
示例#27
0
 def __init__(self, tweetDict):
     """Build the wrapper from a stored tweet dictionary.

     ``tweetDict["tweet"]`` is parsed into ``self.tweet``. The optional
     ``keywords`` and ``groups`` entries are copied onto the instance only
     when present. Every entry of ``tweetDict["tokens"]`` is wrapped in a
     Token and kept in ``self.tokens``; those for which ``filter_token()``
     is falsy are also kept in ``self.filt_tokens``.
     """
     self.tweet = Status.parse(API(), tweetDict["tweet"])

     # Optional fields: the attribute stays unset when the key is absent.
     for optional in ("keywords", "groups"):
         try:
             setattr(self, optional, tweetDict[optional])
         except KeyError:
             pass

     self.tokens = []
     self.filt_tokens = []
     for raw_token in tweetDict["tokens"]:
         wrapped = Token(raw_token)
         self.tokens.append(wrapped)
         if wrapped.filter_token():
             continue
         self.filt_tokens.append(wrapped)
示例#28
0
    def on_data(self, data):
        """Dispatch a raw stream message based on substrings of its text.

        Override this method if you wish to manually handle the stream data.

        Returns False when the selected handler returns False (stop the
        stream and close the connection); otherwise returns None.
        """
        verdict = None
        if 'in_reply_to_status_id' in data:
            tweet = Status.parse(self.api, json.loads(data))
            verdict = self.on_status(tweet)
        elif 'delete' in data:
            removed = json.loads(data)['delete']['status']
            verdict = self.on_delete(removed['id'], removed['user_id'])
        elif 'limit' in data:
            verdict = self.on_limit(json.loads(data)['limit']['track'])

        # Only an explicit False from a handler is propagated.
        return False if verdict is False else None
示例#29
0
    def on_data(self, data):
        """Dispatch a raw stream message based on substrings of its text.

        Override this method if you wish to manually handle the stream data.

        Returns False when the selected handler returns False (stop the
        stream and close the connection); otherwise returns None.
        """
        verdict = None
        if 'in_reply_to_status_id' in data:
            tweet = Status.parse(self.api, json.loads(data))
            verdict = self.on_status(tweet)
        elif 'delete' in data:
            removed = json.loads(data)['delete']['status']
            verdict = self.on_delete(removed['id'], removed['user_id'])
        elif 'limit' in data:
            verdict = self.on_limit(json.loads(data)['limit']['track'])

        # Only an explicit False from a handler is propagated.
        return False if verdict is False else None
示例#30
0
    def on_data(self, data):
        """Dispatch a raw stream message based on substrings of its text.

        Override this method if you wish to manually handle the stream data.

        Returns False when the selected handler returns False (stop the
        stream and close the connection); otherwise returns None.
        """
        verdict = None
        if "in_reply_to_status_id" in data:
            tweet = Status.parse(self.api, json.loads(data))
            verdict = self.on_status(tweet)
        elif "delete" in data:
            removed = json.loads(data)["delete"]["status"]
            verdict = self.on_delete(removed["id"], removed["user_id"])
        elif "limit" in data:
            verdict = self.on_limit(json.loads(data)["limit"]["track"])

        # Only an explicit False from a handler is propagated.
        return False if verdict is False else None
示例#31
0
    def on_data(self, data):
        """Classify incoming tweets by language until the sampling window ends.

        Once ``self.started + self.duration`` has passed, a statistics
        report is written to ``<started>-sample.stats`` and False is
        returned. Before that, every raw message containing
        ``in_reply_to_status_id`` is parsed, classified with ``langid`` and
        written to the above-threshold, below-threshold or excluded output,
        and True is returned. Any other message yields None.
        """
        if time.time() >= self.started + self.duration:
            stamp = '%Y-%m-%d %H:%M:%S'
            report = [
                "================= STATISTICS =================",
                "Start time: " + time.strftime(
                    stamp, time.localtime(self.started)),
                "End time: " + time.strftime(
                    stamp, time.localtime(time.time())),
                "First Tweet ID: " + self.first_tweet_id,
                "Last Tweet ID: " + self.last_tweet_id,
                "Language: " + self.lang,
                "Language classification threshold: " +
                str(self.lang_threshold),
                "Above threshold: " +
                str(self.counter[self.lang + '-above']),
                "Below threshold: " +
                str(self.counter[self.lang + '-below']),
                "Exluded: " + str(self.counter['excluded']),
            ]
            # The context manager guarantees the report is flushed and the
            # handle released even if a write fails.
            stats_path = '{0}-sample.stats'.format(int(self.started))
            with open(stats_path, 'w+') as stats:
                stats.writelines(line + "\n" for line in report)
            return False
        elif 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, json.loads(data))
            langclass = langid.classify(status.text)

            # All counters still at zero means this is the first tweet seen.
            if (self.counter == {
                    self.lang + '-above': 0,
                    self.lang + '-below': 0,
                    'excluded': 0
            }):
                self.first_tweet_id = str(status.id)
            self.last_tweet_id = str(status.id)

            # langclass[0] is compared with the target language,
            # langclass[1] with the configured threshold.
            if langclass[0] == self.lang:
                if langclass[1] >= self.lang_threshold:
                    self.above_output.write(data)
                    self.counter[self.lang + '-above'] += 1
                else:
                    self.below_output.write(data)
                    self.counter[self.lang + '-below'] += 1
            else:
                self.excl_output.write(data)
                self.counter['excluded'] += 1

            return True
 def process(self, tweet):
     """Append a condensed JSON record of ``tweet`` to the current gzip file.

     The raw JSON tweet is parsed and a selection of status and user fields
     is serialized as one line. Whenever ``time.strftime(self.fmt)`` differs
     from ``self.time`` the open file is closed and a new
     ``<base>-<time>.txt.gz`` file under ``self.path`` is opened in append
     mode before writing.
     """
     status = Status.parse(api, json.loads(tweet))
     user = status.user
     record = {
         "screen_name": user.screen_name,
         "id": status.id,
         "lang": user.lang,
         "statuses_count": user.statuses_count,
         "friend_count": user.friends_count,
         "followers_count": user.followers_count,
         "profile_image_url": user.profile_image_url,
         "text": status.text.encode('utf8'),
         "entities": status.entities,
         "created_at": status.created_at.strftime("%Y-%m-%d %H:%M:%S"),
         "geo": status.geo,
         "location": user.location,
         "timezone": user.time_zone,
     }

     stamp = time.strftime(self.fmt)
     if stamp != self.time:
         # The formatted time changed: roll over to a new output file.
         self.time = str(stamp)
         self.fid.close()
         target = os.path.join(self.path, self.base + '-' + self.time + '.txt.gz')
         self.fid = gzip.open(target, 'ab')
     self.fid.write(json.dumps(record) + '\n')
示例#33
0
    def _get_status(self, data):
        """Parse raw JSON and return the status that should be processed.

        Raises TweepError when the author is listed in ``self.block_users``,
        when the tweet is not a retweet and ``self.original`` is falsy, or
        when it is a quote and ``self.quoted`` is falsy. For a retweet the
        retweeted status is used; for an accepted quote the quoted status is
        returned.
        """
        status = Status.parse(self.api, self.json.loads(data))

        if status.user.screen_name in self.block_users:
            raise TweepError(">> User ignored: @%s" % status.user.screen_name)

        try:
            status = status.retweeted_status
        except AttributeError:
            # Not a retweet: keep the original only when that is allowed.
            if not self.original:
                text = self._proccess_status(status.text)
                if len(text) > 75:
                    trunc_text = text[:72] + '...'
                else:
                    trunc_text = text
                raise TweepError(">> Original tweet ignored: %s" % trunc_text)

        if not status.is_quote_status:
            return status

        if not self.quoted:
            text = self._proccess_status(status.text)
            raise TweepError(">> Quoted tweet ignored: %s" % text)

        return status.quoted_status
示例#34
0
  def on_data(self, data):
    """Dispatch a raw stream message based on substrings of its text.

    Override this method if you wish to manually handle the stream data.

    Statuses, direct messages and follow events return whatever their
    handler returns. Deletions and limit notices return False only when
    the handler returns False (stop the stream and close the connection).
    Everything else yields None.
    """
    if 'in_reply_to_status_id' in data:
      status = Status.parse(self.api, json.loads(data))
      return self.on_status(status)

    if 'delete' in data:
      removed = json.loads(data)['delete']['status']
      verdict = self.on_delete(removed['id'], removed['user_id'])
      return False if verdict is False else None

    if 'limit' in data:
      verdict = self.on_limit(json.loads(data)['limit']['track'])
      return False if verdict is False else None

    if 'sender_id' in data and 'recipient_id' in data:
      dm = DirectMessage.parse(self.api, json.loads(data))
      return self.on_dm(dm)

    if 'event' in data and 'follow' in data:
      # The substring test is only a cheap pre-filter; confirm on the
      # decoded payload that this really is a follow event.
      content = json.loads(data)
      if 'event' in content and content['event'] == 'follow':
        return self.on_follow(content)

    return None
示例#35
0
def gen_tuple(jsontweet):
    """Convert one JSON-encoded tweet into a flat tuple.

    Args:
        jsontweet: JSON text of a single tweet.

    Returns:
        ``(author_id, created_at, text, retweeted)`` where ``text`` has been
        passed through ``convert_to_utf8_str`` and ``retweeted`` is True when
        the parsed status carries a non-None ``retweeted_status``.
    """
    tweet = Status.parse(api, json.loads(jsontweet))
    # Identity test: "!= None" would dispatch to the status object's own
    # comparison methods instead of simply checking for absence.
    retweeted = getattr(tweet, 'retweeted_status', None) is not None
    return (tweet.author.id, tweet.created_at, convert_to_utf8_str(tweet.text), retweeted)
示例#36
0
	def update_tweets(self):
		"""Drain ``self.incoming``, store the status updates and build a broadcast dict.

		Queued items containing "in_reply_to_status_id" are parsed into statuses;
		everything else is dropped. Each status is inserted into ``tweets`` and
		linked to its hashtags. For hashtags listed in ``campboard['sessions']``
		the tweet is also appended to that channel and "+1" / "-1" votes are
		counted in ``session_votes``.

		Returns:
			dict: ``{'general': {'recent_tweets': [...]},
			'channels': {tag: {'recent_tweets': [...]}}}``.
		"""
		print("Updating tweets")

		def summarize(status):
			# Build a fresh dict per call so channel and general entries never
			# share mutable state.
			return {
				'text': status.text, 'created_at': unicode(status.created_at), 'id': status.id,
				'user': {
					'id': status.user.id,
					'screen_name': status.user.screen_name,
					'profile_image_url': status.user.profile_image_url
				}
			}

		statuses = []
		try:
			while True:
				# pop() raises IndexError once the queue is empty, which ends the drain.
				item = self.incoming.pop()
				# Ignore anything other than status updates for now
				if "in_reply_to_status_id" in item:
					statuses.append(Status.parse(self.stream.api, json.loads(item)))
		except IndexError:
			pass

		broadcast = {'general': {}, 'channels': {}}

		for s in statuses:
			# Flags are passed as an argument: an inline "(?iu)" at the end of
			# the pattern is rejected by current Python versions.
			tags = re.findall(r"#(\w+)", s.text, re.IGNORECASE | re.UNICODE)
			print("Tags: ")
			print(tags)
			self.db.execute("INSERT INTO tweets (id, user_id, screen_name, profile_image_url, created_at, text) VALUES (%s,%s,%s,%s,%s,%s)", s.id, s.user.id, s.user.screen_name, s.user.profile_image_url, s.created_at, s.text)

			# Establish HABTM relationships, tweets with tags
			for t in tags:
				t = t.lower() # Force all to lowercase
				print("Inserting tag: %s" % t)
				self.db.execute('''INSERT INTO hashtags (tag) VALUES (%s) ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), tag=%s; 
					INSERT INTO hashtags_tweets (hash_id, tweet_id) VALUES (LAST_INSERT_ID(), %s)''', t, t, s.id)

				# Count the votes while we're at it
				if t in campboard['sessions']:
					# Attach the tweet to the broadcast channel. The list is
					# created only once per channel so earlier tweets are kept
					# (it used to be reset to [] for every tweet).
					channel = broadcast['channels'].setdefault(t, {})
					channel.setdefault('recent_tweets', []).append(summarize(s))

					if '+1' in s.text:
						self.db.execute("INSERT INTO session_votes (`session`, positive) VALUES (%s, 1) ON DUPLICATE KEY UPDATE positive=positive+1", t)
					elif '-1' in s.text:
						self.db.execute("INSERT INTO session_votes (`session`, negative) VALUES (%s, 1) ON DUPLICATE KEY UPDATE negative=negative+1", t)

		broadcast['general']['recent_tweets'] = [summarize(s) for s in statuses]

		return broadcast
示例#37
0
    hashtag = 0
    url = 0
    question = 0
    exclamation = 0
    pos_term = 0
    neg_term = 0
    pos_emoticon = 0
    neg_emoticon = 0
    reply = 0
    moment_morning = 0
    moment_afternoon = 0
    moment_evening = 0
    moment_night = 0
    retweeted = 0

    status = Status.parse(api, json.loads(tweet[0]))

    if status.id in error_list_tweet_ids:
        tweets_discarded_error += 1
    elif status.text.startswith("RT @"):
        tweets_discarded_retweet += 1
    else:
        tweets_considered += 1
        if regex_username.search(status.text) != None:
            tweets_username += 1
            username = 1
        if regex_hashtag.search(status.text) != None:
            tweets_hashtag += 1
            hashtag = 1
        if regex_url.search(status.text) != None:
            tweets_url += 1
示例#38
0
from tweepy.models import Status

from teebr.text.utils import normalize_text
from teebr.features import filter_status

# Presumably clustering parameters for the later part of the script; they are
# not referenced in the lines shown here.
CLUSTERS = 40
DIMS = 100

# Normalized text of every tweet accepted by filter_status.
tweets = []

with open("raw_tweets.jsons") as raw_file:
    # One JSON document per line.
    for raw_line in raw_file:
        status = Status.parse(None, loads(raw_line))
        if not filter_status(status):
            continue
        tweets.append(normalize_text(status.text))

# To work with fewer tweets while testing, slice here,
# e.g. tweets = tweets[:10000]

print("tweets: %d" % len(tweets))

#hasher = HashingVectorizer(stop_words='english', non_negative=True, norm=None)
#vectorizer = make_pipeline(hasher, TfidfTransformer())
示例#39
0
    hashtag = 0
    url = 0
    question = 0
    exclamation = 0
    pos_term = 0
    neg_term = 0
    pos_emoticon = 0
    neg_emoticon = 0
    reply = 0
    moment_morning = 0
    moment_afternoon = 0
    moment_evening = 0
    moment_night = 0
    retweeted = 0

    status = Status.parse(api, tweet)

    if tweet['id'] in error_list_tweet_ids:
        tweets_discarded_error += 1
    elif tweet['text'].startswith("RT @"):
        tweets_discarded_retweet += 1
    else:
        tweets_considered += 1
        if regex_username.search(tweet['text']) != None:
            tweets_username += 1
            username = 1
        if regex_hashtag.search(tweet['text']) != None:
            tweets_hashtag += 1
            hashtag = 1
        if regex_url.search(tweet['text']) != None:
            tweets_url += 1
示例#40
0
 def on_data(self, data):
     """Dispatch one raw site-stream payload to the matching ``on_*`` handler.

     Only payloads whose raw text contains both 'for_user' and 'message' are
     handled. The envelope's ``for_user`` value is passed to every handler as
     the user id. Handlers are expected to be overridden; a message that
     matches no branch is printed.

     Returns:
         False as soon as a handler returns False (stops the stream),
         otherwise None.
     """
     if 'for_user' not in data:
         return None
     parsed_data = json.loads(data)
     user_id = parsed_data['for_user']
     if 'message' not in data:
         return None
     message = parsed_data['message']

     if u'friends' in message:
         if self.on_friends(user_id, message['friends']) is False:
             return False
     elif u'event' in message:
         event = message[u'event']
         if event == u'follow':
             if self.on_follow(
                 user_id=user_id,
                 source=message[u'source'],
                 target=message[u'target'],
                 time=message[u'created_at']
             ) is False:
                 return False
         elif event == u'unfollow':
             if self.on_unfollow(
                 user_id,
                 source=message[u'source'],
                 target=message[u'target'],
                 time=message[u'created_at']
             ) is False:
                 return False
         elif event == u'favorite':
             if self.on_favorite(
                 user_id,
                 source=message[u'source'],
                 favorited=message[u'target_object'],
                 time=message[u'created_at']
             ) is False:
                 return False
         elif event == u'unfavorite':
             if self.on_unfavorite(
                 user_id,
                 source=message[u'source'],
                 favorited=message[u'target_object']
             ) is False:
                 return False
     # Need this second check - could be a retweet of
     # a tweet mentioning the user of interest
     elif (u'retweeted_status' in message and
         int(message[u'retweeted_status'][u'user'][u'id']) ==
         int(user_id)
     ):
         if self.on_retweet(user_id, message) is False:
             return False
     elif u'text' in message:
         status = Status.parse(self.api, message)
         # Compare ids as text: the retweet check above already normalises
         # both sides, and a raw comparison never matches when one id is a
         # string and the other an integer.
         if str(status.author.id) == str(user_id):
             # tweet from the user of interest
             if self.on_user_status(user_id, status) is False:
                 return False
         else:   # tweet mentioning the user of interest
             if self.on_user_mention(user_id, status) is False:
                 return False
     elif u'direct_message' in message:
         if self.on_direct_message(
             user_id, message[u'direct_message']
         ) is False:
             return False
     else:
         print(parsed_data)
     return None
示例#41
0
 def parse_tweet(tweet):
     """Build a tweepy ``Status`` from *tweet* and set its author to ``current_user``."""
     parsed = Status.parse(self.api, tweet)
     parsed.author = current_user
     return parsed
示例#42
0
    def save_tweets(self):
        """Consume raw stream messages from ``self.q`` forever and store matches.

        Messages that are not status updates are acknowledged and skipped.
        For a status, the text is taken from the retweeted status when there
        is one (preferring ``extended_tweet['full_text']``), and the quoted
        text is taken from the quoted status the same way. A ``Tweet`` row is
        created for every keyword in ``self.keyword_obj_list`` found in the
        text or quoted text (case-insensitive), followed by one ``Knowledge``
        row per extracted triple.

        ``task_done()`` is called for every item taken from the queue, even
        when processing raises; the exception still propagates to the caller.
        """

        def full_text_of(tweet):
            # Prefer the untruncated text when the payload carries it.
            if hasattr(tweet, 'extended_tweet'):
                return tweet.extended_tweet['full_text']
            return tweet.text

        while True:
            raw_data = self.q.get()
            try:
                data = json.loads(raw_data)
                if 'in_reply_to_status_id' not in data:
                    continue

                status = Status.parse(self.api, data)

                retweeted_id = 0
                text_source = status
                if hasattr(status, 'retweeted_status'):
                    text_source = status.retweeted_status
                    retweeted_id = text_source.id
                text = full_text_of(text_source)

                quoted_text = ""
                quoted_id = 0
                if hasattr(status, "quoted_status"):
                    quoted_id = status.quoted_status.id
                    quoted_text = full_text_of(status.quoted_status)

                # Lower-case once instead of once per keyword.
                text_lower = text.lower()
                quoted_lower = quoted_text.lower()

                for keyword_obj in self.keyword_obj_list:
                    keyword = keyword_obj.keyword
                    needle = keyword.lower()
                    if needle not in text_lower and needle not in quoted_lower:
                        continue

                    tweet_obj = Tweet.objects.create(
                        keyword=keyword_obj,
                        tweet_id=status.id,
                        created_at=make_aware(status.created_at),
                        user_id=status.user.id,
                        retweeted_id=retweeted_id,
                        quoted_id=quoted_id,
                        text=text,
                        quoted_text=quoted_text)

                    lang = detect(keyword)
                    # Keep the pre-processed text in its own name so it does
                    # not leak into the matching and stored text of the
                    # following keywords.
                    if lang == 'en':
                        entity_text = text_utils.pre_process(text)
                    else:
                        entity_text = text

                    triple_list = knowledge_graph_extract.extract_entity(
                        entity_text, lang=lang)
                    for triple in triple_list:
                        Knowledge.objects.create(tweet=tweet_obj,
                                                 k_subject=triple[0],
                                                 k_predicate=triple[1],
                                                 k_object=triple[2],
                                                 subject_type=triple[3],
                                                 object_type=triple[4])
            finally:
                # Every get() must be matched by task_done(), otherwise
                # q.join() would block forever after a failed item.
                self.q.task_done()
 def on_data(self, data):
     """Decode the raw payload into a ``Status`` and pass it to ``self.handler``."""
     self.handler(Status.parse(tweepy_api, json.loads(data)))
    def on_data(self, raw_data):
        """Route one raw stream message, starting new CSV files when due.

        Every call increments ``self.count``. Once it exceeds 50000 the three
        open CSV files are closed, the counter is reset, and a new
        status/user/delete file set named after the current time is opened
        with a header row written to each.

        Returns:
            False when the chosen handler returns False or the message type
            is unknown (this stops the stream); True otherwise.
        """
        self.count += 1

        data = json.loads(raw_data)

        if self.count > 50000:
            self.statusf.close()
            self.userf.close()
            self.deletef.close()
            self.count = 0
            prefix = time.strftime("./data/%Y%m%d%H%M")

            # Only a subset of the available status/user attributes is
            # exported; the remaining ones are deliberately left out.
            status_header = [
                'id', 'created_at', 'coordinates',
                'hashtags', 'user_mentions', 'symbols', 'urls',
                'media',
                'in_reply_to_screen_name',
                'in_reply_to_user_id_str',
                'in_reply_to_status_id_str',
                'place', 'retweeted_status_id', 'source',
                'text', 'user id',
            ]
            user_header = [
                'created_at', 'default_profile',
                'description',
                'favourites_count',
                'followers_count', 'friends_count',
                'geo_enabled', 'id_str', 'is_translator',
                'lang', 'listed_count', 'location',
                'name',
                'protected', 'screen_name',
                'statuses_count', 'time_zone', 'user.url',
                'verified',
            ]
            delete_header = ['status_id', 'user_id']

            self.statusf = open(prefix + '_status.csv', 'w', newline='')
            self.statusw = csv.writer(self.statusf)
            self.statusw.writerow(status_header)

            self.userf = open(prefix + '_user.csv', 'w', newline='')
            self.userw = csv.writer(self.userf)
            self.userw.writerow(user_header)

            self.deletef = open(prefix + '_delete.csv', 'w', newline='')
            self.deletew = csv.writer(self.deletef)
            self.deletew.writerow(delete_header)

        if 'in_reply_to_status_id' in data:
            outcome = self.on_status(Status.parse(self.api, data))
        elif 'delete' in data:
            notice = data['delete']['status']
            outcome = self.on_delete(notice['id'], notice['user_id'])
        elif 'event' in data:
            outcome = self.on_event(Status.parse(self.api, data))
        elif 'limit' in data:
            outcome = self.on_limit(data['limit']['track'])
        elif 'disconnect' in data:
            outcome = self.on_disconnect(data['disconnect'])
        elif 'warning' in data:
            outcome = self.on_warning(data['warning'])
        else:
            logging.error("Unknown message type: " + str(raw_data))
            return False

        # Only an explicit False from a handler stops the stream.
        return outcome is not False
示例#45
0
    def on_data(self, data):
        """Route one raw stream message, passing any full text to ``on_status``.

        The payload is decoded once. If it carries ``extended_tweet.full_text``
        or ``retweeted_status.full_text`` that text is printed and remembered;
        when both are present the retweeted status wins. Status messages are
        then delivered as ``on_status(status, full_text)``; every other
        message kind goes to its own handler.

        Returns:
            False when the chosen handler returns False (stops the stream),
            otherwise None. Unknown message types are logged, not raised.
        """
        # Keep the undecoded payload for the "unknown message" log line.
        raw_data = data
        data = json.loads(raw_data)

        full_text = ""
        # Checked in this order so that a retweet's text overrides the
        # extended tweet's text when both are present.
        for source_key in ("extended_tweet", "retweeted_status"):
            if source_key in data and 'full_text' in data[source_key]:
                full_text = str(data[source_key]["full_text"])
                print(
                    'FUL TEXT *******************************************************************************'
                )
                print(full_text)

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status, full_text) is False:
                return False
        elif 'delete' in data:
            delete = data['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False
        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False
        elif 'direct_message' in data:
            status = Status.parse(self.api, data)
            if self.on_direct_message(status) is False:
                return False
        elif 'friends' in data:
            if self.on_friends(data['friends']) is False:
                return False
        elif 'limit' in data:
            if self.on_limit(data['limit']['track']) is False:
                return False
        elif 'disconnect' in data:
            if self.on_disconnect(data['disconnect']) is False:
                return False
        elif 'warning' in data:
            if self.on_warning(data['warning']) is False:
                return False
        else:
            logging.error("Unknown message type: " + str(raw_data))
示例#46
0
    def setUp(self):
        def load_status():
            with open('./tests/cassettes/sample-tweet.json') as infile:
                status = Status.parse(api=None, json=load(infile))
                return status

        self._status = Status.parse(
            api=None,
            json={
                'created_at': 'Fri Dec 01 01:53:45 +0000 2017',
                'id': 936412976520876032,
                'id_str': '936412976520876032',
                'text': '@realDonaldTrump https://t.co/0BW86RBIRH',
                'display_text_range': [17, 40],
                'source':
                '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
                'truncated': False,
                'in_reply_to_status_id': 936395008139198464,
                'in_reply_to_status_id_str': '936395008139198464',
                'in_reply_to_user_id': 25073877,
                'in_reply_to_user_id_str': '25073877',
                'in_reply_to_screen_name': 'realDonaldTrump',
                'user': {
                    'id': 29363354,
                    'id_str': '29363354',
                    'name': 'Kate',
                    'screen_name': 'k8_doo',
                    'location': 'United States',
                    'url': None,
                    'description':
                    'Follow me if you want to know how far I walked, hiked or ran today for #charitymiles',
                    'translator_type': 'none',
                    'protected': False,
                    'verified': False,
                    'followers_count': 322,
                    'friends_count': 943,
                    'listed_count': 3,
                    'favourites_count': 26916,
                    'statuses_count': 3334,
                    'created_at': 'Tue Apr 07 02:56:52 +0000 2009',
                    'utc_offset': -18000,
                    'time_zone': 'Eastern Time (US & Canada)',
                    'geo_enabled': True,
                    'lang': 'en',
                    'contributors_enabled': False,
                    'is_translator': False,
                    'profile_background_color': 'EBEBEB',
                    'profile_background_image_url':
                    'http://abs.twimg.com/images/themes/theme7/bg.gif',
                    'profile_background_image_url_https':
                    'https://abs.twimg.com/images/themes/theme7/bg.gif',
                    'profile_background_tile': False,
                    'profile_link_color': '990000',
                    'profile_sidebar_border_color': 'DFDFDF',
                    'profile_sidebar_fill_color': 'F3F3F3',
                    'profile_text_color': '333333',
                    'profile_use_background_image': True,
                    'profile_image_url':
                    'http://pbs.twimg.com/profile_images/823305825297006593/LhjPdILK_normal.jpg',
                    'profile_image_url_https':
                    'https://pbs.twimg.com/profile_images/823305825297006593/LhjPdILK_normal.jpg',
                    'profile_banner_url':
                    'https://pbs.twimg.com/profile_banners/29363354/1485126381',
                    'default_profile': False,
                    'default_profile_image': False,
                    'following': None,
                    'follow_request_sent': None,
                    'notifications': None
                },
                'geo': None,
                'coordinates': None,
                'place': {
                    'bounding_box': {
                        'coordinates': [[1, 2], [3, 2, 1]]
                    }
                },
                'contributors': None,
                'quoted_status_id': 936379603651883008,
                'quoted_status_id_str': '936379603651883008',
                'quoted_status': {
                    'created_at': 'Thu Nov 30 23:41:09 +0000 2017',
                    'id': 936379603651883008,
                    'id_str': '936379603651883008',
                    'text':
                    'On the left: @BarackObama’s National Tree Lighting\nOn the right: @realDonaldTrump’s National Tree Lighting… https://t.co/PcsatAL7Lu',
                    'display_text_range': [0, 140],
                    'source':
                    '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
                    'truncated': True,
                    'in_reply_to_status_id': None,
                    'in_reply_to_status_id_str': None,
                    'in_reply_to_user_id': None,
                    'in_reply_to_user_id_str': None,
                    'in_reply_to_screen_name': None,
                    'user': {
                        'id': 329433192,
                        'id_str': '329433192',
                        'name': 'Jeremy Dickey',
                        'screen_name': 'JeremyDDickey',
                        'location': 'Washington, D.C.',
                        'url': 'https://medium.com/@JeremyDDickey',
                        'description':
                        'City Government Media Specialist. Aspiring CJ Cregg. Graduate of @MercyhurstU & @LCCLondon. RTs = you got my attention. Tweets are my own. Sarcasm also my own.',
                        'translator_type': 'none',
                        'protected': False,
                        'verified': False,
                        'followers_count': 1860,
                        'friends_count': 2452,
                        'listed_count': 129,
                        'favourites_count': 5864,
                        'statuses_count': 64253,
                        'created_at': 'Tue Jul 05 02:20:11 +0000 2011',
                        'utc_offset': -18000,
                        'time_zone': 'Eastern Time (US & Canada)',
                        'geo_enabled': True,
                        'lang': 'en',
                        'contributors_enabled': False,
                        'is_translator': False,
                        'profile_background_color': '1A1B1F',
                        'profile_background_image_url':
                        'http://pbs.twimg.com/profile_background_images/474534472373649408/gaee5mbF.png',
                        'profile_background_image_url_https':
                        'https://pbs.twimg.com/profile_background_images/474534472373649408/gaee5mbF.png',
                        'profile_background_tile': False,
                        'profile_link_color': '3B94D9',
                        'profile_sidebar_border_color': 'FFFFFF',
                        'profile_sidebar_fill_color': '252429',
                        'profile_text_color': '666666',
                        'profile_use_background_image': False,
                        'profile_image_url':
                        'http://pbs.twimg.com/profile_images/932429063280627713/HnHFID4p_normal.jpg',
                        'profile_image_url_https':
                        'https://pbs.twimg.com/profile_images/932429063280627713/HnHFID4p_normal.jpg',
                        'profile_banner_url':
                        'https://pbs.twimg.com/profile_banners/329433192/1443752276',
                        'default_profile': False,
                        'default_profile_image': False,
                        'following': None,
                        'follow_request_sent': None,
                        'notifications': None
                    },
                    'geo': None,
                    'coordinates': None,
                    'place': {
                        'id': '6417871953fa5e86',
                        'url':
                        'https://api.twitter.com/1.1/geo/id/6417871953fa5e86.json',
                        'place_type': 'city',
                        'name': 'Silver Spring',
                        'full_name': 'Silver Spring, MD',
                        'country_code': 'US',
                        'country': 'United States',
                        'bounding_box': {
                            'type':
                            'Polygon',
                            'coordinates': [[[-77.064086, 38.979735],
                                             [-77.064086, 39.036964],
                                             [-76.97162, 39.036964],
                                             [-76.97162, 38.979735]]]
                        },
                        'attributes': {}
                    },
                    'contributors': None,
                    'is_quote_status': False,
                    'extended_tweet': {
                        'full_text':
                        'On the left: @BarackObama’s National Tree Lighting\nOn the right: @realDonaldTrump’s National Tree Lighting #Christmas https://t.co/wYoLJRO2r6',
                        'display_text_range': [0, 117],
                        'entities': {
                            'hashtags': [{
                                'text': 'Christmas',
                                'indices': [107, 117]
                            }],
                            'urls': [],
                            'user_mentions': [{
                                'screen_name': 'BarackObama',
                                'name': 'Barack Obama',
                                'id': 813286,
                                'id_str': '813286',
                                'indices': [13, 25]
                            }, {
                                'screen_name': 'realDonaldTrump',
                                'name': 'Donald J. Trump',
                                'id': 25073877,
                                'id_str': '25073877',
                                'indices': [65, 81]
                            }],
                            'symbols': [],
                            'media': [{
                                'id': 936379576682450944,
                                'id_str': '936379576682450944',
                                'indices': [118, 141],
                                'media_url':
                                'http://pbs.twimg.com/media/DP6wQ4sWkAAvTDD.jpg',
                                'media_url_https':
                                'https://pbs.twimg.com/media/DP6wQ4sWkAAvTDD.jpg',
                                'url': 'https://t.co/wYoLJRO2r6',
                                'display_url': 'pic.twitter.com/wYoLJRO2r6',
                                'expanded_url':
                                'https://twitter.com/JeremyDDickey/status/936379603651883008/photo/1',
                                'type': 'photo',
                                'sizes': {
                                    'medium': {
                                        'w': 1200,
                                        'h': 800,
                                        'resize': 'fit'
                                    },
                                    'small': {
                                        'w': 680,
                                        'h': 453,
                                        'resize': 'fit'
                                    },
                                    'thumb': {
                                        'w': 150,
                                        'h': 150,
                                        'resize': 'crop'
                                    },
                                    'large': {
                                        'w': 1752,
                                        'h': 1168,
                                        'resize': 'fit'
                                    }
                                }
                            }, {
                                'id': 936379575839358977,
                                'id_str': '936379575839358977',
                                'indices': [118, 141],
                                'media_url':
                                'http://pbs.twimg.com/media/DP6wQ1jWAAE7CdA.jpg',
                                'media_url_https':
                                'https://pbs.twimg.com/media/DP6wQ1jWAAE7CdA.jpg',
                                'url': 'https://t.co/wYoLJRO2r6',
                                'display_url': 'pic.twitter.com/wYoLJRO2r6',
                                'expanded_url':
                                'https://twitter.com/JeremyDDickey/status/936379603651883008/photo/1',
                                'type': 'photo',
                                'sizes': {
                                    'small': {
                                        'w': 680,
                                        'h': 680,
                                        'resize': 'fit'
                                    },
                                    'thumb': {
                                        'w': 150,
                                        'h': 150,
                                        'resize': 'crop'
                                    },
                                    'medium': {
                                        'w': 1200,
                                        'h': 1200,
                                        'resize': 'fit'
                                    },
                                    'large': {
                                        'w': 2048,
                                        'h': 2048,
                                        'resize': 'fit'
                                    }
                                }
                            }]
                        },
                        'extended_entities': {
                            'media': [{
                                'id': 936379576682450944,
                                'id_str': '936379576682450944',
                                'indices': [118, 141],
                                'media_url':
                                'http://pbs.twimg.com/media/DP6wQ4sWkAAvTDD.jpg',
                                'media_url_https':
                                'https://pbs.twimg.com/media/DP6wQ4sWkAAvTDD.jpg',
                                'url': 'https://t.co/wYoLJRO2r6',
                                'display_url': 'pic.twitter.com/wYoLJRO2r6',
                                'expanded_url':
                                'https://twitter.com/JeremyDDickey/status/936379603651883008/photo/1',
                                'type': 'photo',
                                'sizes': {
                                    'medium': {
                                        'w': 1200,
                                        'h': 800,
                                        'resize': 'fit'
                                    },
                                    'small': {
                                        'w': 680,
                                        'h': 453,
                                        'resize': 'fit'
                                    },
                                    'thumb': {
                                        'w': 150,
                                        'h': 150,
                                        'resize': 'crop'
                                    },
                                    'large': {
                                        'w': 1752,
                                        'h': 1168,
                                        'resize': 'fit'
                                    }
                                }
                            }, {
                                'id': 936379575839358977,
                                'id_str': '936379575839358977',
                                'indices': [118, 141],
                                'media_url':
                                'http://pbs.twimg.com/media/DP6wQ1jWAAE7CdA.jpg',
                                'media_url_https':
                                'https://pbs.twimg.com/media/DP6wQ1jWAAE7CdA.jpg',
                                'url': 'https://t.co/wYoLJRO2r6',
                                'display_url': 'pic.twitter.com/wYoLJRO2r6',
                                'expanded_url':
                                'https://twitter.com/JeremyDDickey/status/936379603651883008/photo/1',
                                'type': 'photo',
                                'sizes': {
                                    'small': {
                                        'w': 680,
                                        'h': 680,
                                        'resize': 'fit'
                                    },
                                    'thumb': {
                                        'w': 150,
                                        'h': 150,
                                        'resize': 'crop'
                                    },
                                    'medium': {
                                        'w': 1200,
                                        'h': 1200,
                                        'resize': 'fit'
                                    },
                                    'large': {
                                        'w': 2048,
                                        'h': 2048,
                                        'resize': 'fit'
                                    }
                                }
                            }]
                        }
                    },
                    'quote_count': 56,
                    'reply_count': 44,
                    'retweet_count': 326,
                    'favorite_count': 385,
                    'entities': {
                        'hashtags': [],
                        'urls': [{
                            'url': 'https://t.co/PcsatAL7Lu',
                            'expanded_url':
                            'https://twitter.com/i/web/status/936379603651883008',
                            'display_url': 'twitter.com/i/web/status/9…',
                            'indices': [108, 131]
                        }],
                        'user_mentions': [{
                            'screen_name': 'BarackObama',
                            'name': 'Barack Obama',
                            'id': 813286,
                            'id_str': '813286',
                            'indices': [13, 25]
                        }, {
                            'screen_name': 'realDonaldTrump',
                            'name': 'Donald J. Trump',
                            'id': 25073877,
                            'id_str': '25073877',
                            'indices': [65, 81]
                        }],
                        'symbols': []
                    },
                    'favorited': False,
                    'retweeted': False,
                    'possibly_sensitive': False,
                    'filter_level': 'low',
                    'lang': 'en'
                },
                'is_quote_status': True,
                'quote_count': 0,
                'reply_count': 0,
                'retweet_count': 0,
                'favorite_count': 0,
                'entities': {
                    'hashtags': [],
                    'urls': [{
                        'url': 'https://t.co/0BW86RBIRH',
                        'expanded_url':
                        'https://twitter.com/jeremyddickey/status/936379603651883008',
                        'display_url': 'twitter.com/jeremyddickey/…',
                        'indices': [17, 40]
                    }],
                    'user_mentions': [{
                        'screen_name': 'realDonaldTrump',
                        'name': 'Donald J. Trump',
                        'id': 25073877,
                        'id_str': '25073877',
                        'indices': [0, 16]
                    }],
                    'symbols': []
                },
                'favorited': False,
                'retweeted': False,
                'possibly_sensitive': False,
                'filter_level': 'low',
                'lang': 'und',
                'timestamp_ms': '1512093225971'
            })

        self._status_backup = deepcopy(self._status)
示例#47
0
def bulk_load(listkey, tweets, directory="/home/marcua/data/tweets"):
    """Write the text of each JSON-encoded tweet to a file named after listkey.

    Args:
        listkey: name of the output file created inside ``directory``.
        tweets: iterable of JSON strings, each parsed with ``Status.parse``.
        directory: folder that receives the file; defaults to the path the
            function originally had hard-coded.

    One line is written per tweet: its ``text`` passed through
    ``convert_to_utf8_str``, followed by a newline.
    """
    with open("%s/%s" % (directory, listkey), "w") as tmpfile:
        # Call-form print of a single string behaves the same on Python 2 and 3.
        print("file %s" % (tmpfile.name))
        for jsontweet in tweets:
            tweet = Status.parse(api, json.loads(jsontweet))
            tmpfile.write(convert_to_utf8_str(tweet.text) + "\n")
示例#48
0
 def load_status():
     """Return the Status parsed from the sample-tweet cassette file.

     The JSON file is decoded with ``load`` and handed to ``Status.parse``
     with ``api=None``.
     """
     with open('./tests/cassettes/sample-tweet.json') as cassette:
         return Status.parse(api=None, json=load(cassette))
示例#49
0
 # Feature flags start at 0; the visible checks below set `username` and
 # `hashtag` to 1 when their pattern matches the tweet text.
 # NOTE(review): `username` is assigned below but not reset here — presumably
 # it is initialised just above this fragment; confirm.
 hashtag = 0
 url = 0
 question = 0
 exclamation = 0
 pos_term = 0
 neg_term = 0
 pos_emoticon = 0
 neg_emoticon = 0
 reply = 0
 moment_morning = 0
 moment_afternoon = 0
 moment_evening = 0
 moment_night = 0
 retweeted = 0
 
 # tweet[0] holds the tweet as a JSON string.
 status = Status.parse(api, json.loads(tweet[0]))
 
 if status.id in error_list_tweet_ids:
     tweets_discarded_error += 1
 elif status.text.startswith("RT @"):
     # Text starting with "RT @" is counted as a retweet and not analysed.
     tweets_discarded_retweet += 1
 else:
     tweets_considered += 1
     # search() returns None when there is no match; compare by identity.
     if regex_username.search(status.text) is not None:
         tweets_username += 1
         username = 1
     if regex_hashtag.search(status.text) is not None:
         tweets_hashtag += 1
         hashtag = 1
     if regex_url.search(status.text) is not None:
         tweets_url += 1
示例#50
0
 # Feature flags start at 0; the visible checks below set `username` and
 # `hashtag` to 1 when their pattern matches the tweet text.
 # NOTE(review): `username` is assigned below but not reset here — presumably
 # it is initialised just above this fragment; confirm.
 hashtag = 0
 url = 0
 question = 0
 exclamation = 0
 pos_term = 0
 neg_term = 0
 pos_emoticon = 0
 neg_emoticon = 0
 reply = 0
 moment_morning = 0
 moment_afternoon = 0
 moment_evening = 0
 moment_night = 0
 retweeted = 0
 
 # Here `tweet` is already a decoded mapping; the checks below read it directly.
 status = Status.parse(api, tweet)
 
 if tweet['id'] in error_list_tweet_ids:
     tweets_discarded_error += 1
 elif tweet['text'].startswith("RT @"):
     # Text starting with "RT @" is counted as a retweet and not analysed.
     tweets_discarded_retweet += 1
 else:
     tweets_considered += 1
     # search() returns None when there is no match; compare by identity.
     if regex_username.search(tweet['text']) is not None:
         tweets_username += 1
         username = 1
     if regex_hashtag.search(tweet['text']) is not None:
         tweets_hashtag += 1
         hashtag = 1
     if regex_url.search(tweet['text']) is not None:
         tweets_url += 1
示例#51
0
    def on_data(self, raw_data):
        """Dispatch one raw stream message and buffer it when it has a user.

        The message is decoded from JSON and routed to the matching ``on_*``
        handler according to the first recognised key. A message that carries
        a ``user`` key is then echoed to stdout (unless its text starts with
        "RT"), written to ``self.buffer`` and counted in ``self.count``.

        Returns:
            False to stop the stream and close the connection: when a handler
            returns False, when no known key is present, or when a periodic
            check-in asks to stop. Otherwise the method falls through and
            returns None.
        """
        data = json.loads(raw_data)

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status) is False:
                return False

        elif 'delete' in data:
            delete = data['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False

        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False

        elif 'direct_message' in data:
            status = Status.parse(self.api, data)
            if self.on_direct_message(status) is False:
                return False

        elif 'friends' in data:
            if self.on_friends(data['friends']) is False:
                return False

        elif 'limit' in data:
            if self.on_limit(data['limit']['track']) is False:
                return False

        elif 'disconnect' in data:
            if self.on_disconnect(data['disconnect']) is False:
                return False

        elif 'warning' in data:
            if self.on_warning(data['warning']) is False:
                return False

        else:
            # No recognised key: stop the stream.
            return False

        # Only messages that carry a "user" key are echoed and buffered.
        if "user" in data:

            # --------------------------------------------------------------- #
            # Console echo, just for fun.
            uname = data["user"]["screen_name"]
            umsg = data["text"]
            # Pad after the name towards a 20-character column, keeping at
            # least one space for long names.
            spc = " " * max(1, 20 - len(uname))
            if not umsg.startswith("RT"):
                print("<tweet>", uname, spc, umsg.replace("\n", ""))
            # --------------------------------------------------------------- #

            # Write the tweet to the buffer.
            self.buffer.write(raw_data)

            # Running counter.
            self.count += 1

            # If the buffer is full, then cycle the buffer.
            if self.count % self.save_interval == 0:
                self.swap_buffer()

            # If the counter is a check-in interval, do all the check-in tasks.
            if self.count % check_in_interval == 0:

                # Shutdown if the `runtime` `run` value is False.
                if checkin_killstream():
                    return False

                # pause if there are too many files in the new tweet directory.
                if not checkin_pausestream():
                    return False
示例#52
0
def bulk_load(listkey, tweets, directory='/home/marcua/data/tweets'):
    """Write the text of each JSON-encoded tweet to a file named after listkey.

    Args:
        listkey: name of the output file created inside ``directory``.
        tweets: iterable of JSON strings, each parsed with ``Status.parse``.
        directory: folder that receives the file; defaults to the path the
            function originally had hard-coded.

    One line is written per tweet: its ``text`` passed through
    ``convert_to_utf8_str``, followed by a newline.
    """
    with open('%s/%s' % (directory, listkey), 'w') as tmpfile:
        # Call-form print of a single string behaves the same on Python 2 and 3.
        print("file %s" % (tmpfile.name))
        for jsontweet in tweets:
            tweet = Status.parse(api, json.loads(jsontweet))
            tmpfile.write(convert_to_utf8_str(tweet.text) + "\n")
    def on_data(self, data):
        '''Parse raw data from Twitter and pass the status object to on_status().

        Called when raw data is received from Twitter. If this function
        returns False, listening to the stream stops.

        gSave_raw_json: if true, the raw JSON text is also written under
                        ../json/<id % 1000>/<id>.json.
                        Set it to true only if you would like to debug.

        ``self.on_data_running`` is True while a message is being processed
        and is always reset to False before returning.

        Returns False only when ``self.running`` is False; otherwise True,
        including when an exception was caught and logged.
        '''
        try:
            self.on_data_running = True
            self.log("Get raw data from Twitter", screen_only=True)

            if gSave_raw_json:
                ### save the json to disk ###
                parsed_data = tweepy.utils.import_simplejson().loads(data)

                # May be {"limit":{"track":73}} or a delete notice: such data
                # has no "id" and is ignored.
                if "id" not in parsed_data:
                    return True

                # Spread the files over sub-folders keyed by id modulo 1000.
                folder_name = parsed_data["id"] % 1000
                folder_path = "../json/" + str(folder_name)

                try:
                    if not os.path.exists(folder_path):
                        os.makedirs(folder_path)
                except OSError:
                    self.log("OS ERROR")

                filename = folder_path + "/" + str(parsed_data["id"]) + ".json"
                # The context manager closes the file even if a write fails.
                with open(filename, "w") as output:
                    output.write(data)
                    output.write('\n')
                ### done ###

            # Chucheng 4/25/2011:
            #   We must override the method, because the original one might
            #   return False and so stop the listener.
            #   In short, you cannot simply call:
            #       tweepy.StreamListener.on_data(self, data)
            # Note: this is a substring test on the raw text, not a key lookup.
            if 'in_reply_to_status_id' in data:
                status = Status.parse(self.api, json.loads(data))
                if self.on_status(status) is False:  # Trigger on_status now!!
                    self.log('in_reply_to_status_id in data: on_status() returns False. (this line should never be reached)')
            # Anything else ('delete' and 'limit' notices in particular) is
            # deliberately ignored: it is not the data we need.

            if self.running == False:  # see: StreamingCrawler.stop_listner()
                return False  # stop the listener while catching a SIGTERM

        except Exception as e:
            # sys.exc_traceback no longer exists on Python 3; exc_info()[2] is
            # the same traceback object and works on Python 2 as well.
            self.log("Error:" + str(e), sys.exc_info()[2])
        finally:
            # Signals that we are no longer in the middle of processing data,
            # on every exit path (including the early return above).
            self.on_data_running = False

        return True