Пример #1
0
def _ingest_event(stream_event, session, twitter_api, tweet_queue, user_queue):
    """Ingest a single decoded stream message, dispatching on its shape.

    Used to factor inserts like retweets and quotes which may contain their
    substructures directly, and thus avoid needing to become queued and get
    processed from a work queue.

    Dispatch order (the first matching case wins):

    * a truthy ``"event"`` key -- ingest the ``"source"`` and ``"target"``
      users when present and, for ``favorite`` / ``unfavorite`` /
      ``quoted_tweet`` events, recurse on ``"target_object"``;
    * a truthy ``"delete"`` key -- mark the referenced tweet as a tombstone
      and commit;
    * both ``"id"`` and ``"user"`` keys -- hand the message to
      ``ingest_tweet`` (copying the extended full text over ``"text"``
      first when an ``"extended_tweet"`` is present);
    * a ``"friends"`` key -- put every friend id, as a string, on
      ``user_queue``;
    * anything else -- append the JSON blob to ``mystery.log`` and log it.

    Args:
      stream_event: the decoded message (dict-like), or ``None``.
      session: database session passed through to the insert helpers.
      twitter_api: API client passed through to ``ingest_tweet``.
      tweet_queue: queue passed through to ``ingest_tweet``.
      user_queue: queue receiving friend ids as strings.

    Returns:
      None.
    """

    log.debug(json.dumps(stream_event))

    if stream_event is None:
        # Reached when recursing on an event that carried no "target_object".
        # There is nothing to ingest, and the branches below would otherwise
        # fail calling ``.get`` / ``in`` on None.
        return

    if stream_event.get("event"):
        # This is the case of a non-message stream_event

        # FIXME: see other interesting cases here:
        # https://dev.twitter.com/streaming/overview/messages-types

        source = stream_event.get("source")
        if source:
            ingest_user_object(User.NewFromJsonDict(source), session)

        target = stream_event.get("target")
        if target:
            ingest_user_object(User.NewFromJsonDict(target), session)

        if stream_event.get("event") in ("favorite", "unfavorite",
                                         "quoted_tweet"):
            # We're ingesting a tweet here
            _ingest_event(stream_event.get("target_object"), session,
                          twitter_api, tweet_queue, user_queue)

    elif stream_event.get("delete"):
        # For compliance with the developer rules.
        # Sadly.
        event_id = stream_event.get("delete").get("status").get("id")
        # ``warning`` is the supported spelling; ``warn`` is a deprecated alias.
        log.warning("Honoring delete %s", event_id)
        entity = bt._tweet_or_dummy(session, event_id)
        entity.tombstone = True
        session.add(entity)
        session.commit()

    elif "id" in stream_event and "user" in stream_event:
        if "extended_tweet" in stream_event:
            # This is the case of having gotten a new "extended" tweet.
            # Need to munge the text representation.
            #
            # And by munge I just mean copy, because the twitter-python
            # driver drops this on the floor
            stream_event["text"] = stream_event["extended_tweet"]["full_text"]

        ingest_tweet(stream_event, session, twitter_api, tweet_queue)

    elif "friends" in stream_event:
        for friend in stream_event.get("friends"):
            user_queue.put(str(friend))

    else:
        # Unrecognised message shape: keep a copy on disk for later study.
        blob = json.dumps(stream_event)
        with open("mystery.log", "a") as f:
            f.write(blob + "\n")
        log.warning(blob)
Пример #2
0
def get_user(user_id: int) -> User:
    """Return the ``User`` for ``user_id``, consulting the cache first.

    On a cache miss the user is fetched with ``api.GetUser``. A
    ``ConnectionError`` on the first attempt is retried once; a second
    ``ConnectionError`` propagates to the caller. The fetched data is passed
    through ``write_cache`` and the value it returns is what gets decoded.
    """
    cached = get_cache("user", user_id)
    if cached is not None:
        return User.NewFromJsonDict(cached)

    print("Making network request for user id %s" % user_id)
    for attempt in (1, 2):
        try:
            data = api.GetUser(user_id)
        except ConnectionError:
            # Only the final attempt is allowed to surface the error.
            if attempt == 2:
                raise
        else:
            break

    return User.NewFromJsonDict(write_cache("user", data))
Пример #3
0
def ingest_users(twitter_api, session, users):
    """Fetch each named Twitter user and insert it into the datastore.

    For every screen name in ``users`` the profile is fetched with
    ``twitter_api.GetUser``, converted to a ``User`` when the API returned a
    plain dict, paired with the persona found by exact name lookup, and
    inserted through ``bt.insert_user``. The inserted record is printed.

    A ``TwitterError`` for one screen name is printed and the loop moves on
    to the next name. An ``AssertionError`` is printed with context and then
    re-raised.

    Args:
      twitter_api: client exposing ``GetUser(screen_name=...)``.
      session: database session passed to the persona lookup and insert.
      users: iterable of screen names.
    """
    for screen_name in users:
        # Reset per iteration so an error report never shows the user fetched
        # for a previous screen name.
        twitter_user = None
        try:
            twitter_user = twitter_api.GetUser(screen_name=screen_name)
            if isinstance(twitter_user, dict):
                twitter_user = User.NewFromJsonDict(twitter_user)

            # NOTE(review): this lookup used to receive a ``line`` variable
            # that was always None. The screen name is the only name in
            # scope, so it is used here -- confirm this is the intended key.
            persona = personas.personas_by_name(
                session, screen_name, one=True, exact=True)
            print(bt.insert_user(session, twitter_user, persona))
        except TwitterError as e:
            print(screen_name, e)
        except AssertionError as e:
            print(screen_name, twitter_user, e)
            raise
Пример #4
0
def ingest_tweet(tweet, session, twitter_api, tweet_id_queue):
    """Ingest one tweet and enqueue or ingest whatever it references.

    ``tweet`` may be a ``Status`` or a JSON dict, which is converted first.

    A retweet is not stored itself: the original status is ingested
    recursively, followed by the retweeting user. Any other tweet is
    inserted through ``bt.insert_tweet``; afterwards the reply parent and
    any tweets linked from its URLs are put on ``tweet_id_queue`` when not
    already stored, a quoted status is ingested recursively, and every
    mentioned user is ingested.
    """

    status = tweet if isinstance(tweet, Status) else Status.NewFromJsonDict(tweet)

    if status.retweeted_status:
        # We don't actually care about retweets, they aren't original content.
        # Just insert the original, then the user who retweeted it.
        ingest_tweet(status.retweeted_status, session, twitter_api,
                     tweet_id_queue)
        ingest_user_object(status.user, session)
        return

    # Checked before the insert so that only first-time inserts get logged.
    already_stored = have_tweet(session, status.id)
    stored = bt.insert_tweet(session, twitter_api, status)
    if not already_stored:
        log.info(stored)

    if status.in_reply_to_status_id:
        # Twitter considers exactly one status the "reply" target. Queue that
        # parent so the worker inserts it (and, recursively, its parents).
        parent_id = str(status.in_reply_to_status_id)
        if not have_tweet(session, parent_id):
            tweet_id_queue.put(parent_id)

    if status.quoted_status:
        # Quote tweets carry the quoted status inline, so ingest it directly
        # instead of queueing it.
        ingest_tweet(status.quoted_status, session, twitter_api,
                     tweet_id_queue)

    for link in status.urls or []:
        linked_id = bt.tweet_id_from_url(link.expanded_url)
        if linked_id and not have_tweet(session, linked_id):
            tweet_id_queue.put(linked_id)

    for mention in status.user_mentions or []:
        if not isinstance(mention, User):
            mention = User.NewFromJsonDict(mention)
        ingest_user_object(mention, session)
Пример #5
0
def insert_tweet(session, twitter_api, tweet):
    """Insert a tweet (a "status" in the old API terminology) into the datastore.

    This means inserting the service, inserting the original poster,
    inserting the post and, for a newly created post, inserting the post
    distribution (mentions) and its reply-to / quotes relationships.

    WARNING: this function does NOT recursively insert replied to tweets, or
    quoted tweets. It's expected that some other system handles walking the
    tree of tweets to deal with all that. This is, ultimately, to work around
    the garbage Twitter rate limits.

    Args:
      session: database session used for every query, add and commit.
      twitter_api: not used by this function; kept for interface
        compatibility with existing callers.
      tweet: the status to insert. Its ``user`` may be a ``User`` or a JSON
        dict.

    Returns:
      The ``Post`` that was fleshed out or created, or ``None`` when the
      poster could not be inserted as an ``Account``.
    """

    _tw = insert_twitter(session)
    try:
        poster = tweet.user
        if not isinstance(poster, User):
            poster = User.NewFromJsonDict(poster)
        poster = insert_user(session, poster)
        if not isinstance(poster, Account):
            # Raised explicitly instead of via ``assert`` so the check still
            # runs when Python is started with -O.
            raise AssertionError(
                "insert_user did not return an Account: %r" % (poster,))
    except AssertionError as e:
        print("Encountered exception", repr(e), traceback.format_exc(),
              "Processing tweet", tweet)
        return None

    # NOTE(review): this lookup keys on ``tweet.id`` while the insert below
    # keys on ``tweet.id_str``. Presumably ``twitter_external_tweet_id``
    # yields the same external id for both -- confirm.
    dupe = session.query(Post)\
                  .filter_by(external_id=twitter_external_tweet_id(tweet.id))\
                  .first()

    text = _get_tweet_text(tweet)
    when = aget(datetime.strptime(tweet.created_at, _tw_datetime_pattern))
    more = tweet.AsDict()

    # There's a dummy record in place, flesh it out. We're in a monoid here.
    if dupe:
        dupe.poster = poster
        dupe.when = when
        dupe.text = text
        dupe.more = more
        session.add(dupe)
        session.commit()
        return dupe

    # We're inserting a new tweet here...
    post = Post(service=_tw,
                text=text,
                external_id=twitter_external_tweet_id(tweet.id_str),
                poster=poster,
                when=when,
                more=more)
    session.add(post)

    # ``user_mentions`` may be None (ingest_tweet guards it the same way).
    for user in tweet.user_mentions or []:
        get_or_create(session,
                      PostDistribution,
                      post=post,
                      recipient=insert_user(session, User(id=user.id)),
                      rel="to")

    if tweet.in_reply_to_status_id:
        # Link to the parent, creating a placeholder if it isn't stored yet.
        get_or_create(session,
                      PostRelationship,
                      left=post,
                      right=_tweet_or_dummy(session,
                                            tweet.in_reply_to_status_id),
                      rel="reply-to")

    if tweet.quoted_status_id:
        get_or_create(session,
                      PostRelationship,
                      left=post,
                      right=_tweet_or_dummy(session,
                                            tweet.quoted_status_id),
                      rel="quotes")

    session.commit()

    return post