Example #1
def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_func=None):
  """Collects snarks from your Twitter screen name and @mentions.
  This is much more reliable than a plain search, but it only
  works for your own account.

  This parser adds non-standard attributes to snarks:
  "user_url" and "msg_url", links to the user's twitter
  page and to the specific tweet. Exporters might
  disregard this info.

  :param src_path: Not used.
  :param first_msg: If not None, ignore comments until this substring is found.
  :param options: A dict of extra options specific to this parser.
                  since_date (optional):
                      UTC Datetime to limit dredging up old tweets.
                  until_date (optional):
                      UTC Datetime to limit dredging up new tweets.
  :param keep_alive_func: Optional replacement to get an abort boolean.
  :param sleep_func: Optional replacement to sleep N seconds.
  :return: A List of snark dicts.
  :raises: ParserError
  """
  if (keep_alive_func is None): keep_alive_func = global_config.keeping_alive
  if (sleep_func is None): sleep_func = global_config.nap

  since_date = None
  if (ns+"since_date" in options and options[ns+"since_date"]):
    since_date = options[ns+"since_date"]

  until_date = None
  if (ns+"until_date" in options and options[ns+"until_date"]):
    until_date = options[ns+"until_date"]

  snarks = []

  tweepy = tweepy_backend.get_tweepy()
  tweepy_api = tweepy_backend.get_api()

  try:
    my_screen_name = tweepy_api.auth.get_username()

    # List of pattern/replacement tuples to strip reply topic from comments.
    reply_name_escaped = re.escape(my_screen_name)
    reply_regexes = [(re.compile(" +@"+ reply_name_escaped +" +", re.IGNORECASE), " "),
                     (re.compile(" *@"+ reply_name_escaped +" *", re.IGNORECASE), "")]

    mention_args = {"count":200, "include_entities":"false", "include_rts":"false"}
    mention_rate = {"reset":None, "limit":0, "remaining":0, "res_family":"statuses", "res_name":"/statuses/mentions_timeline"}
    timeline_args = {"count":200, "include_entities":"false", "include_rts":"false"}
    timeline_rate = {"reset":None, "limit":0, "remaining":0, "res_family":"statuses", "res_name":"/statuses/user_timeline"}

    searches = []
    searches.append(("Mentions", tweepy_api.mentions_timeline, mention_args, 800, mention_rate))
    searches.append(("Timeline", tweepy_api.user_timeline, timeline_args, 3200, timeline_rate))

    def update_rate_info():
      # Sets new rate info values for the searches.
      rate_status = tweepy_api.rate_limit_status()
      for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
        rate_info.update(rate_status["resources"][rate_info["res_family"]][rate_info["res_name"]])

    update_rate_info()

    for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
      done = False
      query_count = 0
      results_count = 0
      last_max_id = None

      while (keep_alive_func() and done is False and results_count < search_cap and rate_info["remaining"] > 0):
        results = tweepy_func(**tweepy_func_args)
        rate_info["remaining"] -= 1
        if (not results):
          done = True
          break
        else:
          query_count += 1
          results_count += len(results)
          logging.info("%s Query % 2d: % 3d results." % (search_type, query_count, len(results)))

          last_status_id = None
          for status in results:
            if (last_max_id == status.id): continue
            last_status_id = status.id

            snark = {}
            snark["user"] = "******" % common.asciify(status.author.screen_name)
            snark["msg"] = status.text
            for (reply_ptn, reply_rep) in reply_regexes:
              snark["msg"] =  reply_ptn.sub(reply_rep, snark["msg"])
            snark["msg"] = common.asciify(common.html_unescape(snark["msg"]))

            snark["date"] = status.created_at

            snark["user_url"] = "http://www.twitter.com/%s" % common.asciify(status.author.screen_name)
            snark["msg_url"] = "http://twitter.com/#!/%s/status/%d" % (common.asciify(status.author.screen_name), status.id)

            if (until_date and snark["date"] > until_date):
              continue  # This snark is too recent.

            if (since_date and snark["date"] < since_date):
              done = True  # This snark is too early.
              break

            snarks.append(snark)

            if (first_msg):
              if (snark["msg"].find(first_msg) != -1):
                done = True  # Found the first comment.
                break

          if (last_status_id is not None):
            # Dig deeper into the past on the next loop.
            tweepy_func_args["max_id"] = last_status_id
            last_max_id = last_status_id
          else:
            # Must've only gotten the "max_id" tweet again.
            done = True
            break

          if (rate_info["reset"] is not None and time.time() >= float(rate_info["reset"])):
            update_rate_info()

            reset_string = datetime.fromtimestamp(float(rate_info["reset"])).strftime("%Y-%m-%d %H:%M:%S")
            logging.info("API limit for '%s' reset. Calls left: %d (Until %s)" % (rate_info["res_name"], rate_info["remaining"], reset_string))

      if (done is False and rate_info["remaining"] <= 0):
        logging.warning("Twitter API rate limit truncated results for '%s'." % rate_info["res_name"])
        break  # No more searches.

    update_rate_info()
    logging.info("Twitter API calls left...")
    for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
      reset_string = datetime.fromtimestamp(float(rate_info["reset"])).strftime("%Y-%m-%d %H:%M:%S")
      logging.info("'%s': %d (Until %s)." % (rate_info["res_name"], rate_info["remaining"], reset_string))
    logging.info("Current Time: %s" % datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

  except (Exception) as err:
    logging.exception("Parser failed.")
    raise common.ParserError("Parser failed.")

  snarks = sorted(snarks, key=lambda k: k["date"])

  # Drop duplicates from multiple passes.
  snarks = uniquify_list(snarks)

  if (first_msg):
    first_index = -1
    for i in range(len(snarks)):
      if (snarks[i]["msg"].find(first_msg) != -1):
        # Finally reached the expected first msg.
        first_index = i
    if (first_index >= 0):
      snarks = snarks[first_index:]
    else:
      logging.warning("first_msg string \"%s\" was not found." % first_msg)
      snarks = []

  return snarks
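
Note: the listing above calls a uniquify_list() helper that is defined elsewhere in this module. A rough, order-preserving sketch (an assumption, not the project's actual implementation) is shown below, followed by a hypothetical invocation; the "ns" option-namespace prefix and the global_config/Tweepy setup also come from elsewhere in the project.

def uniquify_list(items):
  """Drop duplicate snark dicts while preserving order (assumes hashable values)."""
  seen = set()
  unique = []
  for item in items:
    key = tuple(sorted(item.items()))
    if (key not in seen):
      seen.add(key)
      unique.append(item)
  return unique

# Hypothetical call, using the namespaced option keys described in the docstring.
#options = {ns+"since_date": datetime(2014, 2, 1), ns+"until_date": datetime(2014, 2, 8)}
#snarks = fetch_snarks(None, "Here we go", options=options)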
Example #2
  def run(self):
    try:
      while (self.keep_alive):
        self.nap(self._sleep_interval)
        if (not self.keep_alive): break

        while(self.keep_alive):
          # Keep popping until a valid unexpired tweet is found.
          line = self._stream_lines.pop_line()
          if (line is None): break
          if (len(line) == 0): continue

          tweet = None
          try:
            tweet = json.loads(line)
          except (TypeError, ValueError) as err:
            logging.info("Tweet parsing failed: %s" % repr(line))
            continue

          user_clean = None
          text_clean = None
          tweet_time = 0
          user_is_ignored = False
          if ("user" in tweet and "screen_name" in tweet["user"]):
            user_clean = common.asciify(tweet["user"]["screen_name"])
            with self._options_lock:
              if (user_clean in self._ignored_users):
                user_is_ignored = True

          if ("text" in tweet):
            text_clean = common.asciify(common.html_unescape(tweet["text"]))
            text_clean = re.sub("\r", "", text_clean)
            text_clean = re.sub("^ +", "", text_clean)
            text_clean = re.sub("^@[^ ]+ *", "", text_clean, 1)
            text_clean = re.sub(" *https?://[^ ]+", "", text_clean)
            text_clean = text_clean.rstrip(" \n")
            if (re.match("^[? .\n\"]{8,}$", text_clean)):
              continue  # Likely tons of non-ascii chars. Skip.

          if ("created_at" in tweet):
            tweet_time = datetime.strptime(tweet["created_at"] +" UTC", '%a %b %d %H:%M:%S +0000 %Y %Z')

          if (user_clean and text_clean and tweet_time):
            current_time = datetime.utcnow()
            lag_delta = (current_time - tweet_time)
            lag_str = ""
            if (abs(lag_delta) == lag_delta):  # Tweet in past, positive lag.
              lag_str = "%ds" % lag_delta.seconds
            elif (lag_delta.days == -1 and (tweet_time - current_time).seconds == 0):
              lag_str = "0s"                   # Tweet was only microseconds ahead, call it 0.
            else:                              # Tweet in future, negative lag (-1 day, 86400-Nsecs).
              lag_str = "-%ds" % (tweet_time - current_time).seconds

            if (lag_delta > self._expire_delta):
              logging.info("Tweet expired (lag %s): %s: %s" % (lag_str, user_clean, text_clean))
              continue
            elif (user_is_ignored):
              logging.info("Tweet ignored (lag %s): %s: %s" % (lag_str, user_clean, text_clean))
              continue
            else:
              logging.info("Tweet shown (lag %s): %s: %s" % (lag_str, user_clean, text_clean))
              self._show_message(user_clean, text_clean, tweet)
              break
            #logging.info("Time(Current): %s  Time(Tweet): %s" % (current_time.strftime("%a %b %d %Y %H:%M:%S"), tweet_time.strftime("%a %b %d %Y %H:%M:%S")))
            #logging.info("---")

    except (Exception) as err:
      logging.exception("Unexpected exception in %s." % self.__class__.__name__)  #raise
      self.keep_alive = False
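
The lag bookkeeping above leans on how Python's timedelta normalizes negative intervals: a tweet slightly in the future produces days == -1 with a large positive seconds value. A minimal standalone sketch of the same classification, kept separate from the thread code above:

from datetime import datetime, timedelta

def describe_lag(current_time, tweet_time):
  lag_delta = (current_time - tweet_time)
  if (abs(lag_delta) == lag_delta):        # Tweet in the past, positive lag.
    return "%ds" % lag_delta.seconds
  elif (lag_delta.days == -1 and (tweet_time - current_time).seconds == 0):
    return "0s"                            # Only microseconds ahead; call it 0.
  else:                                    # Tweet in the future, negative lag.
    return "-%ds" % (tweet_time - current_time).seconds

#now = datetime.utcnow()
#print(describe_lag(now, now - timedelta(seconds=5)))  # "5s"
#print(describe_lag(now, now + timedelta(seconds=5)))  # "-5s"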
Example #3
def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_func=None):
  """Collects snarks from an html Transcript post on LousyCanuck's blog.
  See: http://twitter.com/MockTM
  See: http://freethoughtblogs.com/lousycanuck/

  This parser adds non-standard attributes to snarks:
  "user_url" and "msg_url", links to the user's twitter
  page and to the specific tweet. Exporters might
  disregard this info.

  :param src_path: A url, or saved html source.
  :param first_msg: If not None, ignore comments until this substring is found.
  :param options: A dict of extra options specific to this parser.
                  reply_name (optional):
                      The name to which replies were directed.
                      Regexes will remove it from comments.
  :param keep_alive_func: Optional replacement to get an abort boolean.
  :param sleep_func: Optional replacement to sleep N seconds.
  :return: A List of snark dicts.
  :raises: ParserError
  """
  if (keep_alive_func is None): keep_alive_func = global_config.keeping_alive
  if (sleep_func is None): sleep_func = global_config.nap

  if (not src_path): raise common.ParserError("The %s parser requires the general arg, \"src_path\", to be set." % re.sub(".*[.]", "", __name__))

  # Regex to parse tweet info out of html.
  snark_ptn = re.compile("(?:<p>)?<a href='([^']*)'>([^<]*)</a>: (.*?) +<br ?/><font size=-3><a href='([^']*)'[^>]*>([0-9]{4})-([0-9]{2})-([0-9]{2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})</a></font>(?:<br ?/>|</p>)?", re.IGNORECASE)

  # List of pattern/replacement tuples to strip reply topic from comments.
  reply_regexes = []
  if (ns+"reply_name" in options and options[ns+"reply_name"]):
    reply_name_escaped = re.escape(options[ns+"reply_name"])
    reply_regexes = [(re.compile(" +@"+ reply_name_escaped +" +", re.IGNORECASE), " "),
                     (re.compile(" *@"+ reply_name_escaped +" *", re.IGNORECASE), "")]

  # Regex to know when to stop parsing.
  tail_ptn = re.compile("<div class=\"[^\"]*robots-nocontent[^\"]*\">")

  start_date = None
  snarks = []

  lines = []
  try:
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 5.1; rv:27.0) Gecko/20100101 Firefox/27.0"}
    req = urllib2.Request(src_path, None, headers=headers)
    with contextlib.closing(urllib2.urlopen(req)) as snark_file:
      while (keep_alive_func()):
        line = snark_file.readline()
        if (line == ''): break
        line = re.sub("\r\n?", "\n", line)  # Local files are opened without universal newlines.
        line = line[:-1]
        lines.append(line)
  except (urllib2.HTTPError) as err:
    logging.error("Http status: %d" % err.code)
    raise common.ParserError("Parser failed.")
  except (urllib2.URLError) as err:
    logging.error(str(err))
    raise common.ParserError("Parser failed.")

  for line in lines:
    if (tail_ptn.search(line) is not None): break

    result = snark_ptn.match(line)
    if (result is None):
      # Only complain once the first snark is found.
      if (start_date is not None): logging.warning("Bad Line: "+ line)
      continue

    snark = {}
    snark["user"] = result.group(2)
    snark["msg"] =  result.group(3)
    for reply_ptn, reply_rep in reply_regexes:
      snark["msg"] =  reply_ptn.sub(reply_rep, snark["msg"])
    snark["msg"] =  common.asciify(common.html_unescape(snark["msg"]))

    year, month, day = [int(result.group(i)) for i in [5,6,7]]
    hour, minute, second = [int(result.group(i)) for i in [8,9,10]]

    # UTC time zone?
    snark["date"] = datetime(year, month, day, hour, minute, second)

    snark["user_url"] = result.group(1)
    snark["msg_url"] = result.group(4)

    if (start_date is None):
      if (first_msg and line.find(first_msg) == -1):
        # This snark was earlier than the expected first msg.
        continue
      start_date = snark["date"]

    snarks.append(snark)

  return snarks
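
To make the dense snark_ptn regex above more concrete, here is a self-contained sketch that compiles the same pattern and matches it against a made-up transcript line (the user name, URLs, text, and timestamp are invented for illustration only):

import re

demo_ptn = re.compile("(?:<p>)?<a href='([^']*)'>([^<]*)</a>: (.*?) +<br ?/><font size=-3><a href='([^']*)'[^>]*>([0-9]{4})-([0-9]{2})-([0-9]{2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})</a></font>(?:<br ?/>|</p>)?", re.IGNORECASE)

sample = ("<p><a href='http://twitter.com/SomeUser'>SomeUser</a>: That movie was great "
          "<br/><font size=-3><a href='http://twitter.com/SomeUser/status/12345'>"
          "2012-03-04 05:06:07</a></font></p>")

m = demo_ptn.match(sample)
if (m is not None):
  # Groups: 1=user_url, 2=user, 3=msg, 4=msg_url, 5-10=year/month/day/hour/minute/second.
  print("%s said: %s" % (m.group(2), m.group(3)))  # SomeUser said: That movie was great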
Example #4
def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_func=None):
  """Collects snarks from a Twitter search. Finds
  tweets from any account and @reply mentions of it.
  See: https://dev.twitter.com/docs/api/1/get/search

  This parser adds non-standard attributes to snarks:
  "user_url" and "msg_url", links to the user's twitter
  page and to the specific tweet. Exporters might
  disregard this info.

  Twitter's search API only reaches back a few days
  and may be incomplete. :/

  :param src_path: Not used.
  :param first_msg: If not None, ignore comments prior to one containing this substring.
  :param options: A dict of extra options specific to this parser.
                  reply_name:
                      The name to which replies were directed (no "@").
                  since_date (optional):
                      UTC Datetime to limit dredging up old tweets.
                  until_date (optional):
                      UTC Datetime to limit dredging up new tweets.
  :param keep_alive_func: Optional replacement to get an abort boolean.
  :param sleep_func: Optional replacement to sleep N seconds.
  :return: A List of snark dicts.
  :raises: ParserError
  """
  if (keep_alive_func is None): keep_alive_func = global_config.keeping_alive
  if (sleep_func is None): sleep_func = global_config.nap

  since_date = None
  if (ns+"since_date" in options and options[ns+"since_date"]):
    since_date = options[ns+"since_date"]

  until_date = None
  if (ns+"until_date" in options and options[ns+"until_date"]):
    until_date = options[ns+"until_date"]

  missing_options = [o for o in ["reply_name"] if ((ns+o) not in options or not options[ns+o])]
  if (len(missing_options) > 0):
    logging.error("Required parser options weren't provided: %s." % ", ".join(missing_options))
    raise common.ParserError("Parser failed.")

  snarks = []

  tweepy = tweepy_backend.get_tweepy()
  tweepy_api = tweepy_backend.get_api()

  try:
    # List of pattern/replacement tuples to strip reply topic from comments.
    reply_name_escaped = re.escape(options[ns+"reply_name"])
    reply_regexes = [(re.compile(" +@"+ reply_name_escaped +" +", re.IGNORECASE), " "),
                     (re.compile(" *@"+ reply_name_escaped +" *", re.IGNORECASE), "")]

    search_args = {"rpp":100, "include_entities":"false", "result_type":"recent"}
    search_args["q"] = "@%s OR from:%s" % (options[ns+"reply_name"], options[ns+"reply_name"])
    if (since_date): search_args["since"] = since_date.strftime("%Y-%m-%d")
    if (until_date): search_args["until"] = until_date.strftime("%Y-%m-%d")
    search_rate = {"reset":None, "limit":0, "remaining":0, "res_family":"search", "res_name":"/search/tweets"}

    searches = []
    searches.append(("Search", tweepy_api.search, search_args, 1500, search_rate))

    def update_rate_info():
      # Sets new rate info values for the searches.
      rate_status = tweepy_api.rate_limit_status()
      for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
        rate_info.update(rate_status["resources"][rate_info["res_family"]][rate_info["res_name"]])

    update_rate_info()

    for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
      done = False
      query_count = 0
      results_count = 0
      last_max_id = None

      while (keep_alive_func() and done is False and results_count < search_cap and rate_info["remaining"] > 0):
        results = tweepy_func(**tweepy_func_args)
        rate_info["remaining"] -= 1
        if (not results):
          done = True
          break
        else:
          query_count += 1
          results_count += len(results)
          logging.info("%s Query % 2d: % 3d results." % (search_type, query_count, len(results)))

          last_id = None
          for search_result in results:
            if (last_max_id == search_result.id): continue
            last_id = search_result.id

            snark = {}
            snark["user"] = "******" % common.asciify(search_result.from_user)
            snark["msg"] = search_result.text
            for (reply_ptn, reply_rep) in reply_regexes:
              snark["msg"] =  reply_ptn.sub(reply_rep, snark["msg"])
            snark["msg"] = common.asciify(common.html_unescape(snark["msg"]))

            snark["date"] = search_result.created_at

            snark["user_url"] = "http://www.twitter.com/%s" % common.asciify(search_result.from_user)
            snark["msg_url"] = "http://twitter.com/#!/%s/status/%d" % (common.asciify(search_result.from_user), search_result.id)

            if (until_date and snark["date"] > until_date):
              continue  # This snark is too recent.

            if (since_date and snark["date"] < since_date):
              done = True  # This snark is too early.
              break

            snarks.append(snark)

            if (first_msg):
              if (snark["msg"].find(first_msg) != -1):
                done = True  # Found the first comment.
                break

          if (last_id is not None):
            # Dig deeper into the past on the next loop.
            tweepy_func_args["max_id"] = last_id
            last_max_id = last_id
          else:
            # Must've only gotten the "max_id" tweet again.
            done = True
            break

          if (rate_info["reset"] is not None and time.time() >= float(rate_info["reset"])):
            update_rate_info()

            reset_string = datetime.fromtimestamp(float(rate_info["reset"])).strftime("%Y-%m-%d %H:%M:%S")
            logging.info("API limit for '%s' reset. Calls left: %d (Until %s)" % (rate_info["res_name"], rate_info["remaining"], reset_string))

      if (done is False and rate_info["remaining"] <= 0):
        logging.warning("Twitter API rate limit truncated results for '%s'." % rate_info["res_name"])
        break  # No more searches.

    update_rate_info()
    logging.info("Twitter API calls left...")
    for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
      reset_string = datetime.fromtimestamp(float(rate_info["reset"])).strftime("%Y-%m-%d %H:%M:%S")
      logging.info("'%s': %d (Until %s)." % (rate_info["res_name"], rate_info["remaining"], reset_string))
    logging.info("Current Time: %s" % datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

  except (Exception) as err:
    logging.exception("Parser failed.")
    raise common.ParserError("Parser failed.")

  snarks = sorted(snarks, key=lambda k: k["date"])

  # Drop duplicates from multiple passes.
  snarks = uniquify_list(snarks)

  if (first_msg):
    first_index = -1
    for i in range(len(snarks)):
      if (snarks[i]["msg"].find(first_msg) != -1):
        # Finally reached the expected first msg.
        first_index = i
    if (first_index >= 0):
      snarks = snarks[first_index:]
    else:
      logging.warning("first_msg string \"%s\" was not found." % first_msg)
      snarks = []

  return snarks
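
For reference, the max_id paging used in examples #1 and #4 boils down to feeding the oldest id from each page back into the next call. A standalone sketch with a stub in place of tweepy_api.search (none of this is the project's actual code):

def fake_search(max_id=None):
  # Stub: newest-first ids; max_id is inclusive, like Twitter's.
  all_ids = [100, 90, 80, 70, 60, 50]
  if (max_id is not None):
    all_ids = [i for i in all_ids if (i <= max_id)]
  return all_ids[:2]

search_args = {}
last_max_id = None
while (True):
  page = [i for i in fake_search(**search_args) if (i != last_max_id)]
  if (not page):
    break
  last_max_id = page[-1]                # Oldest id seen in this page.
  search_args["max_id"] = last_max_id   # Dig deeper into the past next time.
print(search_args)                      # {'max_id': 50}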