def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_func=None):
    """Collects snarks from your Twitter screen name and @mentions.
    This is much more reliable than a plain search, but it only works
    for your own account.

    This parser adds non-standard attributes to snarks: "user_url" and
    "msg_url", links to the user's twitter page and to the specific
    tweet. Exporters might disregard this info.

    :param src_path: Not used.
    :param first_msg: If not None, ignore comments until this substring is found.
    :param options: A dict of extra options specific to this parser.
                    since_date (optional):
                        UTC Datetime to limit dredging up old tweets.
                    until_date (optional):
                        UTC Datetime to limit dredging up new tweets.
    :param keep_alive_func: Optional replacement to get an abort boolean.
    :param sleep_func: Optional replacement to sleep N seconds.
    :return: A List of snark dicts.
    :raises: ParserError
    """
    if (keep_alive_func is None): keep_alive_func = global_config.keeping_alive
    if (sleep_func is None): sleep_func = global_config.nap

    since_date = None
    if (ns+"since_date" in options and options[ns+"since_date"]):
        since_date = options[ns+"since_date"]

    until_date = None
    if (ns+"until_date" in options and options[ns+"until_date"]):
        until_date = options[ns+"until_date"]

    snarks = []

    tweepy = tweepy_backend.get_tweepy()
    tweepy_api = tweepy_backend.get_api()
    try:
        my_screen_name = tweepy_api.auth.get_username()

        # List of pattern/replacement tuples to strip reply topic from comments.
        reply_name_escaped = re.escape(my_screen_name)
        reply_regexes = [(re.compile(" +@"+ reply_name_escaped +" +", re.IGNORECASE), " "),
                         (re.compile(" *@"+ reply_name_escaped +" *", re.IGNORECASE), "")]

        mention_args = {"count":200, "include_entities":"false", "include_rts":"false"}
        mention_rate = {"reset":None, "limit":0, "remaining":0,
                        "res_family":"statuses", "res_name":"/statuses/mentions_timeline"}

        timeline_args = {"count":200, "include_entities":"false", "include_rts":"false"}
        timeline_rate = {"reset":None, "limit":0, "remaining":0,
                         "res_family":"statuses", "res_name":"/statuses/user_timeline"}

        searches = []
        searches.append(("Mentions", tweepy_api.mentions_timeline, mention_args, 800, mention_rate))
        searches.append(("Timeline", tweepy_api.user_timeline, timeline_args, 3200, timeline_rate))

        def update_rate_info():
            # Sets new rate info values for the searches.
            rate_status = tweepy_api.rate_limit_status()
            for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
                rate_info.update(rate_status["resources"][rate_info["res_family"]][rate_info["res_name"]])

        update_rate_info()

        for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
            done = False
            query_count = 0
            results_count = 0
            last_max_id = None
            while (keep_alive_func() and done is False
                   and results_count < search_cap and rate_info["remaining"] > 0):
                results = tweepy_func(**tweepy_func_args)
                rate_info["remaining"] -= 1
                if (not results):
                    done = True
                    break
                else:
                    query_count += 1
                    results_count += len(results)
                    logging.info("%s Query % 2d: % 3d results." % (search_type, query_count, len(results)))

                last_status_id = None
                for status in results:
                    if (last_max_id == status.id): continue
                    last_status_id = status.id

                    snark = {}
                    snark["user"] = "@%s" % common.asciify(status.author.screen_name)
                    snark["msg"] = status.text
                    for (reply_ptn, reply_rep) in reply_regexes:
                        snark["msg"] = reply_ptn.sub(reply_rep, snark["msg"])
                    snark["msg"] = common.asciify(common.html_unescape(snark["msg"]))
                    snark["date"] = status.created_at
                    snark["user_url"] = "http://www.twitter.com/%s" % common.asciify(status.author.screen_name)
                    snark["msg_url"] = "http://twitter.com/#!/%s/status/%d" % (common.asciify(status.author.screen_name), status.id)

                    if (until_date and snark["date"] > until_date):
                        continue  # This snark is too recent.
                    if (since_date and snark["date"] < since_date):
                        done = True  # This snark is too early.
                        break
                    snarks.append(snark)
                    if (first_msg):
                        if (snark["msg"].find(first_msg) != -1):
                            done = True  # Found the first comment.
                            break

                if (last_status_id is not None):
                    # Dig deeper into the past on the next loop.
                    tweepy_func_args["max_id"] = last_status_id
                    last_max_id = last_status_id
                else:
                    # Must've only gotten the "max_id" tweet again.
                    done = True
                    break

                if (rate_info["reset"] is not None and time.time() >= float(rate_info["reset"])):
                    update_rate_info()
                    reset_string = datetime.fromtimestamp(float(rate_info["reset"])).strftime("%Y-%m-%d %H:%M:%S")
                    logging.info("API limit for '%s' reset. Calls left: %d (Until %s)" % (rate_info["res_name"], rate_info["remaining"], reset_string))

            if (done is False and rate_info["remaining"] <= 0):
                logging.warning("Twitter API rate limit truncated results for '%s'." % rate_info["res_name"])
                break  # No more searches.

        update_rate_info()
        logging.info("Twitter API calls left...")
        for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
            reset_string = datetime.fromtimestamp(float(rate_info["reset"])).strftime("%Y-%m-%d %H:%M:%S")
            logging.info("'%s': %d (Until %s)." % (rate_info["res_name"], rate_info["remaining"], reset_string))
        logging.info("Current Time: %s" % datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    except (Exception) as err:
        logging.exception("Parser failed.")
        raise common.ParserError("Parser failed.")

    snarks = sorted(snarks, key=lambda k: k["date"])

    # Drop duplicates from multiple passes.
    snarks = uniquify_list(snarks)

    if (first_msg):
        first_index = -1
        for i in range(len(snarks)):
            if (snarks[i]["msg"].find(first_msg) != -1):
                # Finally reached the expected first msg.
                first_index = i
                break
        if (first_index >= 0):
            snarks = snarks[first_index:]
        else:
            logging.warning("first_msg string \"%s\" was not found." % first_msg)
            snarks = []

    return snarks
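
# "uniquify_list" is a helper defined elsewhere in this project. As a rough
# illustration only, an order-preserving equivalent for these snark dicts
# might look like the sketch below. The name and the choice of key fields
# are assumptions, not the project's actual implementation.
def uniquify_snarks_sketch(snarks):
    seen = set()
    unique = []
    for snark in snarks:
        # Dicts aren't hashable, so key on the identifying fields.
        key = (snark["user"], snark["msg"], snark["date"])
        if (key not in seen):
            seen.add(key)
            unique.append(snark)
    return unique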
def run(self):
    try:
        while (self.keep_alive):
            self.nap(self._sleep_interval)
            if (not self.keep_alive): break

            while (self.keep_alive):
                # Keep popping until a valid unexpired tweet is found.
                line = self._stream_lines.pop_line()
                if (line is None): break
                if (len(line) == 0): continue

                tweet = None
                try:
                    tweet = json.loads(line)
                except (TypeError, ValueError) as err:
                    logging.info("Tweet parsing failed: %s" % repr(line))
                    continue

                user_clean = None
                text_clean = None
                tweet_time = 0
                user_is_ignored = False

                if ("user" in tweet and "screen_name" in tweet["user"]):
                    user_clean = common.asciify(tweet["user"]["screen_name"])
                    with self._options_lock:
                        if (user_clean in self._ignored_users):
                            user_is_ignored = True

                if ("text" in tweet):
                    text_clean = common.asciify(common.html_unescape(tweet["text"]))
                    text_clean = re.sub("\r", "", text_clean)
                    text_clean = re.sub("^ +", "", text_clean)
                    text_clean = re.sub("^@[^ ]+ *", "", text_clean, 1)
                    text_clean = re.sub(" *https?://[^ ]+", "", text_clean)
                    text_clean = text_clean.rstrip(" \n")
                    if (re.match("^[? .\n\"]{8,}$", text_clean)):
                        continue  # Likely tons of non-ascii chars. Skip.

                if ("created_at" in tweet):
                    tweet_time = datetime.strptime(tweet["created_at"] +" UTC", '%a %b %d %H:%M:%S +0000 %Y %Z')

                if (user_clean and text_clean and tweet_time):
                    current_time = datetime.utcnow()
                    lag_delta = (current_time - tweet_time)
                    lag_str = ""
                    if (abs(lag_delta) == lag_delta):
                        # Tweet in past, positive lag.
                        lag_str = "%ds" % lag_delta.seconds
                    elif (lag_delta.days == -1 and (tweet_time - current_time).seconds == 0):
                        lag_str = "0s"  # Tweet was only microseconds ahead, call it 0.
                    else:
                        # Tweet in future, negative lag (-1 day, 86400-N secs).
                        lag_str = "-%ds" % (tweet_time - current_time).seconds

                    if (lag_delta > self._expire_delta):
                        logging.info("Tweet expired (lag %s): %s: %s" % (lag_str, user_clean, text_clean))
                        continue
                    elif (user_is_ignored):
                        logging.info("Tweet ignored (lag %s): %s: %s" % (lag_str, user_clean, text_clean))
                        continue
                    else:
                        logging.info("Tweet shown (lag %s): %s: %s" % (lag_str, user_clean, text_clean))
                        self._show_message(user_clean, text_clean, tweet)
                        break
                    #logging.info("Time(Current): %s Time(Tweet): %s" % (current_time.strftime("%a %b %d %Y %H:%M:%S"), tweet_time.strftime("%a %b %d %Y %H:%M:%S")))
                    #logging.info("---")

    except (Exception) as err:
        logging.exception("Unexpected exception in %s." % self.__class__.__name__)
        #raise

    self.keep_alive = False
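
# A standalone sketch of the lag formatting used in run() above (the helper
# name is hypothetical; this is illustrative, not part of the class). Python
# normalizes a negative timedelta as (days=-1, seconds=86400-N), which is why
# the sign has to be handled explicitly rather than read off the seconds field.
def format_lag_sketch(current_time, tweet_time):
    lag_delta = (current_time - tweet_time)
    if (abs(lag_delta) == lag_delta):
        return "%ds" % lag_delta.seconds  # Tweet in past, positive lag.
    elif (lag_delta.days == -1 and (tweet_time - current_time).seconds == 0):
        return "0s"  # Tweet was only microseconds ahead, call it 0.
    else:
        return "-%ds" % (tweet_time - current_time).seconds  # Tweet in future.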
def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_func=None):
    """Collects snarks from an html Transcript post on LousyCanuck's blog.
    See: http://twitter.com/MockTM
    See: http://freethoughtblogs.com/lousycanuck/

    This parser adds non-standard attributes to snarks: "user_url" and
    "msg_url", links to the user's twitter page and to the specific
    tweet. Exporters might disregard this info.

    :param src_path: A url, or saved html source.
    :param first_msg: If not None, ignore comments until this substring is found.
    :param options: A dict of extra options specific to this parser.
                    reply_name (optional):
                        The name to which replies were directed.
                        Regexes will remove it from comments.
    :param keep_alive_func: Optional replacement to get an abort boolean.
    :param sleep_func: Optional replacement to sleep N seconds.
    :return: A List of snark dicts.
    :raises: ParserError
    """
    if (keep_alive_func is None): keep_alive_func = global_config.keeping_alive
    if (sleep_func is None): sleep_func = global_config.nap

    if (not src_path):
        raise common.ParserError("The %s parser requires the general arg, \"src_path\", to be set." % re.sub(".*[.]", "", __name__))

    # Regex to parse tweet info out of html.
    snark_ptn = re.compile("(?:<p>)?<a href='([^']*)'>([^<]*)</a>: (.*?) +<br ?/><font size=-3><a href='([^']*)'[^>]*>([0-9]{4})-([0-9]{2})-([0-9]{2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})</a></font>(?:<br ?/>|</p>)?", re.IGNORECASE)

    # List of pattern/replacement tuples to strip reply topic from comments.
    reply_regexes = []
    if (ns+"reply_name" in options and options[ns+"reply_name"]):
        reply_name_escaped = re.escape(options[ns+"reply_name"])
        reply_regexes = [(re.compile(" +@"+ reply_name_escaped +" +", re.IGNORECASE), " "),
                         (re.compile(" *@"+ reply_name_escaped +" *", re.IGNORECASE), "")]

    # Regex to know when to stop parsing.
    tail_ptn = re.compile("<div class=\"[^\"]*robots-nocontent[^\"]*\">")

    start_date = None
    snarks = []
    lines = []
    try:
        headers = {"User-Agent":"Mozilla/5.0 (Windows NT 5.1; rv:27.0) Gecko/20100101 Firefox/27.0"}
        req = urllib2.Request(src_path, None, headers=headers)
        with contextlib.closing(urllib2.urlopen(req)) as snark_file:
            while (keep_alive_func()):
                line = snark_file.readline()
                if (line == ''): break
                line = re.sub("\r\n?", "\n", line)  # Local files are opened without universal newlines.
                line = line[:-1]
                lines.append(line)
    except (urllib2.HTTPError) as err:
        logging.error("Http status: %d" % err.code)
        raise common.ParserError("Parser failed.")
    except (urllib2.URLError) as err:
        logging.error(str(err))
        raise common.ParserError("Parser failed.")

    for line in lines:
        if (tail_ptn.search(line) is not None): break

        result = snark_ptn.match(line)
        if (result is None):
            # Only complain once the first snark is found.
            if (start_date is not None):
                logging.warning("Bad Line: "+ line)
            continue

        snark = {}
        snark["user"] = result.group(2)
        snark["msg"] = result.group(3)
        for (reply_ptn, reply_rep) in reply_regexes:
            snark["msg"] = reply_ptn.sub(reply_rep, snark["msg"])
        snark["msg"] = common.asciify(common.html_unescape(snark["msg"]))

        year, month, day = [int(result.group(i)) for i in [5,6,7]]
        hour, minute, second = [int(result.group(i)) for i in [8,9,10]]
        # UTC time zone?
        snark["date"] = datetime(year, month, day, hour, minute, second)

        snark["user_url"] = result.group(1)
        snark["msg_url"] = result.group(4)

        if (start_date is None):
            if (first_msg and line.find(first_msg) == -1):
                # This snark was earlier than the expected first msg.
                continue
            start_date = snark["date"]

        snarks.append(snark)

    return snarks
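
# A minimal usage sketch (illustrative only: the URL is a placeholder, and
# "MockTM" is just the account mentioned in this parser's docstring). It
# assumes the module-level "ns" option prefix used above and that logging
# has been configured by the caller.
if (__name__ == "__main__"):
    demo_options = {ns+"reply_name": "MockTM"}
    demo_snarks = fetch_snarks("http://example.com/transcript.html", None, options=demo_options)
    for demo_snark in demo_snarks:
        logging.info("%s: %s" % (demo_snark["user"], demo_snark["msg"]))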
def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_func=None):
    """Collects snarks from a Twitter search.
    Finds tweets from any account and @reply mentions of it.
    See: https://dev.twitter.com/docs/api/1/get/search

    This parser adds non-standard attributes to snarks: "user_url" and
    "msg_url", links to the user's twitter page and to the specific
    tweet. Exporters might disregard this info.

    Twitter's search API only reaches back a few days and may be
    incomplete. :/

    :param src_path: Not used.
    :param first_msg: If not None, ignore comments prior to one containing this substring.
    :param options: A dict of extra options specific to this parser.
                    reply_name:
                        The name to which replies were directed (no "@").
                    since_date (optional):
                        UTC Datetime to limit dredging up old tweets.
                    until_date (optional):
                        UTC Datetime to limit dredging up new tweets.
    :param keep_alive_func: Optional replacement to get an abort boolean.
    :param sleep_func: Optional replacement to sleep N seconds.
    :return: A List of snark dicts.
    :raises: ParserError
    """
    if (keep_alive_func is None): keep_alive_func = global_config.keeping_alive
    if (sleep_func is None): sleep_func = global_config.nap

    since_date = None
    if (ns+"since_date" in options and options[ns+"since_date"]):
        since_date = options[ns+"since_date"]

    until_date = None
    if (ns+"until_date" in options and options[ns+"until_date"]):
        until_date = options[ns+"until_date"]

    missing_options = [o for o in ["reply_name"] if ((ns+o) not in options or not options[ns+o])]
    if (len(missing_options) > 0):
        logging.error("Required parser options weren't provided: %s." % ", ".join(missing_options))
        raise common.ParserError("Parser failed.")

    snarks = []

    tweepy = tweepy_backend.get_tweepy()
    tweepy_api = tweepy_backend.get_api()
    try:
        # List of pattern/replacement tuples to strip reply topic from comments.
        reply_name_escaped = re.escape(options[ns+"reply_name"])
        reply_regexes = [(re.compile(" +@"+ reply_name_escaped +" +", re.IGNORECASE), " "),
                         (re.compile(" *@"+ reply_name_escaped +" *", re.IGNORECASE), "")]

        search_args = {"rpp":100, "include_entities":"false", "result_type":"recent"}
        search_args["q"] = "@%s OR from:%s" % (options[ns+"reply_name"], options[ns+"reply_name"])
        if (since_date): search_args["since"] = since_date.strftime("%Y-%m-%d")
        if (until_date): search_args["until"] = until_date.strftime("%Y-%m-%d")
        search_rate = {"reset":None, "limit":0, "remaining":0,
                       "res_family":"search", "res_name":"/search/tweets"}

        searches = []
        searches.append(("Search", tweepy_api.search, search_args, 1500, search_rate))

        def update_rate_info():
            # Sets new rate info values for the searches.
            rate_status = tweepy_api.rate_limit_status()
            for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
                rate_info.update(rate_status["resources"][rate_info["res_family"]][rate_info["res_name"]])

        update_rate_info()

        for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
            done = False
            query_count = 0
            results_count = 0
            last_max_id = None
            while (keep_alive_func() and done is False
                   and results_count < search_cap and rate_info["remaining"] > 0):
                results = tweepy_func(**tweepy_func_args)
                rate_info["remaining"] -= 1
                if (not results):
                    done = True
                    break
                else:
                    query_count += 1
                    results_count += len(results)
                    logging.info("%s Query % 2d: % 3d results." % (search_type, query_count, len(results)))

                last_id = None
                for search_result in results:
                    if (last_max_id == search_result.id): continue
                    last_id = search_result.id

                    snark = {}
                    snark["user"] = "@%s" % common.asciify(search_result.from_user)
                    snark["msg"] = search_result.text
                    for (reply_ptn, reply_rep) in reply_regexes:
                        snark["msg"] = reply_ptn.sub(reply_rep, snark["msg"])
                    snark["msg"] = common.asciify(common.html_unescape(snark["msg"]))
                    snark["date"] = search_result.created_at
                    snark["user_url"] = "http://www.twitter.com/%s" % common.asciify(search_result.from_user)
                    snark["msg_url"] = "http://twitter.com/#!/%s/status/%d" % (common.asciify(search_result.from_user), search_result.id)

                    if (until_date and snark["date"] > until_date):
                        continue  # This snark is too recent.
                    if (since_date and snark["date"] < since_date):
                        done = True  # This snark is too early.
                        break
                    snarks.append(snark)
                    if (first_msg):
                        if (snark["msg"].find(first_msg) != -1):
                            done = True  # Found the first comment.
                            break

                if (last_id is not None):
                    # Dig deeper into the past on the next loop.
                    tweepy_func_args["max_id"] = last_id
                    last_max_id = last_id
                else:
                    # Must've only gotten the "max_id" tweet again.
                    done = True
                    break

                if (rate_info["reset"] is not None and time.time() >= float(rate_info["reset"])):
                    update_rate_info()
                    reset_string = datetime.fromtimestamp(float(rate_info["reset"])).strftime("%Y-%m-%d %H:%M:%S")
                    logging.info("API limit for '%s' reset. Calls left: %d (Until %s)" % (rate_info["res_name"], rate_info["remaining"], reset_string))

            if (done is False and rate_info["remaining"] <= 0):
                logging.warning("Twitter API rate limit truncated results for '%s'." % rate_info["res_name"])
                break  # No more searches.

        update_rate_info()
        logging.info("Twitter API calls left...")
        for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
            reset_string = datetime.fromtimestamp(float(rate_info["reset"])).strftime("%Y-%m-%d %H:%M:%S")
            logging.info("'%s': %d (Until %s)." % (rate_info["res_name"], rate_info["remaining"], reset_string))
        logging.info("Current Time: %s" % datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    except (Exception) as err:
        logging.exception("Parser failed.")
        raise common.ParserError("Parser failed.")

    snarks = sorted(snarks, key=lambda k: k["date"])

    # Drop duplicates from multiple passes.
    snarks = uniquify_list(snarks)

    if (first_msg):
        first_index = -1
        for i in range(len(snarks)):
            if (snarks[i]["msg"].find(first_msg) != -1):
                # Finally reached the expected first msg.
                first_index = i
                break
        if (first_index >= 0):
            snarks = snarks[first_index:]
        else:
            logging.warning("first_msg string \"%s\" was not found." % first_msg)
            snarks = []

    return snarks
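
# A minimal usage sketch (illustrative only; the reply name and the date
# window are placeholders). It assumes tweepy_backend has already been
# configured with credentials elsewhere, as this module requires, and uses
# the module-level "ns" option prefix from above.
if (__name__ == "__main__"):
    from datetime import timedelta
    demo_options = {}
    demo_options[ns+"reply_name"] = "MockTM"
    demo_options[ns+"since_date"] = datetime.utcnow() - timedelta(days=3)
    demo_options[ns+"until_date"] = datetime.utcnow()
    demo_snarks = fetch_snarks(None, None, options=demo_options)
    logging.info("Fetched %d snarks." % len(demo_snarks))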