def make_api_call_for_site(self, site):
    posts = self.queue.pop(site)
    self.queue_store_lock.acquire()
    store_bodyfetcher_queue()
    self.queue_store_lock.release()
    if site == "stackoverflow.com":
        # Not all SO questions are shown in the realtime feed. We now
        # fetch all recently modified SO questions to work around that.
        min_query = ""
        if self.last_activity_date != 0:
            min_query = "&min=" + str(self.last_activity_date)
            pagesize = "50"
        else:
            pagesize = "25"
        url = "http://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw((&pagesize=" + pagesize + min_query
    else:
        url = "http://api.stackexchange.com/2.2/questions/" + ";".join(str(x) for x in posts) + "?site=" + site + "&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw(("
    # wait to make sure API has/updates post data
    time.sleep(3)
    # Respect backoff, if we were given one
    if GlobalVars.api_backoff_time > time.time():
        time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
    try:
        response = requests.get(url, timeout=20).json()
    except requests.exceptions.Timeout:
        return  # could add some retrying logic here, but eh.
    self.api_data_lock.acquire()
    add_or_update_api_data(site)
    self.api_data_lock.release()
    message_hq = ""
    if "quota_remaining" in response:
        if response["quota_remaining"] - GlobalVars.apiquota >= 1000 and GlobalVars.apiquota >= 0:
            GlobalVars.charcoal_hq.send_message("API quota rolled over with {} requests remaining.".format(GlobalVars.apiquota))
            sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
            api_quota_used_per_site = ""
            for site, quota_used in sorted_calls_per_site:
                api_quota_used_per_site = api_quota_used_per_site + site.replace('.com', '').replace('.stackexchange', '') + ": " + str(quota_used) + "\n"
            api_quota_used_per_site = api_quota_used_per_site.strip()
            GlobalVars.charcoal_hq.send_message(api_quota_used_per_site, False)
            clear_api_data()
        if response["quota_remaining"] == 0:
            GlobalVars.charcoal_hq.send_message("API reports no quota left! May be a glitch.")
            GlobalVars.charcoal_hq.send_message(str(response))  # No code format for now?
        if GlobalVars.apiquota == -1:
            GlobalVars.charcoal_hq.send_message("Restart: API quota is {}.".format(response["quota_remaining"]))
        GlobalVars.apiquota = response["quota_remaining"]
    else:
        message_hq = "The quota_remaining property was not in the API response."
    if "error_message" in response:
        message_hq = message_hq + " Error: {}.".format(response["error_message"])
    if "backoff" in response:
        if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
            GlobalVars.api_backoff_time = time.time() + response["backoff"]
        message_hq = message_hq + "\n" + "Backoff received of " + str(response["backoff"]) + " seconds."
    if len(message_hq) > 0:
        GlobalVars.charcoal_hq.send_message(message_hq.strip())
    if "items" not in response:
        return
    if site == "stackoverflow.com":
        if len(response["items"]) > 0 and "last_activity_date" in response["items"][0]:
            self.last_activity_date = response["items"][0]["last_activity_date"]
    for post in response["items"]:
        if "title" not in post or "body" not in post:
            continue
        title = GlobalVars.parser.unescape(post["title"])
        body = GlobalVars.parser.unescape(post["body"])
        link = post["link"]
        post_score = post["score"]
        up_vote_count = post["up_vote_count"]
        down_vote_count = post["down_vote_count"]
        try:
            owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
            owner_link = post["owner"]["link"]
            owner_rep = post["owner"]["reputation"]
        except:
            owner_name = ""
            owner_link = ""
            owner_rep = 0
        q_id = str(post["question_id"])
        is_spam, reason, why = check_if_spam(title, body, owner_name, owner_link, site, q_id, False, False, owner_rep, post_score)
        if is_spam:
            try:
                handle_spam(title, body, owner_name, site, link, owner_link, q_id, reason, False, why, owner_rep, post_score, up_vote_count, down_vote_count)
            except:
                print "NOP"
        try:
            for answer in post["answers"]:
                answer_title = ""
                body = answer["body"]
                print "got answer from owner with name " + owner_name
                link = answer["link"]
                a_id = str(answer["answer_id"])
                post_score = answer["score"]
                up_vote_count = answer["up_vote_count"]
                down_vote_count = answer["down_vote_count"]
                try:
                    owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                    owner_link = answer["owner"]["link"]
                    owner_rep = answer["owner"]["reputation"]
                except:
                    owner_name = ""
                    owner_link = ""
                    owner_rep = 0
                is_spam, reason, why = check_if_spam(answer_title, body, owner_name, owner_link, site, a_id, True, False, owner_rep, post_score)
                if is_spam:
                    try:
                        handle_spam(title, body, owner_name, site, link, owner_link, a_id, reason, True, why, owner_rep, post_score, up_vote_count, down_vote_count)
                    except:
                        print "NOP"
        except:
            print "no answers"
    return
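
# Every revision of this function gates its HTTP calls on a shared backoff
# timestamp. A minimal standalone sketch of that pattern follows, for
# reference only: `api_backoff_time` and `fetch_json` are hypothetical
# stand-ins here, not names from this module.
import time

import requests

api_backoff_time = 0  # epoch seconds before which we must not hit the API


def fetch_json(url):
    global api_backoff_time
    # Respect a previously received backoff, with the same 2-second
    # safety margin used above.
    if api_backoff_time > time.time():
        time.sleep(api_backoff_time - time.time() + 2)
    response = requests.get(url, timeout=20).json()
    # The API may ask us to back off; never shorten an existing window.
    if "backoff" in response:
        api_backoff_time = max(api_backoff_time, time.time() + response["backoff"])
    return response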


def make_api_call_for_site(self, site):
    if site not in self.queue:
        return

    self.queue_modify_lock.acquire()
    new_posts = self.queue.pop(site)
    store_bodyfetcher_queue()
    self.queue_modify_lock.release()

    new_post_ids = [int(k) for k in new_posts.keys()]

    if GlobalVars.flovis is not None:
        for post_id in new_post_ids:
            GlobalVars.flovis.stage('bodyfetcher/api_request', site, post_id,
                                    {'site': site, 'posts': list(new_posts.keys())})

    self.queue_timing_modify_lock.acquire()
    post_add_times = [v for k, v in new_posts.items()]
    pop_time = datetime.utcnow()

    for add_time in post_add_times:
        try:
            seconds_in_queue = (pop_time - add_time).total_seconds()
            if site in self.queue_timings:
                self.queue_timings[site].append(seconds_in_queue)
            else:
                self.queue_timings[site] = [seconds_in_queue]
        except KeyError:  # XXX: Any other possible exception?
            continue  # Skip to next item if we've got invalid data or missing values.

    store_queue_timings()
    self.queue_timing_modify_lock.release()
    self.max_ids_modify_lock.acquire()

    if site in self.previous_max_ids and max(new_post_ids) > self.previous_max_ids[site]:
        previous_max_id = self.previous_max_ids[site]
        intermediate_posts = range(previous_max_id + 1, max(new_post_ids))

        # We don't want to go over the 100-post API cutoff, so take the last
        # (100-len(new_post_ids)) from intermediate_posts
        intermediate_posts = intermediate_posts[(100 - len(new_post_ids)):]

        # new_post_ids could contain edited posts, so merge it back in
        combined = chain(intermediate_posts, new_post_ids)

        # Could be duplicates, so uniquify
        posts = list(set(combined))
    else:
        posts = new_post_ids

    try:
        if max(new_post_ids) > self.previous_max_ids[site]:
            self.previous_max_ids[site] = max(new_post_ids)
            store_bodyfetcher_max_ids()
    except KeyError:
        self.previous_max_ids[site] = max(new_post_ids)
        store_bodyfetcher_max_ids()

    self.max_ids_modify_lock.release()

    log('debug', "New IDs / Hybrid Intermediate IDs for {}:".format(site))
    if len(new_post_ids) > 30:
        log('debug', "{} +{} more".format(sorted(new_post_ids)[:30], len(new_post_ids) - 30))
    else:
        log('debug', sorted(new_post_ids))
    if len(new_post_ids) == len(posts):
        log('debug', "[ *Identical* ]")
    elif len(posts) > 30:
        log('debug', "{} +{} more".format(sorted(posts)[:30], len(posts) - 30))
    else:
        log('debug', sorted(posts))

    question_modifier = ""
    pagesize_modifier = ""

    if site == "stackoverflow.com":
        # Not all SO questions are shown in the realtime feed. We now
        # fetch all recently modified SO questions to work around that.
        if self.last_activity_date != 0:
            pagesize = "50"
        else:
            pagesize = "25"

        pagesize_modifier = "&pagesize={pagesize}" \
                            "&min={time_length}".format(pagesize=pagesize, time_length=str(self.last_activity_date))
    else:
        question_modifier = "/{0}".format(";".join([str(post) for post in posts]))

    url = "https://api.stackexchange.com/2.2/questions{q_modifier}?site={site}" \
          "&filter=!*xq08dCDNr)PlxxXfaN8ntivx(BPlY_8XASyXLX-J7F-)VK*Q3KTJVkvp*&key=IAkbitmze4B8KpacUfLqkw((" \
          "{optional_min_query_param}".format(q_modifier=question_modifier, site=site,
                                              optional_min_query_param=pagesize_modifier)

    # wait to make sure API has/updates post data
    time.sleep(3)

    GlobalVars.api_request_lock.acquire()
    # Respect backoff, if we were given one
    if GlobalVars.api_backoff_time > time.time():
        time.sleep(GlobalVars.api_backoff_time - time.time() + 2)

    try:
        time_request_made = datetime.now().strftime('%H:%M:%S')
        response = requests.get(url, timeout=20).json()
    except (requests.exceptions.Timeout, requests.ConnectionError, Exception):
        # Any failure in the request being made (timeout or otherwise) should be added back to
        # the queue.
        self.queue_modify_lock.acquire()
        if site in self.queue:
            self.queue[site].update(new_posts)
        else:
            self.queue[site] = new_posts
        self.queue_modify_lock.release()
        GlobalVars.api_request_lock.release()
        return

    self.api_data_lock.acquire()
    add_or_update_api_data(site)
    self.api_data_lock.release()

    message_hq = ""
    if "quota_remaining" in response:
        if response["quota_remaining"] - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0:
            tell_rooms_with("debug", "API quota rolled over with {0} requests remaining. "
                                     "Current quota: {1}.".format(GlobalVars.apiquota,
                                                                  response["quota_remaining"]))

            sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
            api_quota_used_per_site = ""
            for site_name, quota_used in sorted_calls_per_site:
                sanatized_site_name = site_name.replace('.com', '').replace('.stackexchange', '')
                api_quota_used_per_site += sanatized_site_name + ": {0}\n".format(str(quota_used))
            api_quota_used_per_site = api_quota_used_per_site.strip()

            tell_rooms_with("debug", api_quota_used_per_site)
            clear_api_data()
        if response["quota_remaining"] == 0:
            tell_rooms_with("debug", "API reports no quota left! May be a glitch.")
            tell_rooms_with("debug", str(response))  # No code format for now?
        if GlobalVars.apiquota == -1:
            tell_rooms_with("debug", "Restart: API quota is {quota}."
                                     .format(quota=response["quota_remaining"]))
        GlobalVars.apiquota = response["quota_remaining"]
    else:
        message_hq = "The quota_remaining property was not in the API response."

    if "error_message" in response:
        message_hq += " Error: {} at {} UTC.".format(response["error_message"], time_request_made)
        if "error_id" in response and response["error_id"] == 502:
            if GlobalVars.api_backoff_time < time.time() + 12:  # Add a backoff of 10 + 2 seconds as a default
                GlobalVars.api_backoff_time = time.time() + 12
                message_hq += " Backing off on requests for the next 12 seconds."
        message_hq += " Previous URL: `{}`".format(url)

    if "backoff" in response:
        if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
            GlobalVars.api_backoff_time = time.time() + response["backoff"]

    GlobalVars.api_request_lock.release()

    if len(message_hq) > 0:
        tell_rooms_with("debug", message_hq.strip())

    if "items" not in response:
        return

    if site == "stackoverflow.com":
        items = response["items"]
        if len(items) > 0 and "last_activity_date" in items[0]:
            self.last_activity_date = items[0]["last_activity_date"]

    num_scanned = 0
    start_time = time.time()

    for post in response["items"]:
        pnb = copy.deepcopy(post)
        if 'body' in pnb:
            pnb['body'] = 'Present, but truncated'
        if 'answers' in pnb:
            del pnb['answers']

        if "title" not in post or "body" not in post:
            if GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/no_content', site, post['question_id'], pnb)
            continue

        post['site'] = site
        try:
            post['edited'] = (post['creation_date'] != post['last_edit_date'])
        except KeyError:
            post['edited'] = False  # last_edit_date not present = not edited

        try:
            post_ = Post(api_response=post)
        except PostParseError as err:
            log('error', 'Error {0} when parsing post: {1!r}'.format(err, post_))
            if GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/error', site, post['question_id'], pnb)
            continue

        num_scanned += 1

        is_spam, reason, why = check_if_spam(post_)

        if is_spam:
            try:
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, post['question_id'],
                                            {'post': pnb, 'check_if_spam': [is_spam, reason, why]})
                handle_spam(post=post_, reasons=reason, why=why)
            except Exception as e:
                log('error', "Exception in handle_spam:", e)
        elif GlobalVars.flovis is not None and 'question_id' in post:
            GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, post['question_id'],
                                    {'post': pnb, 'check_if_spam': [is_spam, reason, why]})

        try:
            if "answers" not in post:
                pass
            else:
                for answer in post["answers"]:
                    anb = copy.deepcopy(answer)
                    if 'body' in anb:
                        anb['body'] = 'Present, but truncated'

                    num_scanned += 1
                    answer["IsAnswer"] = True  # Necesssary for Post object
                    answer["title"] = ""  # Necessary for proper Post object creation
                    answer["site"] = site  # Necessary for proper Post object creation
                    try:
                        answer['edited'] = (answer['creation_date'] != answer['last_edit_date'])
                    except KeyError:
                        answer['edited'] = False  # last_edit_date not present = not edited
                    answer_ = Post(api_response=answer, parent=post_)

                    is_spam, reason, why = check_if_spam(answer_)
                    if is_spam:
                        try:
                            if GlobalVars.flovis is not None and 'answer_id' in answer:
                                GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, answer['answer_id'],
                                                        {'post': anb, 'check_if_spam': [is_spam, reason, why]})
                            handle_spam(answer_, reasons=reason, why=why)
                        except Exception as e:
                            log('error', "Exception in handle_spam:", e)
                    elif GlobalVars.flovis is not None and 'answer_id' in answer:
                        GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, answer['answer_id'],
                                                {'post': anb, 'check_if_spam': [is_spam, reason, why]})
        except Exception as e:
            log('error', "Exception handling answers:", e)

    end_time = time.time()
    GlobalVars.posts_scan_stats_lock.acquire()
    GlobalVars.num_posts_scanned += num_scanned
    GlobalVars.post_scan_time += end_time - start_time
    GlobalVars.posts_scan_stats_lock.release()
    return


def make_api_call_for_site(self, site):
    posts = self.queue.pop(site)
    store_bodyfetcher_queue()
    if site == "stackoverflow.com":
        # Not all SO questions are shown in the realtime feed. We now
        # fetch all recently modified SO questions to work around that.
        min_query = ""
        if self.last_activity_date != 0:
            min_query = "&min=" + str(self.last_activity_date)
            pagesize = "50"
        else:
            pagesize = "25"
        url = "http://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw((&pagesize=" + pagesize + min_query
    else:
        url = "http://api.stackexchange.com/2.2/questions/" + ";".join(str(x) for x in posts) + "?site=" + site + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw(("
    # wait to make sure API has/updates post data
    time.sleep(60)
    try:
        response = requests.get(url, timeout=20).json()
    except requests.exceptions.Timeout:
        return  # could add some retrying logic here, but eh.
    add_or_update_api_data(site)
    if "quota_remaining" in response:
        if response["quota_remaining"] - GlobalVars.apiquota >= 1000 and GlobalVars.apiquota >= 0:
            GlobalVars.charcoal_hq.send_message("API quota rolled over with {} requests remaining.".format(GlobalVars.apiquota))
            sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
            api_quota_used_per_site = ""
            for site, quota_used in sorted_calls_per_site:
                api_quota_used_per_site = api_quota_used_per_site + site.replace('.com', '').replace('.stackexchange', '') + ": " + str(quota_used) + "\n"
            api_quota_used_per_site = api_quota_used_per_site.strip()
            GlobalVars.charcoal_hq.send_message(api_quota_used_per_site, False)
            clear_api_data()
        elif response["quota_remaining"] == 0:
            GlobalVars.charcoal_hq.send_message("API reports no quota left! May be a glitch.")
        GlobalVars.apiquota = response["quota_remaining"]
    else:
        GlobalVars.charcoal_hq.send_message("The quota_remaining property was not in the API response.")
    if site == "stackoverflow.com":
        if len(response["items"]) > 0 and "last_activity_date" in response["items"][0]:
            self.last_activity_date = response["items"][0]["last_activity_date"]
    for post in response["items"]:
        if "title" not in post or "body" not in post:
            continue
        title = GlobalVars.parser.unescape(post["title"])
        body = GlobalVars.parser.unescape(post["body"])
        link = post["link"]
        try:
            owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
            owner_link = post["owner"]["link"]
            owner_rep = post["owner"]["reputation"]
        except:
            owner_name = ""
            owner_link = ""
            owner_rep = 0
        q_id = str(post["question_id"])
        is_spam, reason, why = check_if_spam(title, body, owner_name, owner_link, site, q_id, False, False, owner_rep)
        if is_spam:
            try:
                handle_spam(title, body, owner_name, site, link, owner_link, q_id, reason, False, why)
            except:
                print "NOP"
        try:
            for answer in post["answers"]:
                answer_title = ""
                body = answer["body"]
                print "got answer from owner with name " + owner_name
                link = answer["link"]
                a_id = str(answer["answer_id"])
                try:
                    owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                    owner_link = answer["owner"]["link"]
                    owner_rep = answer["owner"]["reputation"]
                except:
                    owner_name = ""
                    owner_link = ""
                    owner_rep = 0
                is_spam, reason, why = check_if_spam(answer_title, body, owner_name, owner_link, site, a_id, True, False, owner_rep)
                if is_spam:
                    try:
                        handle_spam(title, body, owner_name, site, link, owner_link, a_id, reason, True, why)
                    except:
                        print "NOP"
        except:
            print "no answers"
    return
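
# Revisions like the one above funnel every title and body through
# GlobalVars.parser.unescape because the Stack Exchange API returns
# HTML-escaped strings. On Python 3 the standard library provides the same
# operation; a rough equivalent (an assumption, not this project's parser):
from html import unescape

title = unescape("Why doesn&#39;t &lt;br&gt; need a closing tag?")
# -> "Why doesn't <br> need a closing tag?"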


def make_api_call_for_site(self, site):
    if site not in self.queue:
        return
    self.queue_modify_lock.acquire()
    posts = self.queue.pop(site)
    store_bodyfetcher_queue()
    self.queue_modify_lock.release()
    question_modifier = ""
    pagesize_modifier = ""
    if site == "stackoverflow.com":
        # Not all SO questions are shown in the realtime feed. We now
        # fetch all recently modified SO questions to work around that.
        if self.last_activity_date != 0:
            pagesize = "50"
        else:
            pagesize = "25"
        pagesize_modifier = "&pagesize={pagesize}" \
                            "&min={time_length}".format(pagesize=pagesize, time_length=str(self.last_activity_date))
    else:
        question_modifier = "/{0}".format(";".join(str(post) for post in posts))
    url = "https://api.stackexchange.com/2.2/questions{q_modifier}?site={site}" \
          "&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw((" \
          "{optional_min_query_param}".format(q_modifier=question_modifier, site=site,
                                              optional_min_query_param=pagesize_modifier)
    # wait to make sure API has/updates post data
    time.sleep(3)
    GlobalVars.api_request_lock.acquire()
    # Respect backoff, if we were given one
    if GlobalVars.api_backoff_time > time.time():
        time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
    try:
        time_request_made = datetime.now().strftime('%H:%M:%S')
        response = requests.get(url, timeout=20).json()
    except (requests.exceptions.Timeout, requests.ConnectionError, Exception):
        # Any failure in the request being made (timeout or otherwise) should be added back to
        # the queue.
        self.queue_modify_lock.acquire()
        if site in self.queue:
            self.queue[site].extend(posts)
        else:
            self.queue[site] = posts
        self.queue_modify_lock.release()
        return
    self.api_data_lock.acquire()
    add_or_update_api_data(site)
    self.api_data_lock.release()
    message_hq = ""
    if "quota_remaining" in response:
        if response["quota_remaining"] - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0:
            GlobalVars.charcoal_hq.send_message("API quota rolled over with {0} requests remaining. "
                                                "Current quota: {1}.".format(GlobalVars.apiquota,
                                                                             response["quota_remaining"]))
            sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
            api_quota_used_per_site = ""
            for site_name, quota_used in sorted_calls_per_site:
                sanatized_site_name = site_name.replace('.com', '').replace('.stackexchange', '')
                api_quota_used_per_site += sanatized_site_name + ": {0}\n".format(str(quota_used))
            api_quota_used_per_site = api_quota_used_per_site.strip()
            GlobalVars.charcoal_hq.send_message(api_quota_used_per_site, False)
            clear_api_data()
        if response["quota_remaining"] == 0:
            GlobalVars.charcoal_hq.send_message("API reports no quota left! May be a glitch.")
            GlobalVars.charcoal_hq.send_message(str(response))  # No code format for now?
        if GlobalVars.apiquota == -1:
            GlobalVars.charcoal_hq.send_message("Restart: API quota is {quota}."
                                                .format(quota=response["quota_remaining"]))
        GlobalVars.apiquota = response["quota_remaining"]
    else:
        message_hq = "The quota_remaining property was not in the API response."
    if "error_message" in response:
        message_hq += " Error: {} at {} UTC.".format(response["error_message"], time_request_made)
        if "error_id" in response and response["error_id"] == 502:
            if GlobalVars.api_backoff_time < time.time() + 12:  # Add a backoff of 10 + 2 seconds as a default
                GlobalVars.api_backoff_time = time.time() + 12
                message_hq += " Backing off on requests for the next 12 seconds."
    if "backoff" in response:
        if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
            GlobalVars.api_backoff_time = time.time() + response["backoff"]
    GlobalVars.api_request_lock.release()
    if len(message_hq) > 0:
        GlobalVars.charcoal_hq.send_message(message_hq.strip())
    if "items" not in response:
        return
    if site == "stackoverflow.com":
        items = response["items"]
        if len(items) > 0 and "last_activity_date" in items[0]:
            self.last_activity_date = items[0]["last_activity_date"]
    num_scanned = 0
    start_time = time.time()
    for post in response["items"]:
        if "title" not in post or "body" not in post:
            continue
        num_scanned += 1
        title = GlobalVars.parser.unescape(post["title"])
        body = GlobalVars.parser.unescape(post["body"])
        link = post["link"]
        post_score = post["score"]
        up_vote_count = post["up_vote_count"]
        down_vote_count = post["down_vote_count"]
        try:
            owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
            owner_link = post["owner"]["link"]
            owner_rep = post["owner"]["reputation"]
        except:
            owner_name = ""
            owner_link = ""
            owner_rep = 0
        q_id = str(post["question_id"])
        is_spam, reason, why = check_if_spam(title=title, body=body, user_name=owner_name, user_url=owner_link,
                                             post_site=site, post_id=q_id, is_answer=False, body_is_summary=False,
                                             owner_rep=owner_rep, post_score=post_score)
        if is_spam:
            try:
                handle_spam(title=title, body=body, poster=owner_name, site=site, post_url=link,
                            poster_url=owner_link, post_id=q_id, reasons=reason, is_answer=False,
                            why=why, owner_rep=owner_rep, post_score=post_score,
                            up_vote_count=up_vote_count, down_vote_count=down_vote_count, question_id=None)
            except:
                print "NOP"
        try:
            for answer in post["answers"]:
                num_scanned += 1
                answer_title = ""
                body = answer["body"]
                print "got answer from owner with name " + owner_name
                link = answer["link"]
                a_id = str(answer["answer_id"])
                post_score = answer["score"]
                up_vote_count = answer["up_vote_count"]
                down_vote_count = answer["down_vote_count"]
                try:
                    owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                    owner_link = answer["owner"]["link"]
                    owner_rep = answer["owner"]["reputation"]
                except:
                    owner_name = ""
                    owner_link = ""
                    owner_rep = 0
                is_spam, reason, why = check_if_spam(title=answer_title, body=body, user_name=owner_name,
                                                     user_url=owner_link, post_site=site, post_id=a_id,
                                                     is_answer=True, body_is_summary=False,
                                                     owner_rep=owner_rep, post_score=post_score)
                if is_spam:
                    try:
                        handle_spam(title=title, body=body, poster=owner_name, site=site, post_url=link,
                                    poster_url=owner_link, post_id=a_id, reasons=reason, is_answer=True,
                                    why=why, owner_rep=owner_rep, post_score=post_score,
                                    up_vote_count=up_vote_count, down_vote_count=down_vote_count,
                                    question_id=q_id)
                    except:
                        print "NOP"
        except:
            print "no answers"
    end_time = time.time()
    GlobalVars.posts_scan_stats_lock.acquire()
    GlobalVars.num_posts_scanned += num_scanned
    GlobalVars.post_scan_time += end_time - start_time
    GlobalVars.posts_scan_stats_lock.release()
    return


def make_api_call_for_site(self, site):
    with self.queue_lock:
        new_posts = self.queue.pop(site, None)
    if new_posts is None:
        # site was not in the queue
        return
    Tasks.do(store_bodyfetcher_queue)

    new_post_ids = [int(k) for k in new_posts.keys()]

    if GlobalVars.flovis is not None:
        for post_id in new_post_ids:
            GlobalVars.flovis.stage('bodyfetcher/api_request', site, post_id,
                                    {'site': site, 'posts': list(new_posts.keys())})

    # Add queue timing data
    pop_time = datetime.utcnow()
    post_add_times = [(pop_time - v).total_seconds() for k, v in new_posts.items()]
    Tasks.do(add_queue_timing_data, site, post_add_times)

    store_max_ids = False
    with self.max_ids_modify_lock:
        if site in self.previous_max_ids and max(new_post_ids) > self.previous_max_ids[site]:
            previous_max_id = self.previous_max_ids[site]
            intermediate_posts = range(previous_max_id + 1, max(new_post_ids))

            # We don't want to go over the 100-post API cutoff, so take the last
            # (100-len(new_post_ids)) from intermediate_posts
            intermediate_posts = intermediate_posts[-(100 - len(new_post_ids)):]

            # new_post_ids could contain edited posts, so merge it back in
            combined = chain(intermediate_posts, new_post_ids)

            # Could be duplicates, so uniquify
            posts = list(set(combined))
        else:
            posts = new_post_ids

        new_post_ids_max = max(new_post_ids)
        if new_post_ids_max > self.previous_max_ids.get(site, 0):
            self.previous_max_ids[site] = new_post_ids_max
            store_max_ids = True
    if store_max_ids:
        schedule_store_bodyfetcher_max_ids()

    log('debug', "New IDs / Hybrid Intermediate IDs for {}:".format(site))
    if len(new_post_ids) > 30:
        log('debug', "{} +{} more".format(sorted(new_post_ids)[:30], len(new_post_ids) - 30))
    else:
        log('debug', sorted(new_post_ids))
    if len(new_post_ids) == len(posts):
        log('debug', "[ *Identical* ]")
    elif len(posts) > 30:
        log('debug', "{} +{} more".format(sorted(posts)[:30], len(posts) - 30))
    else:
        log('debug', sorted(posts))

    question_modifier = ""
    pagesize_modifier = {}

    if site == "stackoverflow.com":
        # Not all SO questions are shown in the realtime feed. We now
        # fetch all recently modified SO questions to work around that.
        with self.last_activity_date_lock:
            if self.last_activity_date != 0:
                pagesize = "100"
            else:
                pagesize = "50"
            pagesize_modifier = {
                'pagesize': pagesize,
                'min': str(self.last_activity_date - self.ACTIVITY_DATE_EXTRA_EARLIER_MS_TO_FETCH)
            }
    else:
        question_modifier = "/{0}".format(";".join([str(post) for post in posts]))

    url = "https://api.stackexchange.com/2.2/questions{}".format(question_modifier)
    params = {
        'filter': '!1rs)sUKylwB)8isvCRk.xNu71LnaxjnPS12*pX*CEOKbPFwVFdHNxiMa7GIVgzDAwMa',
        'key': 'IAkbitmze4B8KpacUfLqkw((',
        'site': site
    }
    params.update(pagesize_modifier)

    # wait to make sure API has/updates post data
    time.sleep(3)

    with GlobalVars.api_request_lock:
        # Respect backoff, if we were given one
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)

        try:
            time_request_made = datetime.utcnow().strftime('%H:%M:%S')
            response = requests.get(url, params=params, timeout=20).json()
        except (requests.exceptions.Timeout, requests.ConnectionError, Exception):
            # Any failure in the request being made (timeout or otherwise) should be added back to
            # the queue.
            with self.queue_lock:
                if site in self.queue:
                    self.queue[site].update(new_posts)
                else:
                    self.queue[site] = new_posts
            return

        with self.api_data_lock:
            add_or_update_api_data(site)

        message_hq = ""
        with GlobalVars.apiquota_rw_lock:
            if "quota_remaining" in response:
                quota_remaining = response["quota_remaining"]
                if quota_remaining - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0 \
                        and quota_remaining > 39980:
                    tell_rooms_with("debug", "API quota rolled over with {0} requests remaining. "
                                             "Current quota: {1}.".format(GlobalVars.apiquota, quota_remaining))

                    sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1),
                                                   reverse=True)
                    api_quota_used_per_site = ""
                    for site_name, quota_used in sorted_calls_per_site:
                        sanatized_site_name = site_name.replace('.com', '').replace('.stackexchange', '')
                        api_quota_used_per_site += sanatized_site_name + ": {0}\n".format(str(quota_used))
                    api_quota_used_per_site = api_quota_used_per_site.strip()

                    tell_rooms_with("debug", api_quota_used_per_site)
                    clear_api_data()
                if quota_remaining == 0:
                    tell_rooms_with("debug", "API reports no quota left! May be a glitch.")
                    tell_rooms_with("debug", str(response))  # No code format for now?
                if GlobalVars.apiquota == -1:
                    tell_rooms_with("debug", "Restart: API quota is {quota}.".format(quota=quota_remaining))
                GlobalVars.apiquota = quota_remaining
            else:
                message_hq = "The quota_remaining property was not in the API response."

        if "error_message" in response:
            message_hq += " Error: {} at {} UTC.".format(response["error_message"], time_request_made)
            if "error_id" in response and response["error_id"] == 502:
                if GlobalVars.api_backoff_time < time.time() + 12:  # Add a backoff of 10 + 2 seconds as a default
                    GlobalVars.api_backoff_time = time.time() + 12
                    message_hq += " Backing off on requests for the next 12 seconds."
            message_hq += " Previous URL: `{}`".format(url)

        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]

    if len(message_hq) > 0 and "site is required" not in message_hq:
        message_hq = message_hq.strip()
        if len(message_hq) > 500:
            message_hq = "\n" + message_hq
        tell_rooms_with("debug", message_hq)

    if "items" not in response:
        return

    if site == "stackoverflow.com":
        items = response["items"]
        if len(items) > 0 and "last_activity_date" in items[0]:
            with self.last_activity_date_lock:
                self.last_activity_date = items[0]["last_activity_date"]

    num_scanned = 0
    start_time = time.time()

    for post in response["items"]:
        if GlobalVars.flovis is not None:
            pnb = copy.deepcopy(post)
            if 'body' in pnb:
                pnb['body'] = 'Present, but truncated'
            if 'answers' in pnb:
                del pnb['answers']

        if "title" not in post or "body" not in post:
            if GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/no_content', site, post['question_id'], pnb)
            continue

        post['site'] = site
        try:
            post['edited'] = (post['creation_date'] != post['last_edit_date'])
        except KeyError:
            post['edited'] = False  # last_edit_date not present = not edited

        question_doesnt_need_scan = is_post_recently_scanned_and_unchanged(post)
        add_recently_scanned_post(post)
        if not question_doesnt_need_scan:
            try:
                post_ = Post(api_response=post)
            except PostParseError as err:
                log('error', 'Error {0} when parsing post: {1!r}'.format(err, post_))
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage('bodyfetcher/api_response/error', site, post['question_id'], pnb)
                continue

            num_scanned += 1
            is_spam, reason, why = check_if_spam(post_)
            if is_spam:
                try:
                    if GlobalVars.flovis is not None and 'question_id' in post:
                        GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, post['question_id'],
                                                {'post': pnb, 'check_if_spam': [is_spam, reason, why]})
                    handle_spam(post=post_, reasons=reason, why=why)
                except Exception as e:
                    log('error', "Exception in handle_spam:", e)
            elif GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, post['question_id'],
                                        {'post': pnb, 'check_if_spam': [is_spam, reason, why]})

        try:
            if "answers" not in post:
                pass
            else:
                for answer in post["answers"]:
                    if GlobalVars.flovis is not None:
                        anb = copy.deepcopy(answer)
                        if 'body' in anb:
                            anb['body'] = 'Present, but truncated'

                    num_scanned += 1
                    answer["IsAnswer"] = True  # Necesssary for Post object
                    answer["title"] = ""  # Necessary for proper Post object creation
                    answer["site"] = site  # Necessary for proper Post object creation
                    try:
                        answer['edited'] = (answer['creation_date'] != answer['last_edit_date'])
                    except KeyError:
                        answer['edited'] = False  # last_edit_date not present = not edited

                    answer_doesnt_need_scan = is_post_recently_scanned_and_unchanged(answer)
                    add_recently_scanned_post(answer)
                    if answer_doesnt_need_scan:
                        continue

                    answer_ = Post(api_response=answer, parent=post_)
                    is_spam, reason, why = check_if_spam(answer_)
                    if is_spam:
                        try:
                            if GlobalVars.flovis is not None and 'answer_id' in answer:
                                GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, answer['answer_id'],
                                                        {'post': anb, 'check_if_spam': [is_spam, reason, why]})
                            handle_spam(answer_, reasons=reason, why=why)
                        except Exception as e:
                            log('error', "Exception in handle_spam:", e)
                    elif GlobalVars.flovis is not None and 'answer_id' in answer:
                        GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, answer['answer_id'],
                                                {'post': anb, 'check_if_spam': [is_spam, reason, why]})
        except Exception as e:
            log('error', "Exception handling answers:", e)

    end_time = time.time()
    scan_time = end_time - start_time
    GlobalVars.PostScanStat.add_stat(num_scanned, scan_time)
    return
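
# The revision above stops hand-concatenating the query string and instead
# passes a params dict to requests, which URL-encodes the filter, key, and
# site values. A minimal sketch of the same call shape (the filter and key
# strings are copied from the code above; treat the whole request, including
# the question IDs, as illustrative):
import requests

url = "https://api.stackexchange.com/2.2/questions/12345;12346"
params = {
    'filter': '!1rs)sUKylwB)8isvCRk.xNu71LnaxjnPS12*pX*CEOKbPFwVFdHNxiMa7GIVgzDAwMa',
    'key': 'IAkbitmze4B8KpacUfLqkw((',
    'site': 'stackoverflow',
}
response = requests.get(url, params=params, timeout=20).json()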


def make_api_call_for_site(self, site):
    if site not in self.queue:
        return

    self.queue_modify_lock.acquire()
    new_posts = self.queue.pop(site)
    store_bodyfetcher_queue()
    self.queue_modify_lock.release()

    new_post_ids = [int(k) for k, v in new_posts.items()]

    if GlobalVars.flovis is not None:
        for post_id in new_post_ids:
            GlobalVars.flovis.stage('bodyfetcher/api_request', site, post_id,
                                    {'queue': dict([[sk, [k for k, v in sq.items()]] for sk, sq in self.queue.items()]),
                                     'site': site, 'posts': [k for k, v in new_posts.items()]})

    self.queue_timing_modify_lock.acquire()
    post_add_times = [v for k, v in new_posts.items()]
    pop_time = datetime.utcnow()

    for add_time in post_add_times:
        try:
            seconds_in_queue = (pop_time - add_time).total_seconds()
            if site in self.queue_timings:
                self.queue_timings[site].append(seconds_in_queue)
            else:
                self.queue_timings[site] = [seconds_in_queue]
        except:
            continue  # Skip to next item if we've got invalid data or missing values.

    store_queue_timings()
    self.queue_timing_modify_lock.release()
    self.max_ids_modify_lock.acquire()

    if site in self.previous_max_ids and max(new_post_ids) > self.previous_max_ids[site]:
        previous_max_id = self.previous_max_ids[site]
        intermediate_posts = range(previous_max_id + 1, max(new_post_ids))

        # We don't want to go over the 100-post API cutoff, so take the last
        # (100-len(new_post_ids)) from intermediate_posts
        intermediate_posts = intermediate_posts[(100 - len(new_post_ids)):]

        # new_post_ids could contain edited posts, so merge it back in
        combined = chain(intermediate_posts, new_post_ids)

        # Could be duplicates, so uniquify
        posts = list(set(combined))
    else:
        posts = new_post_ids

    try:
        if max(new_post_ids) > self.previous_max_ids[site]:
            self.previous_max_ids[site] = max(new_post_ids)
            store_bodyfetcher_max_ids()
    except KeyError:
        self.previous_max_ids[site] = max(new_post_ids)
        store_bodyfetcher_max_ids()

    self.max_ids_modify_lock.release()

    log('debug', "New IDs / Hybrid Intermediate IDs for {0}:".format(site))
    log('debug', sorted(new_post_ids))
    log('debug', sorted(posts))

    question_modifier = ""
    pagesize_modifier = ""

    if site == "stackoverflow.com":
        # Not all SO questions are shown in the realtime feed. We now
        # fetch all recently modified SO questions to work around that.
        if self.last_activity_date != 0:
            pagesize = "50"
        else:
            pagesize = "25"

        pagesize_modifier = "&pagesize={pagesize}" \
                            "&min={time_length}".format(pagesize=pagesize, time_length=str(self.last_activity_date))
    else:
        question_modifier = "/{0}".format(";".join(str(post) for post in posts))

    url = "https://api.stackexchange.com/2.2/questions{q_modifier}?site={site}" \
          "&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw((" \
          "{optional_min_query_param}".format(q_modifier=question_modifier, site=site,
                                              optional_min_query_param=pagesize_modifier)

    # wait to make sure API has/updates post data
    time.sleep(3)

    GlobalVars.api_request_lock.acquire()
    # Respect backoff, if we were given one
    if GlobalVars.api_backoff_time > time.time():
        time.sleep(GlobalVars.api_backoff_time - time.time() + 2)

    try:
        time_request_made = datetime.now().strftime('%H:%M:%S')
        response = requests.get(url, timeout=20).json()
    except (requests.exceptions.Timeout, requests.ConnectionError, Exception):
        # Any failure in the request being made (timeout or otherwise) should be added back to
        # the queue.
        self.queue_modify_lock.acquire()
        if site in self.queue:
            self.queue[site].update(new_posts)
        else:
            self.queue[site] = new_posts
        self.queue_modify_lock.release()
        GlobalVars.api_request_lock.release()
        return

    self.api_data_lock.acquire()
    add_or_update_api_data(site)
    self.api_data_lock.release()

    message_hq = ""
    if "quota_remaining" in response:
        if response["quota_remaining"] - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0:
            tell_rooms_with("debug", "API quota rolled over with {0} requests remaining. "
                                     "Current quota: {1}.".format(GlobalVars.apiquota,
                                                                  response["quota_remaining"]))

            sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
            api_quota_used_per_site = ""
            for site_name, quota_used in sorted_calls_per_site:
                sanatized_site_name = site_name.replace('.com', '').replace('.stackexchange', '')
                api_quota_used_per_site += sanatized_site_name + ": {0}\n".format(str(quota_used))
            api_quota_used_per_site = api_quota_used_per_site.strip()

            tell_rooms_with("debug", api_quota_used_per_site)
            clear_api_data()
        if response["quota_remaining"] == 0:
            tell_rooms_with("debug", "API reports no quota left! May be a glitch.")
            tell_rooms_with("debug", str(response))  # No code format for now?
        if GlobalVars.apiquota == -1:
            tell_rooms_with("debug", "Restart: API quota is {quota}."
                                     .format(quota=response["quota_remaining"]))
        GlobalVars.apiquota = response["quota_remaining"]
    else:
        message_hq = "The quota_remaining property was not in the API response."

    if "error_message" in response:
        message_hq += " Error: {} at {} UTC.".format(response["error_message"], time_request_made)
        if "error_id" in response and response["error_id"] == 502:
            if GlobalVars.api_backoff_time < time.time() + 12:  # Add a backoff of 10 + 2 seconds as a default
                GlobalVars.api_backoff_time = time.time() + 12
                message_hq += " Backing off on requests for the next 12 seconds."
        message_hq += " Previous URL: `{}`".format(url)

    if "backoff" in response:
        if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
            GlobalVars.api_backoff_time = time.time() + response["backoff"]

    GlobalVars.api_request_lock.release()

    if len(message_hq) > 0:
        tell_rooms_with("debug", message_hq.strip())

    if "items" not in response:
        return

    if site == "stackoverflow.com":
        items = response["items"]
        if len(items) > 0 and "last_activity_date" in items[0]:
            self.last_activity_date = items[0]["last_activity_date"]

    num_scanned = 0
    start_time = time.time()

    for post in response["items"]:
        if "title" not in post or "body" not in post:
            if GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/no_content', site, post['question_id'], post)
            continue

        post['site'] = site
        try:
            post_ = Post(api_response=post)
        except PostParseError as err:
            log('error', 'Error {0} when parsing post: {1!r}'.format(err, post_))
            if GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/error', site, post['question_id'], post)
            continue

        num_scanned += 1

        is_spam, reason, why = check_if_spam(post_)

        if is_spam:
            try:
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, post['question_id'],
                                            {'post': post, 'check_if_spam': [is_spam, reason, why]})
                handle_spam(post=post_, reasons=reason, why=why)
            except Exception as e:
                log('error', "Exception in handle_spam:", e)
        elif GlobalVars.flovis is not None and 'question_id' in post:
            GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, post['question_id'],
                                    {'post': post, 'check_if_spam': [is_spam, reason, why]})

        try:
            if "answers" not in post:
                pass
            else:
                for answer in post["answers"]:
                    num_scanned += 1
                    answer["IsAnswer"] = True  # Necesssary for Post object
                    answer["title"] = ""  # Necessary for proper Post object creation
                    answer["site"] = site  # Necessary for proper Post object creation
                    answer_ = Post(api_response=answer, parent=post_)

                    is_spam, reason, why = check_if_spam(answer_)
                    if is_spam:
                        try:
                            if GlobalVars.flovis is not None and 'answer_id' in answer:
                                GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, answer['answer_id'],
                                                        {'post': answer, 'check_if_spam': [is_spam, reason, why]})
                            handle_spam(answer_, reasons=reason, why=why)
                        except Exception as e:
                            log('error', "Exception in handle_spam:", e)
                    elif GlobalVars.flovis is not None and 'answer_id' in answer:
                        GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, answer['answer_id'],
                                                {'post': answer, 'check_if_spam': [is_spam, reason, why]})
        except Exception as e:
            log('error', "Exception handling answers:", e)

    end_time = time.time()
    GlobalVars.posts_scan_stats_lock.acquire()
    GlobalVars.num_posts_scanned += num_scanned
    GlobalVars.post_scan_time += end_time - start_time
    GlobalVars.posts_scan_stats_lock.release()
    return


def make_api_call_for_site(self, site):
    self.queue_modify_lock.acquire()
    if site not in self.queue:
        GlobalVars.charcoal_hq.send_message("Attempted API call to {} but there are no posts to fetch.".format(site))
        return
    posts = self.queue.pop(site)
    store_bodyfetcher_queue()
    self.queue_modify_lock.release()
    question_modifier = ""
    pagesize_modifier = ""
    if site == "stackoverflow.com":
        # Not all SO questions are shown in the realtime feed. We now
        # fetch all recently modified SO questions to work around that.
        if self.last_activity_date != 0:
            pagesize = "50"
        else:
            pagesize = "25"
        pagesize_modifier = "&pagesize={pagesize}&min={time_length}".format(pagesize=pagesize,
                                                                            time_length=str(self.last_activity_date))
    else:
        question_modifier = "/{0}".format(";".join(str(post) for post in posts))
    url = "http://api.stackexchange.com/2.2/questions{q_modifier}?site={site}&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw(({optional_min_query_param}".format(q_modifier=question_modifier, site=site, optional_min_query_param=pagesize_modifier)
    # wait to make sure API has/updates post data
    time.sleep(3)
    # Respect backoff, if we were given one
    if GlobalVars.api_backoff_time > time.time():
        time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
    try:
        response = requests.get(url, timeout=20).json()
    except requests.exceptions.Timeout:
        return  # could add some retrying logic here, but eh.
    self.api_data_lock.acquire()
    add_or_update_api_data(site)
    self.api_data_lock.release()
    message_hq = ""
    if "quota_remaining" in response:
        if response["quota_remaining"] - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0:
            GlobalVars.charcoal_hq.send_message("API quota rolled over with {0} requests remaining. Current quota: {1}.".format(GlobalVars.apiquota, response["quota_remaining"]))
            sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
            api_quota_used_per_site = ""
            for site_name, quota_used in sorted_calls_per_site:
                api_quota_used_per_site += site_name.replace('.com', '').replace('.stackexchange', '') + ": {0}\n".format(str(quota_used))
            api_quota_used_per_site = api_quota_used_per_site.strip()
            GlobalVars.charcoal_hq.send_message(api_quota_used_per_site, False)
            clear_api_data()
        if response["quota_remaining"] == 0:
            GlobalVars.charcoal_hq.send_message("API reports no quota left! May be a glitch.")
            GlobalVars.charcoal_hq.send_message(str(response))  # No code format for now?
        if GlobalVars.apiquota == -1:
            GlobalVars.charcoal_hq.send_message("Restart: API quota is {quota}.".format(quota=response["quota_remaining"]))
        GlobalVars.apiquota = response["quota_remaining"]
    else:
        message_hq = "The quota_remaining property was not in the API response."
    if "error_message" in response:
        message_hq += " Error: {}.".format(response["error_message"])
    if "backoff" in response:
        if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
            GlobalVars.api_backoff_time = time.time() + response["backoff"]
        match = regex.compile('/2.2/([^.]*)').search(url)
        url_part = match.group(1) if match else url
        message_hq += "\nBackoff received of {} seconds on request to `{}`".format(str(response["backoff"]), url_part)
    if len(message_hq) > 0:
        GlobalVars.charcoal_hq.send_message(message_hq.strip())
    if "items" not in response:
        return
    if site == "stackoverflow.com":
        items = response["items"]
        if len(items) > 0 and "last_activity_date" in items[0]:
            self.last_activity_date = items[0]["last_activity_date"]
    for post in response["items"]:
        if "title" not in post or "body" not in post:
            continue
        title = GlobalVars.parser.unescape(post["title"])
        body = GlobalVars.parser.unescape(post["body"])
        link = post["link"]
        post_score = post["score"]
        up_vote_count = post["up_vote_count"]
        down_vote_count = post["down_vote_count"]
        try:
            owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
            owner_link = post["owner"]["link"]
            owner_rep = post["owner"]["reputation"]
        except:
            owner_name = ""
            owner_link = ""
            owner_rep = 0
        q_id = str(post["question_id"])
        is_spam, reason, why = check_if_spam(title, body, owner_name, owner_link, site, q_id, False, False,
                                             owner_rep, post_score)
        if is_spam:
            try:
                handle_spam(title, body, owner_name, site, link, owner_link, q_id, reason, False, why,
                            owner_rep, post_score, up_vote_count, down_vote_count, None)
            except:
                print "NOP"
        try:
            for answer in post["answers"]:
                answer_title = ""
                body = answer["body"]
                print "got answer from owner with name " + owner_name
                link = answer["link"]
                a_id = str(answer["answer_id"])
                post_score = answer["score"]
                up_vote_count = answer["up_vote_count"]
                down_vote_count = answer["down_vote_count"]
                try:
                    owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                    owner_link = answer["owner"]["link"]
                    owner_rep = answer["owner"]["reputation"]
                except:
                    owner_name = ""
                    owner_link = ""
                    owner_rep = 0
                is_spam, reason, why = check_if_spam(answer_title, body, owner_name, owner_link, site, a_id,
                                                     True, False, owner_rep, post_score)
                if is_spam:
                    try:
                        handle_spam(title, body, owner_name, site, link, owner_link, a_id, reason, True, why,
                                    owner_rep, post_score, up_vote_count, down_vote_count, q_id)
                    except:
                        print "NOP"
        except:
            print "no answers"
    return