def validate(self, apartment):
    logger.info('[{}] [Validator] START'.format(apartment.get('city')))
    try:
        self.examine_single_apartment(apartment)
    except Exception as e:
        logger.error('[{}] [Validator] [validate] err'.format(apartment.get('city')))
        logger.exception(e)
def task_clean(taskid):
    '''Clean up all data belonging to a task.'''
    try:
        if not taskid:
            return False
        # Delete the task itself
        db.exec('delete from task where id=:id', {'id': taskid})
        # Remove task executions for this task ID
        db.exec("delete from task_execute where task_id=:id", {'id': taskid})
        # Uses an IN clause; for testing only, performance is poor
        taskIds = [taskid]
        sqlIn = "','".join(['spider_execute_%s'] * len(taskIds))
        db.exec(
            "delete from djcelery_periodictask where name in ('%s')" % sqlIn,
            taskIds)
        # Remove crawled URLs for this task ID
        db.exec("delete from spider_url where task_id=:id", {'id': taskid})
        # Remove data-processing results for this task ID
        db.exec("delete from task_piping where task_id=:id", {'id': taskid})
        db.exec("delete from task_piping_result where task_id=:id", {'id': taskid})
        #db.exec("delete from piping_result where task_id=:id", {'id': taskid})
        # Delete MongoDB data for this task ID
        mgdb.remove_taskid(taskid)
        # Clean up files, snapshots, etc. for this task ID @todo
    except Exception as e:
        logger.exception(e)
        return False
def start_one_url(self, task):
    try:
        logger.info('[{}] [DetailCrawler] Start crawl new link'.format(task.get('city')))
        self.get(task.get('url'))
        logger.info('[{}] [DetailCrawler] Url opened'.format(task.get('city')))
        info = get_info_of_single_url(self.driver, task.get('url'))
        logger.info('[{}] [DetailCrawler] Data get'.format(task.get('city')))
        mongo.insert_into_staing(task, info)
    except ApartmentExpiredException:
        logger.info('[{}] [DetailCrawler] Url expired'.format(task.get('city')))
        mongo.task_expired(task)
    except NoSuchElementException:
        # probably proxy blocked
        logger.info('[{}] [DetailCrawler] Elm not found'.format(task.get('city')))
        self.renew_driver()
    except (TimeoutException, WebDriverException, InvalidSessionIdException):
        logger.info('[{}] [DetailCrawler] Session timeout'.format(task.get('city')))
        self.renew_driver()
    except TooManyTimesException:
        pass
    except Exception as e:
        logger.exception(e)
        mongo.update_failure_task(task, e, self.driver.page_source)
    finally:
        self.quit()
async def on_message(self, message: discord.Message):
    if not message.author.bot and message.guild and message.guild.id != RELAY_ID:
        # moderation system
        if text_filter(message.content, message.author, message.guild) or content_filter(message):
            await message.delete()
        else:
            message_chain: list = self.bucket[message.guild.id][message.author.id]
            message_chain.append(message)
            joined = " ".join([m.content for m in message_chain])
            if text_filter(joined, message.author, message.guild):
                for m in message_chain:
                    try:
                        await m.delete()
                    except Exception:
                        logger.exception(f"Cannot delete message {m.content}")
                message_chain.clear()
            elif len(message_chain) > int(cfg["Performance"]["moderation-buffer-limit"]):
                message_chain.pop(0)
def load_post_list_file(archive_file_path: str):
    """
    Saves all posts and their comments into the database from the list of post URLs
    in the specified file.
    """
    global reddit, subreddit
    with open(archive_file_path, "r") as archive_file:
        post_list = [s.strip() for s in archive_file.readlines()]
    for post in post_list:
        try:
            reddit = praw.Reddit(**config_loader.REDDIT["auth"])
            subreddit = reddit.subreddit(config_loader.REDDIT["subreddit"])
            reddit_submission = reddit.submission(url=post)
            if reddit_submission.subreddit_name_prefixed != subreddit.display_name_prefixed:
                logger.info(f"Post {post} is not on {subreddit.display_name_prefixed}, skipping...")
                continue
            save_post_and_comments(reddit_submission)
        except Exception:
            logger.exception(f"Unable to save {post}, continuing in 30 seconds...")
            time.sleep(30)
def task_start(taskid):
    try:
        task = db.fetchone('select * from task where id=:id', {'id': taskid})
        if not task:
            return False
        startUrls = json.loads(task['start_urls'])
        executedata = {
            'site_id': task['site_id'],
            'task_id': task['id'],
            'app_id': task['app_id'],
            'task_type': task['type'],
            'start_urls': task['start_urls'],
            'domain': getDomainNoPort(startUrls[0]),
            'exec_level': task['exec_level'],
            'limit_depth': task['limit_depth'],
            'limit_total': task['limit_total'],
            'limit_time': task['limit_time'],
            'limit_subdomain': task['limit_subdomain'],
            'limit_image': task['limit_image'],
            'limit_js': task['limit_js'],
            'limit_jsevent': task['limit_jsevent'],
            'exclude_urls': task['exclude_urls'],
            'url_unique_mode': task['url_unique_mode'],
            'notify_url': task['notify_url'],
            'source_ip': task['source_ip'],
            'proxies': task['proxies'],
            'status': 0,
        }
        executeid = db.insert('task_execute', executedata)
        return executeid
    except Exception as e:
        logger.exception(e)
        return False
def doInit(self, model_filename=Config.Model.MARS_DIR, batch_size=1):
    try:
        self.encoder = pedestrian_extractor.create_box_encoder(
            model_filename, batch_size=batch_size)
    except Exception:
        logger.exception("CUDA out of memory", exc_info=True)
    print(self.name, '=' * 10)
def send_webhook_message(channel_webhook_url, json_content, retries=3):
    """
    Send a message to the specified channel via a webhook.

    :param channel_webhook_url: full URL for the receiving webhook
    :param json_content: dictionary containing data to send (usually "content" or "embed" keys)
    :param retries: number of times to attempt to send message again if it fails
    :return: True if message was successfully sent, False otherwise
    """
    if not config_loader.DISCORD["enabled"]:
        return True
    attempt = 0
    while attempt <= retries:
        try:
            response = requests.post(channel_webhook_url, json=json_content)
            if response.status_code in (200, 204):
                return True
            logger.warning(f"Webhook response {response.status_code}: {response.text}")
        except Exception:
            logger.exception("Unexpected error while attempting to send webhook message.")
        time.sleep(5)
        attempt += 1
    logger.error(f"Unable to send webhook message, content: {json_content}")
    return False
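# Hypothetical usage sketch for send_webhook_message(); the webhook URL and payload
# below are illustrative placeholders, not values from this project.
delivered = send_webhook_message(
    "https://discord.com/api/webhooks/000000/placeholder-token",  # placeholder URL
    {"content": "Nightly job finished"},                          # simple text payload
    retries=2,
)
if not delivered:
    logger.error("Notification could not be delivered")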
def scrape_image(image_url: str, slug: str) -> Path:
    if isinstance(image_url, str):  # Handles String Types
        image_url = image_url
    if isinstance(image_url, list):  # Handles List Types
        image_url = image_url[0]
    if isinstance(image_url, dict):  # Handles Dictionary Types
        for key in image_url:
            if key == "url":
                image_url = image_url.get("url")
    filename = slug + "." + image_url.split(".")[-1]
    filename = IMG_DIR.joinpath(filename)
    try:
        r = requests.get(image_url, stream=True)
    except Exception:
        logger.exception("Fatal Image Request Exception")
        return None
    if r.status_code == 200:
        r.raw.decode_content = True
        with open(filename, "wb") as f:
            shutil.copyfileobj(r.raw, f)
        return filename
    return None
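# Hypothetical usage sketch for scrape_image(); the URL and slug are placeholders.
# The function also accepts a list of URLs (the first entry is used) or a dict with
# a "url" key, as handled by the isinstance checks above.
image_path = scrape_image("https://example.com/images/pancakes.jpg", "fluffy-pancakes")
if image_path is None:
    logger.warning("Image could not be downloaded")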
async def on_error(event, *args, **kwargs):
    try:
        raise event
    except discord.HTTPException:
        os.system("kill 1")  # hard restart on 429
    except Exception:
        logger.exception(event)
def doInit(self):
    face_graph = tf_graph.FaceGraph()
    try:
        self.extractor = face_extractor.FacenetExtractor(
            face_graph, model_path=Config.Model.COEFF_DIR)
    except Exception:
        logger.exception("CUDA out of memory", exc_info=True)
    self.preprocessor = preprocess.Preprocessor()
    print(self.name, '=' * 10)
def migrate_flairs():
    """
    Fetch all users with flairs in the subreddit and convert them to the emoji format.
    """
    global reddit, subreddit
    users_to_update = []
    total_users = 0
    users_with_flair = 0
    while True:
        try:
            logger.info("Connecting to Reddit...")
            reddit = praw.Reddit(**config_loader.REDDIT["auth"])
            subreddit = reddit.subreddit(config_loader.REDDIT["subreddit"])
            logger.info("Loading flairs...")
            # Generator will load in batches of 1000 from Reddit, this covers the entire sub.
            for user_flair in subreddit.flair(limit=None):
                new_flair_text, template_id = _parse_flair(user_flair)
                if user_flair["flair_css_class"] is not None and \
                        user_flair["flair_css_class"].startswith("v-"):
                    verified_users.append(user_flair["user"].name)
                if new_flair_text:
                    users_with_flair += 1
                    users_to_update.append({
                        "user": user_flair["user"],
                        "flair_text": new_flair_text,
                        "flair_template_id": template_id
                    })
                # Update users in a batch
                if len(users_to_update) >= 100:
                    # subreddit.flair.update(flair_list=users_to_update)
                    total_users += len(users_to_update)
                    logger.info(f"Updated {len(users_to_update)} users, {total_users} total")
                    users_to_update = []
            if users_to_update:
                # subreddit.flair.update(flair_list=users_to_update)
                total_users += len(users_to_update)
                logger.info(f"Updated {len(users_to_update)} users, {total_users} total")
            logger.info(f"{users_with_flair} / {total_users} users successfully migrated!")
            break
        except Exception:
            delay_time = 30
            logger.exception(f"Encountered an unexpected error, restarting in {delay_time} seconds...")
            time.sleep(delay_time)
def doInit(self):
    face_graph = tf_graph.FaceGraph()
    try:
        self.extractor = face_extractor.ArcFaceExtractor(
            model_path=Config.Model.ARCFACE_DIR)
    except Exception:
        logger.exception("CUDA out of memory", exc_info=True)
    print(self.name, '=' * 10)
def doInit(self):
    face_graph = tf_graph.FaceGraph()
    try:
        self.face_detector = face_detector.MTCNNDetector(
            face_graph, scale_factor=Config.MTCNN.SCALE_FACTOR)
    except Exception:
        logger.exception("CUDA device out of memory")
    super(FaceDetectWorker, self).__init__()
    self.face_count = 0
    self.detected_frame_count = 0
    print(self.name, '=' * 10)
def connect(self):
    """Connect to the configured host and port."""
    if not self.connected:
        try:
            self._sock.connect((self.domain, self.port))
        except socket.error as e:
            logger.exception(e)
        else:
            self.connected = 1
            logger.debug('TCPClient connect to {0}:{1} success.'.format(
                self.domain, self.port))
def doInit(self, use_coeff_filter=True):
    try:
        self.embs_extractor = face_extractor.ArcFaceExtractor(
            model_path=Config.Model.ARCFACE_DIR)
        self.use_coeff_filter = use_coeff_filter
        if use_coeff_filter:
            coeff_graph = tf_graph.FaceGraph()
            self.coeff_extractor = face_extractor.FacenetExtractor(
                coeff_graph, model_path=Config.Model.COEFF_DIR)
    except Exception:
        logger.exception("CUDA out of memory", exc_info=True)
    print(self.name, '=' * 10)
def encrypt(string, salt='', encrypt_way='MD5'):
    """Hash the input string with the given salt using the specified algorithm and return the hex digest."""
    string += salt
    if encrypt_way.upper() == 'MD5':
        hash_string = hashlib.md5()
    elif encrypt_way.upper() == 'SHA1':
        hash_string = hashlib.sha1()
    else:
        logger.exception(EncryptError('Please specify a valid algorithm; only MD5 and SHA1 are currently supported'))
        return False
    hash_string.update(string.encode())
    return hash_string.hexdigest()
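# Hypothetical usage sketch for encrypt(); inputs are illustrative only.
md5_digest = encrypt('password', salt='pepper')                        # 32-char hex digest
sha1_digest = encrypt('password', salt='pepper', encrypt_way='SHA1')   # 40-char hex digest
unsupported = encrypt('password', encrypt_way='SHA256')                # returns False and logs an error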
def doInit(self):
    try:
        self.pedestrian_detector = pedestrian_detector.YOLODetector()
    except Exception:
        logger.exception("CUDA device out of memory")
    super(PedestrianDetectWorker, self).__init__()
    roi_cordinate = Config.ROI.ROI_CORDINATE[Config.ROI.USE]
    self.roi_cordinate_np = self.ConvertCordinates(roi_cordinate)
    self.rect = cv2.boundingRect(self.roi_cordinate_np)
    self.roi_cordinate_np_scale = self.roi_cordinate_np - np.array([self.rect[0], self.rect[1]])
    self.centroids = (int(self.rect[2] / 2), int(self.rect[3] / 2))
    self.pedestrian_count = 0
    self.detected_frame_count = 0
    self.background_subtraction = background_subtraction.BGSProcess()
    print(self.name, '=' * 10)
def examine_single_apartment(self, apartment):
    try:
        examine_apartment(apartment)
        inserted_id = mongo.on_pass_validation(apartment)
        self.notify(inserted_id)
        logger.info('[{}] [Validator] pass validation'.format(apartment.get('city')))
    except ValidatorInvalidValueException as e1:
        logger.info('[{}] Found invalid value'.format(apartment.get('city')))
        invalid_values = e1.args[1]
        mongo.report_invalid_value(apartment, invalid_values)
    except Exception as e:
        logger.error('[{}] [Validator] [examine_single_apartment] err'.format(apartment.get('city')))
        logger.exception(e)
        mongo.report_unexpected_error(
            'data_validator', e,
            apartment.get('house_url') if apartment else None)
def ydfs_upload(key, localfile):
    try:
        cmd = '%s/dfs_client -action upload -config %s/client.json --filekey %s --file=%s' % (
            PATH_GO, PATH_GO, key, localfile)
        child = Popen(cmd, shell=True, close_fds=True, bufsize=-1,
                      stdout=PIPE, stderr=STDOUT)
        output = child.stdout.read().decode()
        #remove(filename)
        return output
    except Exception as e:
        logger.exception(e)
        return False
def execute_register(patient: dict) -> None:
    try:
        driver = set_chrome_driver()
        driver.get(SEARCH_URL)
        search_bar = driver.find_element(
            By.CSS_SELECTOR, 'input[name="ctl00$ContentPlaceHolder1$tbxDt"]')
        search_bar.send_keys(patient['doc_name'])
        search_button = driver.find_element(
            By.CSS_SELECTOR, 'input[name="ctl00$ContentPlaceHolder1$btnDtQuery"]')
        search_button.click()
        time.sleep(1)
        # get date table
        schedule = driver.find_elements(
            By.CSS_SELECTOR,
            'table[id="ctl00_ContentPlaceHolder1_gvDtQuery"] > tbody > tr')
        # get page list --> start at last page
        found_date = False
        page_list = schedule[-1].find_elements(By.CSS_SELECTOR, 'a')
        page_list[-1].click()
        time.sleep(1)
        rows = driver.find_elements(
            By.CSS_SELECTOR,
            'table[id="ctl00_ContentPlaceHolder1_gvDtQuery"] > tbody > tr')
        for j in range(2, len(rows) - 1):
            values = rows[j].find_elements(By.CSS_SELECTOR, 'td')
            date = values[1].text
            if date == patient['target_date']:
                found_date = True
                values[0].click()
                break
        if not found_date:
            print('unable to find the target date')
            logger.error('unable to find the target date')
        id_bar = driver.find_element(By.CSS_SELECTOR, 'input[name="txtMRNo"]')
        id_bar.send_keys(patient['id'])
        time.sleep(300)
        return
    except Exception as e:
        print(f'execute {patient} failed, {e}')
        logger.exception(f'execute {patient} failed, {e}')
        return
def on_open_station(self, station_info):
    ''' get apartments count '''
    try:
        priority = get_num_of_apartment(self.driver)
        mongo.update_priority_of_station(station_info.get('_id'), priority)
    except NoSuchElementException:
        logger.info('[{}] [UrlCrawler] Unable to get apartment count'.format(self.city))
        raise
    except Exception as e:
        logger.error('[{}] [UrlCrawler] [on_open_station] err'.format(self.city))
        logger.exception(e)
        mongo.report_unexpected_error_url_crawler(e)
def monitor_stream(): """ Monitor the subreddit for new actions and parse them when they come in. Will restart upon encountering an error. """ global reddit, subreddit while True: try: logger.info("Connecting to Reddit...") reddit = praw.Reddit(**config_loader.REDDIT["auth"]) subreddit = reddit.subreddit(config_loader.REDDIT["subreddit"]) _get_moderators() logger.info("Starting mod log stream...") for mod_action in subreddit.mod.stream.log(): parse_mod_action(mod_action) except Exception: delay_time = 30 logger.exception(f"Encountered an unexpected error, restarting in {delay_time} seconds...") time.sleep(delay_time)
def monitor_stream(): """ Monitor the subreddit for new comments and parse them when they come in. Will restart upon encountering an error. """ global reddit, subreddit while True: try: logger.info("Connecting to Reddit...") reddit = praw.Reddit(**config_loader.REDDIT["auth"]) subreddit = reddit.subreddit(config_loader.REDDIT["subreddit"]) logger.info("Starting comment stream...") for comment in subreddit.stream.comments(skip_existing=False): process_comment(comment) except Exception: delay_time = 30 logger.exception( f"Encountered an unexpected error, restarting in {delay_time} seconds..." ) time.sleep(delay_time)
def start_fill_missing(self, apartment):
    ''' fill in missing info '''
    try:
        logger.info('[{}] [DetailCrawler] Start fill in missing info'.format(apartment.get('city')))
        self.get(apartment.get('house_url'))
        logger.info('[{}] [DetailCrawler] Url opened'.format(apartment.get('city')))
        info = get_info_of_single_url(self.driver, apartment.get('house_url'))
        logger.info('[{}] [DetailCrawler] Data get'.format(apartment.get('city')))
        mongo.update_missing_info(apartment, info)
        sleep(2)
    except ApartmentExpiredException:
        logger.info('[{}] [DetailCrawler] Url expired'.format(apartment.get('city')))
        mongo.update_missing_info(apartment, {'expired': True})
    except NoSuchElementException:
        logger.info('[{}] [DetailCrawler] Elm not found'.format(apartment.get('city')))
    except (TimeoutException, WebDriverException, InvalidSessionIdException):
        logger.info('[{}] [DetailCrawler] Session timeout'.format(apartment.get('city')))
        self.renew_driver()
    except TooManyTimesException:
        pass
    except Exception as e:
        logger.error('[{}] [DetailCrawler] [start_fill_missing] err'.format(apartment.get('city')))
        logger.exception(e)
    finally:
        self.quit()
def main():
    current_offset = 0
    while True:
        processed_posts = migrate_posts(current_offset)
        current_offset += processed_posts
        if processed_posts < 1000:
            break
        if current_offset % 1000 == 0:
            logger.info(f"Migrated {current_offset} posts total")
    current_datetime = datetime.fromisoformat("2020-05-12 04:00:00.000")
    now = datetime.utcnow()
    while current_datetime <= now:
        try:
            migrate_snapshots(current_datetime.date(), current_datetime.hour)
        except Exception:
            logger.exception(f"Failed to migrate {current_datetime.date()} - {current_datetime.hour}")
        current_datetime += timedelta(hours=1)
        if current_datetime.hour == 0:
            logger.info(f"Finished migrating {current_datetime.date()}")
def send(self, data, dtype='str', suffix=''):
    """Send the data (as send_string) to the server and return the response; return None on error."""
    if dtype == 'json':
        send_string = json.dumps(data) + suffix
    else:
        send_string = data + suffix
    self.connect()
    if self.connected:
        try:
            self._sock.send(send_string.encode())
            logger.debug('TCPClient Send {0}'.format(send_string))
        except socket.error as e:
            logger.exception(e)
        try:
            rec = self._sock.recv(self.max_receive).decode()
            if suffix:
                rec = rec[:-len(suffix)]
            logger.debug('TCPClient received {0}'.format(rec))
            return rec
        except socket.error as e:
            logger.exception(e)
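# Hypothetical usage sketch for TCPClient.send(); the host, port, and payload are
# placeholders, and the TCPClient(domain, port) constructor is assumed rather than
# shown in this snippet.
client = TCPClient('127.0.0.1', 9000)
reply = client.send({'action': 'ping'}, dtype='json', suffix='\n')
if reply is None:
    logger.warning('No response from the server')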
def save_post_and_comments(reddit_submission: Submission):
    """
    Saves a single reddit post and its comments to the database.
    """
    post_name = reddit_submission.permalink
    # Ensure post is in the database first.
    post_service.add_post(reddit_submission)
    logger.info(f"Loading {reddit_submission.num_comments} comments on {post_name}")
    # Load all comments
    retry_count = 0
    while True:
        try:
            retry_count += 1
            if retry_count > 3:
                logger.info(f"Unable to load all comments for {post_name}")
                return
            reddit_submission.comments.replace_more(limit=None)
            break
        except Exception:
            logger.exception("Handling replace_more exception")
            time.sleep(5)
    logger.info(f"Processing comments on {post_name}")
    index = -1
    for index, reddit_comment in enumerate(reddit_submission.comments.list()):
        # Since all comments will reference a parent if it exists, add all parent comments first.
        logger.debug(f"Saving parent comments of {reddit_comment.id}")
        comment_service.add_comment_parent_tree(reddit, reddit_comment)
        logger.debug(f"Saving comment {reddit_comment.id}")
        comment_service.add_comment(reddit_comment)
        if (index + 1) % 500 == 0:
            logger.info(f"Completed {index + 1} comments on {post_name}")
    logger.info(f"Finished processing {post_name}, total {index + 1} comments")
async def on_command_error(ctx: commands.Context, exception):
    # When a command fails to execute
    await ctx.send(f"Error: {exception}", reference=ctx.message)
    logger.exception("Command Error", exc_info=exception)
async def announcement_error(self):
    logger.exception("Announcement Error")