def _parseDataDetail(self, url):
    content = ''
    status_code, response = self.get(url=url, get_params={})
    if status_code == 200:
        system_log.debug('{} runCrawl success [{}] {}'.format(
            self._website, status_code, url))
        try:
            soup_detail = BeautifulSoup(response, 'lxml')
            # Join every <p> under <article><section> into one string.
            content = '<br />'.join([
                x.get_text(separator=' ', strip=True).strip()
                for x in soup_detail.find(name='article').find(
                    name='section').find_all(name='p')
            ])
        except Exception as ex:
            system_log.error('{} parse detail failed {} {}'.format(
                self._website, url, ex))
    else:
        system_log.error('{} runCrawl failed [{}] {}'.format(
            self._website, status_code, url))
    return content
def _run(self, essence_level=4, limit=16, after=0):
    url = self._url
    post_data = {
        'essence_level': essence_level,
        'limit': limit,
        'source': 'pc',
        'token': '',
        'timestamp': int(time.time() * 1000),
    }
    if after > 0:
        post_data['after'] = after
    status_code, response = self.post(url=url,
                                      post_data=post_data,
                                      headers=self._headers)
    if status_code == 200:
        system_log.debug('{} runCrawl success [{}] {}'.format(
            self._website, status_code, url))
        return self.parseData(response, next=True)
    else:
        system_log.error('{} runCrawl failed [{}] {}'.format(
            self._website, status_code, url))
def runNotify(self,
              head_kws,
              title,
              content,
              origin='',
              jump_url='',
              news_time=0,
              **kw):
    now_datetime = datetime.datetime.now()
    # Never notify on weekends.
    if now_datetime.weekday() >= 5:
        return
    # Only notify inside the configured hours (default 9:00-15:59).
    notify_start = 9
    notify_end = 15
    try:
        notify_start = env.env_conf['trigger_notify_period']['value'][0]
        notify_end = env.env_conf['trigger_notify_period']['value'][1]
    except Exception as ex:
        system_log.error(
            "get env_conf trigger_notify_period failed {}".format(
                env.env_conf))
    if int(now_datetime.hour) < notify_start or int(
            now_datetime.hour) > notify_end:
        return
    # Drop messages older than the configured expiry window.
    now_time = time.time()
    expired_seconds = 3600
    try:
        if int(env.env_conf['trigger_msg_expired']['value']) > 0:
            expired_seconds = int(
                env.env_conf['trigger_msg_expired']['value'])
    except Exception as ex:
        system_log.error(
            "get env_conf trigger_msg_expired failed {}".format(
                env.env_conf))
    if (now_time - int(news_time)) > expired_seconds:
        system_log.debug(
            'runNotify msg dropped news_time:{} now_time:{}'.format(
                news_time, now_time))
        return False
    timestr = datetime.datetime.fromtimestamp(news_time).strftime(
        '%m-%d %H:%M:%S')
    # Fan the message out to every registered notifier.
    for _, notifier in self._notifiers.items():
        notifier.notify(head_kws=head_kws,
                        title=title,
                        content=content,
                        origin=origin,
                        jump_url=jump_url,
                        timestr=timestr)
def mainEnvWorker():
    rs = item_data_store.getRunningStatus()
    if 'run_switch' in rs and int(rs['run_switch']) == -1:
        return False
    try:
        env.setWebsites(item_data_store.getWebsites())
        env.setEventKeywords(item_data_store.getEventKeywords())
    except Exception as ex:
        system_log.error('refresh env error: {} {}'.format(
            ex, str(traceback.format_exc())))
        #raise
    system_log.debug('env crawler status: {}'.format(env.crawler_status))
    # Refresh the proxy when it is missing or has expired.
    if env.proxies is None or int(time.time()) >= env.proxies['expire_time']:
        env.setProxies(fetchProxiesUrl())
    data = [
        ['start_time', int(env.start_time)],
        ['env', json.dumps(env.env_conf)],
        ['event_keywords', json.dumps(env.event_keywords)],
        ['websites', json.dumps(env.websites)],
        ['crawler_status', json.dumps(env.crawler_status)],
    ]
    item_data_store.saveRunningStatus(data)
    return True
def cleanLog():
    '''Remove log files older than the configured retention period.'''
    try:
        data_keep_days = int(env.env_conf['data_keep_days']['value'])
        system_log.info('start clean log over {} days'.format(data_keep_days))
        expire_time = datetime.date.today() + datetime.timedelta(
            days=-data_keep_days)
        expire_time = int(time.mktime(expire_time.timetuple()))
        for root, dirs, files in os.walk(env.log_dir):
            for file in files:
                # Only touch rotated log files (*.log*).
                if not re.fullmatch(r'.*\.log.*', file, flags=0):
                    continue
                file_path = os.path.join(root, file)
                file_update_time = os.path.getmtime(file_path)
                if file_update_time < expire_time:
                    system_log.debug('remove log file: {}'.format(file_path))
                    os.remove(file_path)
        system_log.info(
            'clean log finished, keep days: {}'.format(data_keep_days))
    except Exception as ex:
        system_log.error('clean log error: {} {}'.format(
            ex, str(traceback.format_exc())))
        raise
def _run(self, url):
    status_code, response = self.post(url=url, post_data={})
    if status_code == 200:
        system_log.debug('{} runCrawl success [{}] {}'.format(
            self._website, status_code, url))
        self.parseData(response)
    else:
        system_log.error('{} runCrawl failed [{}] {}'.format(
            self._website, status_code, url))
def _run(self, page):
    url = self._url.format(page)
    status_code, response = self.get(url=url, get_params={})
    if status_code == 200:
        system_log.debug('{} runCrawl success [{}] {}'.format(
            self._website, status_code, url))
        self.parseData(response)
    else:
        system_log.error('{} runCrawl failed [{}] {}'.format(
            self._website, status_code, url))
def _run(self, page):
    time_str = str(int(time.time() * 1000))
    url = self._url.format(time_str, page, self._pagesize, time_str)
    status_code, response = self.get(url=url, get_params={})
    if status_code == 200:
        system_log.debug('{} runCrawl success [{}] {}'.format(
            self._website, status_code, url))
        self.parseData(response)
    else:
        system_log.error('{} runCrawl failed [{}] {}'.format(
            self._website, status_code, url))
def threadMonitorWorker():
    system_log.info('monitor thread [{}] started'.format(
        threading.current_thread().name))
    while True:
        try:
            try:
                msg = env.monitor_task_queue.get()
                system_log.debug('[' + threading.current_thread().name +
                                 ']: {}'.format(msg))
                #time.sleep(1)
            except Exception as ex:
                system_log.error('get monitor task queue error:{}'.format(ex))
                #raise
            finally:
                pass
                #env.monitor_task_queue.task_done()
        except Exception as ex:
            system_log.error(
                'operation monitor task queue error:{}'.format(ex))
def _parseDataDetail(self, url):
    timestr = ''
    content = ''
    status_code, response = self.get(url=url, get_params={})
    if status_code == 200:
        system_log.debug('{} runCrawl success [{}] {}'.format(
            self._website, status_code, url))
        try:
            soup_detail = BeautifulSoup(response, 'lxml')
            content = soup_detail.find(class_='news_txt').get_text(
                separator='<br />', strip=True).strip()
            # The publish time is the first string in the second <p> of news_about.
            timestr = list(
                soup_detail.find(class_='news_about').find_all(
                    name='p')[1].stripped_strings)[0].strip()
        except Exception as ex:
            system_log.error('{} parse detail failed {} {}'.format(
                self._website, url, ex))
    else:
        system_log.error('{} runCrawl failed [{}] {}'.format(
            self._website, status_code, url))
    return timestr, content
def _parseDataDetail(self, url):
    timestr = None
    content = ''
    status_code, response = self.get(url=url,
                                     get_params={},
                                     headers=self._headers_detail)
    if status_code == 200:
        system_log.debug('{} runCrawl success [{}] {}'.format(
            self._website, status_code, url))
        try:
            soup_detail = BeautifulSoup(response, 'lxml')
            content = '<br />'.join([
                x.get_text(separator=' ', strip=True).strip()
                for x in soup_detail.find(
                    class_='pages_content').find_all(name='p')
            ])
            timestr = list(
                soup_detail.find(class_='pages-date').stripped_strings
            )[0].strip()
        except Exception as ex:
            system_log.error('{} parse detail failed {} {}'.format(
                self._website, url, ex))
    else:
        system_log.error('{} runCrawl failed [{}] {}'.format(
            self._website, status_code, url))
    return timestr, content
def _run(self, page):
    url = self._url
    post_data = {
        'pageNo': page,
        'pageSize': self._pagesize,
        'searchTypes': '11,',
        'market': '',
        'industry': '',
        'stockCode': '',
    }
    status_code, response = self.post(url=url, post_data=post_data)
    if status_code == 200:
        system_log.debug('{} runCrawl success [{}] {}'.format(
            self._website, status_code, url))
        self.parseData(response)
    else:
        system_log.error('{} runCrawl failed [{}] {}'.format(
            self._website, status_code, url))
def fetchProxiesUrl():
    # Shanghai pool (city=310100); for the nationwide pool use city=0:
    # "http://x.fanqieip.com/ggg?getType=4&qty=1&port=1&time=5&city=0&format=2&ss=1%2C2%2C3%2C4&dt=1&css="
    fanqie_ip_api_url = "http://x.fanqieip.com/ggg?getType=4&qty=1&port=1&time=5&city=310100&format=2&ss=1%2C2%2C3%2C4&dt=1&css="
    rtn = None
    try:
        session = requests.Session()
        response = session.get(fanqie_ip_api_url, timeout=10)
        response.encoding = response.apparent_encoding
        tmp = json.loads(response.text)
        if "code" in tmp and tmp['code'] == 0 and "data" in tmp:
            q_ip = tmp['data'][0]['ip']
            q_port = tmp['data'][0]['port']
            # Treat the proxy as expired 20 seconds before the provider's own expire_time.
            q_expire_time = int(
                time.mktime(
                    time.strptime(tmp['data'][0]['expire_time'],
                                  '%Y-%m-%d %H:%M:%S'))) - 20
            rtn = {
                'proxies_url': "http://" + str(q_ip) + ":" + str(q_port),
                'expire_time': q_expire_time
            }
            system_log.debug('fetchProxiesUrl [{}] success: {}'.format(
                fanqie_ip_api_url, rtn))
    except Exception as ex:
        system_log.error('fetchProxiesUrl [{}] failed: {}'.format(
            fanqie_ip_api_url, ex))
        return None
    return rtn
def threadNotifyWorker():
    system_log.info('notify thread [{}] started'.format(
        threading.current_thread().name))
    while True:
        try:
            try:
                msg = env.notify_task_queue.get(timeout=1)
                system_log.debug('[' + threading.current_thread().name +
                                 ']: {}'.format(msg))
                msg = json.loads(msg)
                event_notify.runNotify(**msg)
                #time.sleep(1)
            except queue.Empty as ex:
                continue
            except Exception as ex:
                system_log.error('get notify task queue error: {} {}'.format(
                    ex, str(traceback.format_exc())))
                #raise
            finally:
                pass
                #env.notify_task_queue.task_done()
        except Exception as ex:
            system_log.error('operation notify task queue error: {} {}'.format(
                ex, str(traceback.format_exc())))
            #raise
        # Heartbeat for the monitor thread.
        monitor_msg = {
            'type': 'alive',
            'thread_type': 'notifier',
            'thread_name': threading.current_thread().name,
            'time': time.time(),
        }
        env.monitor_task_queue.put(json.dumps(monitor_msg))
def _run(self, page):
    url = self._url.format(self._pagesize, page, str(int(time.time() * 1000)))
    proxies = None
    if env.proxies is not None:
        proxies = {
            "http": env.proxies['proxies_url'],
            "https": env.proxies['proxies_url'],
        }
    status_code, response = self.get(url=url, get_params={}, proxies=proxies)
    if status_code == 200:
        system_log.debug(
            '{} runCrawl success [{}] {} use proxy: {}'.format(
                self._website, status_code, url, proxies))
        self.parseData(response)
    else:
        system_log.error('{} runCrawl failed [{}] {} use proxy: {}'.format(
            self._website, status_code, url, proxies))
def threadCrawlWorker(**kw):
    system_log.info('crawl thread [{}] started'.format(
        threading.current_thread().name))
    crawlers = {}
    for crawl_website, crawl_website_conf in kw.items():
        freq = crawl_website_conf['freq']
        trigger = crawl_website_conf['trigger']
        trigger_part = crawl_website_conf['trigger_part']
        if freq is None or len(freq) <= 0:
            # Fall back to the global default crawl frequency.
            freq = [10, 15]
            try:
                if len(env.env_conf['default_crawl_freq']['value']) > 0:
                    freq = env.env_conf['default_crawl_freq']['value']
            except Exception as ex:
                system_log.error(
                    "get env_conf default_crawl_freq failed {}".format(
                        env.env_conf))
        env.registCrawler(crawl_website,
                          freq=freq,
                          trigger=trigger,
                          trigger_part=trigger_part)
        crawlers[crawl_website] = eval(crawl_website_conf['class'])()
    while True:
        for crawl_website, cls in crawlers.items():
            if env.crawler_status[crawl_website]['last_run'] <= 0:
                system_log.debug('{} run last_run < 0'.format(crawl_website))
            else:
                # Respect the per-site minimum crawl interval.
                if time.time() - env.crawler_status[crawl_website][
                        'last_run'] <= env.crawler_status[crawl_website][
                            'freq'][0]:
                    system_log.debug('{} run too fast'.format(crawl_website))
                    continue
            try:
                cls.run()
                env.crawler_status[crawl_website]['last_run'] = int(
                    time.time())
                env.crawler_status[crawl_website]['run_counts'] += 1
            except Exception as ex:
                system_log.error('crawl run error: {} {}'.format(
                    ex, str(traceback.format_exc())))
                continue
        time.sleep(random.randint(1, 3))
        # Heartbeat for the monitor thread.
        monitor_msg = {
            'type': 'alive',
            'thread_type': 'crawlers',
            'thread_name': threading.current_thread().name,
            'time': time.time(),
        }
        env.monitor_task_queue.put(json.dumps(monitor_msg))
def notify(self,
           head_kws=[],
           title='',
           content='',
           origin='',
           jump_url='',
           timestr='',
           send_count=1):
    timestamp, sign = self._generateSign()
    get_params = {
        'access_token': self._access_token,
        'timestamp': timestamp,
        'sign': sign,
    }
    url = self._url + '?' + urlencode(get_params)
    if origin is not None and origin != '':
        origin = '【' + origin + '】'
    else:
        origin = 'view details'
    # Build the DingTalk markdown body: title, content, time and jump link.
    text = "### {}\n > **{}**".format(title, content)
    if timestr is not None and timestr != "":
        text = text + "\n\n{} ".format(timestr)
    else:
        text = text + "\n\n "
    if jump_url is not None and jump_url != "":
        text = text + "[{}]({})".format(origin, jump_url)
    head_title = title
    if len(head_kws) > 0:
        head_title = '【' + ','.join(head_kws) + '】related news'
    text = "## " + head_title + "\n" + text
    data = {
        "msgtype": "markdown",
        "markdown": {
            "title": head_title,
            "text": text,
        },
        "at": {
            "atMobiles": [],
            "isAtAll": False
        }
    }
    response = requests.post(url=url,
                             data=json.dumps(data),
                             headers={'Content-Type': 'application/json'})
    response.encoding = response.apparent_encoding
    if response.status_code == 200:
        res = json.loads(response.text)
        if res['errcode'] == 0:
            system_log.debug('dingding notify success {}'.format(title))
            return True
        else:
            # errcode 130101: the robot is rate limited, back off and retry.
            if res['errcode'] == 130101 and send_count <= 8:
                time.sleep(10)
                return self.notify(head_kws=head_kws,
                                   title=title,
                                   content=content,
                                   origin=origin,
                                   jump_url=jump_url,
                                   timestr=timestr,
                                   send_count=send_count + 1)
            system_log.warning('dingding notify failed [{}] {}'.format(
                res['errcode'], res['errmsg']))
            return False
    system_log.debug('dingding notify failed {}'.format(response.status_code))
    return False
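# _generateSign is called above but not shown in this section. The sketch below
# is a minimal guess at what it presumably does, following DingTalk's documented
# "secured webhook" signing scheme (HMAC-SHA256 over "<millisecond timestamp>\n<secret>",
# base64-encoded, then URL-encoded). The attribute name self._secret is an
# assumption; the project's real implementation may differ.
def _generateSign(self):
    import base64
    import hashlib
    import hmac
    from urllib.parse import quote_plus

    timestamp = str(round(time.time() * 1000))
    # Sign "<timestamp>\n<secret>" with the robot secret (assumed to live in self._secret).
    string_to_sign = '{}\n{}'.format(timestamp, self._secret)
    hmac_code = hmac.new(self._secret.encode('utf-8'),
                         string_to_sign.encode('utf-8'),
                         digestmod=hashlib.sha256).digest()
    sign = quote_plus(base64.b64encode(hmac_code))
    return timestamp, sign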
def threadTriggerWorker():
    system_log.info('trigger thread [{}] started'.format(
        threading.current_thread().name))
    item_data_store2 = ItemDataStore()
    while True:
        try:
            try:
                msg = env.trigger_task_queue.get(timeout=1)
                system_log.debug('[' + threading.current_thread().name +
                                 ']: {}'.format(msg))
                msg_data = json.loads(msg)
                website = msg_data['website']
                # Skip messages from websites that do not have triggering enabled.
                if not env.crawler_status[website]['trigger']:
                    continue
                trigger_flag = False
                if len(env.crawler_status[website]['trigger_part']) > 0:
                    # Match only the configured parts of the message against
                    # the event keywords.
                    tp = [
                        msg_data[x]
                        for x in env.crawler_status[website]['trigger_part']
                    ]
                    head_kws = event_trigger.runTrigger(*tp, website=website)
                    if len(head_kws) > 0:
                        trigger_flag = True
                else:
                    head_kws = []
                    trigger_flag = True
                if trigger_flag:
                    origin = website
                    if website in env.websites.keys():
                        origin = env.websites[website]['website_name']
                    notify_msg = {
                        'head_kws': head_kws,
                        'website': website,
                        'pid': msg_data['pid'],
                        'title': msg_data['title'],
                        'content': msg_data['content'],
                        'origin': origin,
                        'jump_url': msg_data['url'],
                        'news_time': msg_data['news_time'],
                    }
                    env.notify_task_queue.put(json.dumps(notify_msg))
                    # columns: website, pid, trigger_words, title, content,
                    #          origin, jump_url, news_time, create_time
                    item_data_store2.saveTriggerMsg([
                        notify_msg['website'], notify_msg['pid'],
                        ','.join(notify_msg['head_kws']), notify_msg['title'],
                        notify_msg['content'], notify_msg['origin'],
                        notify_msg['jump_url'], notify_msg['news_time'],
                        int(time.time())
                    ])
                #time.sleep(1)
            except queue.Empty as ex:
                continue
            except Exception as ex:
                system_log.error('get trigger task queue error: {} {}'.format(
                    ex, str(traceback.format_exc())))
                #raise
            finally:
                pass
                #env.trigger_task_queue.task_done()
        except Exception as ex:
            system_log.error(
                'operation trigger task queue error: {} {}'.format(
                    ex, str(traceback.format_exc())))
            #raise
        # Heartbeat for the monitor thread.
        monitor_msg = {
            'type': 'alive',
            'thread_type': 'trigger',
            'thread_name': threading.current_thread().name,
            'time': time.time(),
        }
        env.monitor_task_queue.put(json.dumps(monitor_msg))