Example #1
File: crawl_cs.py Project: arcsin4/spear
    def _parseDataDetail(self, url):

        content = ''
        status_code, response = self.get(url=url, get_params={})

        if status_code == 200:
            system_log.debug('{} runCrawl success [{}] {}'.format(
                self._website, status_code, url))

            try:
                soup_detail = BeautifulSoup(response, 'lxml')

                content = '<br />'.join([
                    x.get_text(separator=' ', strip=True).strip()
                    for x in soup_detail.find(name='article').find(
                        name='section').find_all(name='p')
                ])

            except Exception:
                # Parsing failed (layout change or missing nodes); leave content empty.
                pass
        else:
            system_log.error('{} runCrawl failed [{}] {}'.format(
                self._website, status_code, url))

        return content
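
A standalone sketch of the extraction above, run against inline HTML instead of a live page (the markup below is made up for illustration):

from bs4 import BeautifulSoup

html = '''
<article><section>
  <p>First paragraph.</p>
  <p>Second <b>paragraph</b>.</p>
</section></article>
'''

# 'html.parser' avoids the lxml dependency; the crawler itself uses 'lxml'.
soup = BeautifulSoup(html, 'html.parser')
content = '<br />'.join(
    p.get_text(separator=' ', strip=True)
    for p in soup.find('article').find('section').find_all('p')
)
print(content)  # First paragraph.<br />Second paragraph .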
Example #2
    def _run(self, essence_level=4, limit=16, after=0):
        url = self._url

        post_data = {
            'essence_level': essence_level,
            'limit': limit,
            'source': 'pc',
            'token': '',
            'timestamp': int(time.time() * 1000),
        }

        if after > 0:
            post_data['after'] = after

        status_code, response = self.post(url=url,
                                          post_data=post_data,
                                          headers=self._headers)

        if status_code == 200:
            system_log.debug('{} runCrawl success [{}] {}'.format(
                self._website, status_code, url))

            return self.parseData(response, next=True)
        else:
            system_log.error('{} runCrawl failed [{}] {}'.format(
                self._website, status_code, url))
Example #3
    def runNotify(self,
                  head_kws,
                  title,
                  content,
                  origin='',
                  jump_url='',
                  news_time=0,
                  **kw):

        now_datetime = datetime.datetime.now()

        # Skip weekends (Monday == 0 ... Sunday == 6).
        if now_datetime.weekday() >= 5:
            return

        notify_start = 9
        notify_end = 15

        try:
            notify_start = env.env_conf['trigger_notify_period']['value'][0]
            notify_end = env.env_conf['trigger_notify_period']['value'][1]
        except Exception:
            system_log.error(
                "get env_conf trigger_notify_period failed {}".format(
                    env.env_conf))

        if int(now_datetime.hour) < notify_start or int(
                now_datetime.hour) > notify_end:
            return

        now_time = time.time()

        expired_seconds = 3600

        try:
            if int(env.env_conf['trigger_msg_expired']['value']) > 0:
                expired_seconds = int(
                    env.env_conf['trigger_msg_expired']['value'])
        except Exception:
            system_log.error(
                "get env_conf trigger_msg_expired failed {}".format(
                    env.env_conf))

        if (now_time - int(news_time)) > expired_seconds:
            system_log.debug(
                'runNotify msg dropped news_time:{} now_time:{}'.format(
                    news_time, now_time))
            return False

        timestr = datetime.datetime.fromtimestamp(news_time).strftime(
            '%m-%d %H:%M:%S')

        for _, notifier in self._notifiers.items():
            notifier.notify(head_kws=head_kws,
                            title=title,
                            content=content,
                            origin=origin,
                            jump_url=jump_url,
                            timestr=timestr)
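
The notifier objects iterated at the end are assumed to expose a notify(**kw) method; their classes are not shown in these excerpts. A minimal, hypothetical stand-in for local testing might look like:

class PrintNotifier:
    '''Hypothetical stand-in; the real notifier classes are not part of these examples.'''

    def notify(self, head_kws, title, content, origin='', jump_url='', timestr=''):
        # Print instead of pushing to a real channel.
        print('[{}] {} | {} ({})'.format(timestr, ','.join(head_kws), title, origin))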
Example #4
File: app.py Project: arcsin4/spear
def mainEnvWorker():

    rs = item_data_store.getRunningStatus()
    if 'run_switch' in rs and int(rs['run_switch']) == -1:
        return False

    try:
        env.setWebsites(item_data_store.getWebsites())
        env.setEventKeywords(item_data_store.getEventKeywords())
    except Exception as ex:
        system_log.error('refresh env error: {} {}'.format(
            ex, str(traceback.format_exc())))
        # Intentionally not re-raised: a failed refresh keeps the previous env.

    system_log.debug('env crawler status: {}'.format(env.crawler_status))

    if env.proxies is None or int(time.time()) >= env.proxies['expire_time']:
        env.setProxies(fetchProxiesUrl())

    data = [
        ['start_time', int(env.start_time)],
        ['env', json.dumps(env.env_conf)],
        ['event_keywords', json.dumps(env.event_keywords)],
        ['websites', json.dumps(env.websites)],
        ['crawler_status', json.dumps(env.crawler_status)],
    ]

    item_data_store.saveRunningStatus(data)

    return True
Example #5
def cleanLog():
    '''Clean up historical log files.'''

    try:
        data_keep_days = int(env.env_conf['data_keep_days']['value'])
        system_log.info('start clean log over {} days'.format(data_keep_days))

        expire_time = datetime.date.today() + datetime.timedelta(
            days=-data_keep_days)
        expire_time = int(time.mktime(expire_time.timetuple()))

        for root, dirs, files in os.walk(env.log_dir):
            for file in files:
                # Only touch log files (e.g. app.log, app.log.2023-01-01).
                if not re.fullmatch(r'.*\.log.*', file):
                    continue

                file_path = os.path.join(root, file)
                file_update_time = os.path.getmtime(file_path)

                if file_update_time < expire_time:
                    system_log.debug('remove log file: {}'.format(file_path))
                    os.remove(file_path)

        system_log.info('clean log over {} days finished'.format(data_keep_days))

    except Exception as ex:
        system_log.error('clean log error: {} {}'.format(
            ex, str(traceback.format_exc())))
        raise
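
The filename filter above matches plain and rotated logs alike; a quick self-contained check:

import re

for name in ['app.log', 'app.log.2023-01-01', 'notes.txt']:
    # fullmatch requires the whole name to match the pattern.
    print(name, bool(re.fullmatch(r'.*\.log.*', name)))
# app.log True / app.log.2023-01-01 True / notes.txt False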
Example #6
    def _run(self, url):

        status_code, response = self.post(url=url, post_data={})

        if status_code == 200:
            system_log.debug('{} runCrawl success [{}] {}'.format(self._website, status_code, url))

            self.parseData(response)
        else:
            system_log.error('{} runCrawl failed [{}] {}'.format(self._website, status_code, url))
Example #7
File: crawl_gov.py Project: arcsin4/spear
    def _run(self, page):
        url = self._url.format(page)

        status_code, response = self.get(url=url, get_params={})

        if status_code == 200:
            system_log.debug('{} runCrawl success [{}] {}'.format(self._website, status_code, url))

            self.parseData(response)
        else:
            system_log.error('{} runCrawl failed [{}] {}'.format(self._website, status_code, url))
Example #8
    def _run(self, page):
        time_str = str(int(time.time() * 1000))
        url = self._url.format(time_str, page, self._pagesize, time_str)

        status_code, response = self.get(url=url, get_params={})

        if status_code == 200:
            system_log.debug('{} runCrawl success [{}] {}'.format(
                self._website, status_code, url))

            self.parseData(response)
        else:
            system_log.error('{} runCrawl failed [{}] {}'.format(
                self._website, status_code, url))
Example #9
File: app.py Project: arcsin4/spear
def threadMonitorWorker():
    system_log.info('monitor thread [{}] started'.format(threading.current_thread().name))

    while True:
        try:
            try:
                msg = env.monitor_task_queue.get()
                system_log.debug('[' + threading.current_thread().name +
                                 ']: {}'.format(msg))
            except Exception as ex:
                system_log.error('get monitor task queue error:{}'.format(ex))
                # Intentionally not re-raised: keep the monitor loop alive.
        except Exception as ex:
            system_log.error(
                'operation monitor task queue error:{}'.format(ex))
Example #10
def cleanData():
    '''Clean up historical data.'''

    try:
        data_keep_days = int(env.env_conf['data_keep_days']['value'])
        system_log.info('start clean data over {} days'.format(data_keep_days))

        expire_time = datetime.date.today() + datetime.timedelta(
            days=-data_keep_days)
        expire_time = int(time.mktime(expire_time.timetuple()))

        item_data_store.cleanData(expire_time=expire_time)

        system_log.info('clean data over {} days finished'.format(data_keep_days))

    except Exception as ex:
        system_log.error('clean data error: {} {}'.format(
            ex, str(traceback.format_exc())))
        raise
Example #11
    def _parseDataDetail(self, url):

        timestr = ''
        content = ''
        status_code, response = self.get(url=url, get_params={})

        if status_code == 200:
            system_log.debug('{} runCrawl success [{}] {}'.format(self._website, status_code, url))

            try:
                soup_detail = BeautifulSoup(response, 'lxml')

                content = soup_detail.find(class_='news_txt').get_text(separator='<br />', strip=True).strip()

                timestr = list(soup_detail.find(class_='news_about').find_all(name='p')[1].stripped_strings)[0].strip()

            except Exception:
                # Parsing failed; return empty timestr and content.
                pass
        else:
            system_log.error('{} runCrawl failed [{}] {}'.format(self._website, status_code, url))

        return timestr, content
Example #12
File: crawl_gov.py Project: arcsin4/spear
    def _parseDataDetail(self, url):

        timestr = None
        content = ''
        status_code, response = self.get(url=url, get_params={}, headers=self._headers_detail)

        if status_code == 200:
            system_log.debug('{} runCrawl success [{}] {}'.format(self._website, status_code, url))

            try:
                soup_detail = BeautifulSoup(response, 'lxml')

                content = '<br />'.join([x.get_text(separator=' ', strip=True).strip() for x in soup_detail.find(class_='pages_content').find_all(name='p')])

                timestr = list(soup_detail.find(class_='pages-date').stripped_strings)[0].strip()

            except Exception:
                # Parsing failed; timestr stays None and content empty.
                pass
        else:
            system_log.error('{} runCrawl failed [{}] {}'.format(self._website, status_code, url))

        return timestr, content
Example #13
    def _run(self, page):
        url = self._url

        post_data = {
            'pageNo': page,
            'pageSize': self._pagesize,
            'searchTypes': '11,',
            'market': '',
            'industry': '',
            'stockCode': '',
        }

        status_code, response = self.post(url=url, post_data=post_data)

        if status_code == 200:
            system_log.debug('{} runCrawl success [{}] {}'.format(
                self._website, status_code, url))

            self.parseData(response)
        else:
            system_log.error('{} runCrawl failed [{}] {}'.format(
                self._website, status_code, url))
Example #14
File: app.py Project: arcsin4/spear
def fetchProxiesUrl():
    fanqie_ip_api_url = "http://x.fanqieip.com/ggg?getType=4&qty=1&port=1&time=5&city=310100&format=2&ss=1%2C2%2C3%2C4&dt=1&css="
    # Shanghai: "http://x.fanqieip.com/ggg?getType=4&qty=1&port=1&time=5&city=310100&format=2&ss=1%2C2%2C3%2C4&dt=1&css="
    # Nationwide: "http://x.fanqieip.com/ggg?getType=4&qty=1&port=1&time=5&city=0&format=2&ss=1%2C2%2C3%2C4&dt=1&css="

    rtn = None

    try:
        session = requests.Session()
        response = session.get(fanqie_ip_api_url, timeout=10)

        response.encoding = response.apparent_encoding

        tmp = json.loads(response.text)

        if "code" in tmp and tmp['code'] == 0 and "data" in tmp:
            q_ip = tmp['data'][0]['ip']
            q_port = tmp['data'][0]['port']
            q_expire_time = int(
                time.mktime(
                    time.strptime(tmp['data'][0]['expire_time'],
                                  '%Y-%m-%d %H:%M:%S'))) - 20

            rtn = {
                'proxies_url': "http://" + str(q_ip) + ":" + str(q_port),
                'expire_time': q_expire_time
            }
            system_log.debug('fetchProxiesUrl [{}] success: {}'.format(
                fanqie_ip_api_url, rtn))

    except Exception as ex:
        system_log.error('fetchProxiesUrl [{}] failed: {}'.format(
            fanqie_ip_api_url, ex))

        return None

    return rtn
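
The returned dict is consumed by the crawlers (see Example #17): the single proxy URL is applied to both schemes of a requests-style proxies mapping. With made-up values:

rtn = {'proxies_url': 'http://1.2.3.4:8080', 'expire_time': 1700000000}  # illustrative
proxies = {'http': rtn['proxies_url'], 'https': rtn['proxies_url']}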
Example #15
    def post(self, url, post_data, headers=None, proxies=None):
        if headers is None:
            headers = self._headers

        try:
            response = self._session.post(url,
                                          data=post_data,
                                          headers=headers,
                                          proxies=proxies,
                                          timeout=self._timeout)
        except Exception as ex:
            system_log.error('{} post [{}] failed: {}'.format(
                self._website, url, ex))

            return None, None

        response.encoding = response.apparent_encoding

        if response.status_code == 200:
            return response.status_code, response.text

        return response.status_code, None
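
A self-contained sketch of the same (status_code, text-or-None) contract without the class state; the session, headers, and timeout handling are simplified assumptions here:

import requests

def post(url, post_data, timeout=10):
    '''Standalone sketch of the helper above.'''
    try:
        response = requests.post(url, data=post_data, timeout=timeout)
    except Exception:
        # Network-level failure: no status code, no body.
        return None, None
    response.encoding = response.apparent_encoding
    if response.status_code == 200:
        return response.status_code, response.text
    return response.status_code, None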
Example #16
File: app.py Project: arcsin4/spear
def threadNotifyWorker():
    system_log.info('notify thread [{}] started'.format(threading.current_thread().name))

    while True:
        try:
            try:
                msg = env.notify_task_queue.get(timeout=1)
                system_log.debug('[' + threading.current_thread().name +
                                 ']: {}'.format(msg))

                msg = json.loads(msg)

                event_notify.runNotify(**msg)

            except queue.Empty:
                continue
            except Exception as ex:
                system_log.error('get notify task queue error: {} {}'.format(
                    ex, str(traceback.format_exc())))
                # Intentionally not re-raised: keep the notify loop alive.
        except Exception as ex:
            system_log.error('operation notify task queue error: {} {}'.format(
                ex, str(traceback.format_exc())))

        monitor_msg = {
            'type': 'alive',
            'thread_type': 'notifier',
            'thread_name': threading.current_thread().name,
            'time': time.time(),
        }
        env.monitor_task_queue.put(json.dumps(monitor_msg))
Example #17
    def _run(self, page):
        url = self._url.format(self._pagesize, page,
                               str(int(time.time() * 1000)))

        proxies = None
        if env.proxies is not None:
            proxies = {
                "http": env.proxies['proxies_url'],
                "https": env.proxies['proxies_url'],
            }

        status_code, response = self.get(url=url,
                                         get_params={},
                                         proxies=proxies)

        if status_code == 200:
            system_log.debug(
                '{} runCrawl success [{}] {} use proxy: {}'.format(
                    self._website, status_code, url, proxies))

            self.parseData(response)
        else:
            system_log.error('{} runCrawl failed [{}] {} use proxy: {}'.format(
                self._website, status_code, url, proxies))
Example #18
    def get(self, url, get_params={}, headers=None, proxies=None):
        if headers is None:
            headers = self._headers

        params = urlencode(get_params)
        url = url + '?' + params

        try:
            response = self._session.get(url,
                                         headers=headers,
                                         proxies=proxies,
                                         timeout=self._timeout)
        except Exception as ex:
            system_log.error('{} get [{}] failed: {}'.format(
                self._website, url, ex))

            return None, None

        response.encoding = response.apparent_encoding

        if response.status_code == 200:
            return response.status_code, response.text

        return response.status_code, None
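
Note that urlencode on the default empty dict produces an empty string, so the request URL ends in a bare '?' (servers generally tolerate this):

from urllib.parse import urlencode

print(urlencode({}))                        # '' -> url becomes 'https://host/path?'
print(urlencode({'page': 2, 'q': 'a b'}))   # 'page=2&q=a+b'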
Example #19
File: app.py Project: arcsin4/spear
def threadCrawlWorker(**kw):
    system_log.info('crawl thread [{}] started'.format(threading.current_thread().name))

    crawlers = {}
    for crawl_website, crawl_website_conf in kw.items():
        freq = crawl_website_conf['freq']
        trigger = crawl_website_conf['trigger']
        trigger_part = crawl_website_conf['trigger_part']

        if freq is None or len(freq) <= 0:
            freq = [10, 15]
            try:
                if len(env.env_conf['default_crawl_freq']['value']) > 0:
                    freq = env.env_conf['default_crawl_freq']['value']
            except Exception:
                system_log.error(
                    "get env_conf default_crawl_freq failed {}".format(
                        env.env_conf))

        env.registCrawler(crawl_website,
                          freq=freq,
                          trigger=trigger,
                          trigger_part=trigger_part)

        # Instantiate the crawler class named in the config (eval of config-supplied names).
        crawlers[crawl_website] = eval(crawl_website_conf['class'])()

    while True:

        for crawl_website, cls in crawlers.items():

            if env.crawler_status[crawl_website]['last_run'] <= 0:
                # No last_run recorded yet: fall through and run immediately.
                system_log.debug('{} first run (last_run <= 0)'.format(
                    crawl_website))
            else:
                if time.time() - env.crawler_status[crawl_website][
                        'last_run'] <= env.crawler_status[crawl_website][
                            'freq'][0]:
                    system_log.debug('{} ran too fast'.format(crawl_website))
                    continue

            try:
                cls.run()
                env.crawler_status[crawl_website]['last_run'] = int(
                    time.time())
                env.crawler_status[crawl_website]['run_counts'] += 1
            except Exception as ex:
                system_log.error('crawl run error: {} {}'.format(
                    ex, str(traceback.format_exc())))
                continue

        time.sleep(random.randint(1, 3))

        monitor_msg = {
            'type': 'alive',
            'thread_type': 'crawlers',
            'thread_name': threading.current_thread().name,
            'time': time.time(),
        }
        env.monitor_task_queue.put(json.dumps(monitor_msg))
Example #20
File: app.py Project: arcsin4/spear
def threadTriggerWorker():
    system_log.info('trigger thread [{}] started'.format(threading.current_thread().name))

    item_data_store2 = ItemDataStore()
    while True:
        try:
            try:
                msg = env.trigger_task_queue.get(timeout=1)
                system_log.debug('[' + threading.current_thread().name +
                                 ']: {}'.format(msg))

                msg_data = json.loads(msg)

                website = msg_data['website']

                # Skip messages from websites whose trigger is disabled.
                if not env.crawler_status[website]['trigger']:
                    continue

                trigger_flag = False

                if len(env.crawler_status[website]['trigger_part']) > 0:
                    tp = [
                        msg_data[x]
                        for x in env.crawler_status[website]['trigger_part']
                    ]
                    head_kws = event_trigger.runTrigger(*tp, website=website)

                    if len(head_kws) > 0:
                        trigger_flag = True
                else:
                    head_kws = []
                    trigger_flag = True

                if trigger_flag:
                    origin = website
                    if website in env.websites.keys():
                        origin = env.websites[website]['website_name']

                    notify_msg = {
                        'head_kws': head_kws,
                        'website': website,
                        'pid': msg_data['pid'],
                        'title': msg_data['title'],
                        'content': msg_data['content'],
                        'origin': origin,
                        'jump_url': msg_data['url'],
                        'news_time': msg_data['news_time'],
                    }
                    env.notify_task_queue.put(json.dumps(notify_msg))

                    # saveTriggerMsg columns: website, pid, trigger_words, title,
                    # content, origin, jump_url, news_time, create_time
                    item_data_store2.saveTriggerMsg([
                        notify_msg['website'], notify_msg['pid'],
                        ','.join(notify_msg['head_kws']), notify_msg['title'],
                        notify_msg['content'], notify_msg['origin'],
                        notify_msg['jump_url'], notify_msg['news_time'],
                        int(time.time())
                    ])

            except queue.Empty:
                continue
            except Exception as ex:
                system_log.error('get trigger task queue error: {} {}'.format(
                    ex, str(traceback.format_exc())))
                # Intentionally not re-raised: keep the trigger loop alive.
        except Exception as ex:
            system_log.error(
                'operation trigger task queue error: {} {}'.format(
                    ex, str(traceback.format_exc())))

        monitor_msg = {
            'type': 'alive',
            'thread_type': 'trigger',
            'thread_name': threading.current_thread().name,
            'time': time.time(),
        }
        env.monitor_task_queue.put(json.dumps(monitor_msg))
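
How trigger_part drives runTrigger above: the configured field names are plucked from the queue message and passed positionally. A sketch with illustrative values:

msg_data = {'title': 'Example headline', 'content': 'body text'}  # illustrative message
trigger_part = ['title', 'content']  # as stored in env.crawler_status[website]

tp = [msg_data[x] for x in trigger_part]
# event_trigger.runTrigger(*tp, website='example')  # the actual call in the worker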