def __init__(self, tab_urls):
        super(Collector, self).__init__()
        self._lock = threading.RLock()

        self._db = MongoDB()
        self._thread_stop = False
        self._urls = []
        self._null_times = 0
        self._read_pos = -1
        self._write_pos = -1
        self._tab_urls = tab_urls
        self._depth = int(
            tools.get_conf_value('config.conf', "collector", "depth"))
        self._max_size = int(
            tools.get_conf_value('config.conf', "collector", "max_size"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        # At startup, reset tasks that were left in DOING status back to TODO
        self._db.update(self._tab_urls, {'status': Constance.DOING},
                        {'status': Constance.TODO})

        self._finished_callback = None
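
Every example in this listing reads its settings through tools.get_conf_value(config_file, section, key). The helper itself is never shown in these snippets, so the following is only a minimal sketch of what such a function might look like, assuming a plain configparser-backed INI file rather than the project's actual implementation:

import configparser

def get_conf_value(config_file, section, key):
    # Hypothetical sketch: read a single value from an INI-style config file.
    # Callers wrap the result in int() when a numeric setting is expected.
    parser = configparser.ConfigParser()
    parser.read(config_file, encoding='utf-8')
    return parser.get(section, key)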
Example #2
    def __init__(self, tab_urls, depth, process_num = None):
        '''
        @summary:
        ---------
        @param tab_urls:
        @param depth:
        @param process_num: process number
        ---------
        @result:
        '''

        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False

        self._tab_worker_status = 'news:worker_status'
        self._worker_mark = LOCAL_HOST_IP + ('_%s'%process_num if process_num else '')
Example #3
    def __init__(self, collector, tab_urls):
        super(PaserControl, self).__init__()
        self._parsers = []
        self._collector = collector
        self._urlCount = int(tools.get_conf_value('config.conf', "parser", "url_count"))
        self._interval = int(tools.get_conf_value('config.conf', "parser", "sleep_time"))

        self._tab_urls = tab_urls
Example #4
    def __init__(self,
                 tab_urls,
                 tab_site='',
                 tab_content='',
                 parser_count=None,
                 parser_params={},
                 begin_callback=None,
                 end_callback=None,
                 content_unique_key='url',
                 delete_tab_urls=False):
        '''
        @summary:
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param parser_params : parameters passed to the parsers
        @param begin_callback:  callback invoked when the spider starts
        @param end_callback:    callback invoked when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_urls

        self._db = MongoDB()
        if delete_tab_urls: self._db.delete(tab_urls)

        self._db.set_unique_key(tab_urls, 'url')
        if tab_site: self._db.set_unique_key(tab_site, 'site_id')
        if tab_content:
            self._db.set_unique_key(tab_content, content_unique_key)

        # Create indexes to speed up queries
        self._db.set_ensure_index(tab_urls, 'depth')
        self._db.set_ensure_index(tab_urls, 'status')
        if tab_site: self._db.set_ensure_index(tab_site, 'read_status')
        if tab_content: self._db.set_ensure_index(tab_content, 'read_status')

        self._collector = Collector(tab_urls)
        self._parsers = []

        self._parser_params = parser_params

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')
Example #5
def monitor_proxies():
    redis_0 = RedisDB()
    config = os.path.join(os.path.dirname(__file__), '../config.conf')
    redis_key = tools.get_conf_value(config, 'redis', 'redis_key')
    redis_key2 = tools.get_conf_value(config, 'redis', 'redis_key2')
    sum = redis_0.count(redis_key)
    sum2 = redis_0.count(redis_key2)

    log.debug("douban当前redis库中剩余ip总数:%d" % sum)
    log.debug("weibo当前redis库中剩余ip总数:%d" % sum2)
Example #6
    def __init__(self,
                 tab_urls,
                 tab_site,
                 tab_content,
                 parser_count=None,
                 search_keyword1=[],
                 search_keyword2=[],
                 search_keyword3=[],
                 begin_callback=None,
                 end_callback=None,
                 content_unique_key=None):
        '''
        @summary:
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param search_keyword1: search keywords (list); all of them must appear
        @param search_keyword2: search keywords (list); at least one must appear
        @param search_keyword3: search keywords (list); none of them may appear
        @param begin_callback:  callback invoked when the spider starts
        @param end_callback:    callback invoked when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_urls

        self._db = MongoDB()
        self._db.set_unique_key(tab_urls, 'url')
        self._db.set_unique_key(tab_site, 'site_id')
        self._db.set_unique_key(
            tab_content,
            'url' if not content_unique_key else content_unique_key)

        self._collector = Collector(tab_urls)
        self._parsers = []

        self._search_keyword1 = search_keyword1
        self._search_keyword2 = search_keyword2
        self._search_keyword3 = search_keyword3

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')
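
The three search_keyword lists describe an include-all / include-any / exclude-all filter. The matching code is not part of this snippet; a hypothetical sketch of the rule exactly as the docstring states it:

def keywords_match(text, keyword1=(), keyword2=(), keyword3=()):
    # keyword1: every keyword must appear in text
    # keyword2: at least one keyword must appear (skipped when the list is empty)
    # keyword3: none of the keywords may appear
    return (all(k in text for k in keyword1)
            and (not keyword2 or any(k in text for k in keyword2))
            and not any(k in text for k in keyword3))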
Example #7
    def __init__(self,
                 tab_list,
                 tab_unique_key_list,
                 tab_ensure_index_list,
                 parser_count=None,
                 site_parsers=None,
                 parser_params={},
                 begin_callback=None,
                 end_callback=None,
                 delete_tab_urls=False):
        '''
        @summary:
        ---------
        @param tab_list: list of table names; the first entry is the url table
        @param tab_unique_key_list: unique key field for each table in tab_list
        @param tab_ensure_index_list: index fields for each table in tab_list
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param parser_params : parameters passed to the parsers
        @param begin_callback:  callback invoked when the spider starts
        @param end_callback:    callback invoked when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()
        self._db = MongoDB()

        self._tab_urls = tab_list[0]
        if delete_tab_urls: self._db.delete(self._tab_urls)

        self._site_parsers = site_parsers

        for tab_index in range(len(tab_list)):
            self._db.set_unique_key(tab_list[tab_index],
                                    tab_unique_key_list[tab_index])
            # Create indexes to speed up queries
            for ensure_index in tab_ensure_index_list[tab_index]:
                self._db.set_ensure_index(tab_list[tab_index], ensure_index)

        self._collector = Collector(self._tab_urls, self._site_parsers)
        self._parsers = []

        self._parser_params = parser_params

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')
    def __init__(self, collector, tab_images):
        super(ImagePornControl, self).__init__()

        self._collector = collector
        self._tab_images = tab_images

        self._deal_image_count = int(
            tools.get_conf_value('../config.conf', "image_porn",
                                 "deal_image_count"))
        self._interval = int(
            tools.get_conf_value('../config.conf', "image_porn", "sleep_time"))

        self._db = MongoDB()
        self._image_porn_recg = ImagePornRecg()
    def __init__(self, tab_urls, depth):
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False
Example #10
def main():
    search_task_sleep_time = int(
        tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))
    # Update task status: reset in-progress tasks to waiting
    while True:
        # Check the task status; if a task is in progress, sleep and continue
        # TODO

        search_keyword1 = ['hi']
        search_keyword2 = ['hello']
        search_keyword3 = ['hello, hi']
        task_id = 1

        # If there is no task, sleep and continue
        # TODO

        def begin_callback():
            log.info('\n********** template begin **********')
            # Update the task status to doing

        def end_callback():
            log.info('\n********** template end **********')

            # Update the task status to done

            # Export data
            # export_data = ExportData(source_table = '', aim_table = '', key_map = '', unique_key = '')
            # export_data.export_to_oracle()

        # Configure the spider
        # spider = Spider(tab_urls = 'template_urls', tab_site = 'template_site_info', tab_content = '', parser_count = 1, begin_callback = begin_callback, end_callback = end_callback)
        spider = Spider(tab_urls='template_urls',
                        tab_site='template_site_info',
                        tab_content='template_content_info',
                        parser_count=1,
                        begin_callback=begin_callback,
                        end_callback=end_callback,
                        search_keyword1=search_keyword1,
                        search_keyword2=search_keyword2,
                        search_keyword3=search_keyword3)

        # Add parsers
        spider.add_parser(xxx_parser)
        spider.add_parser(yyy_parser)

        spider.start()

        # time.sleep(search_task_sleep_time)
        break
Example #11
Created on 2017-08-22 14:06
---------
@summary: Sync the oracle database to Elasticsearch
---------
@author: Boris
'''

import sys
sys.path.append('../')
import init
import utils.tools as tools
from elasticsearch import Elasticsearch
import elasticsearch.helpers
from utils.log import log

ADDRESS = tools.get_conf_value('config.conf', 'elasticsearch', 'yqtj')


class Singleton(object):
    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, '_inst'):
            cls._inst = super(Singleton, cls).__new__(cls)

        return cls._inst


class ES():
    def __init__(self, address=ADDRESS):
        try:
            print(address.split(','))
            self._es = Elasticsearch(address.split(','))
Example #12
# -*- coding: utf-8 -*-
'''
Created on 2016-11-16 16:25
---------
@summary: Operations on the mongo database
---------
@author: Boris
'''
import sys
sys.path.append('../')
import init
import pymongo
import utils.tools as tools
from utils.log import log

IP = tools.get_conf_value('config.conf', 'mongodb', 'ip')
PORT = int(tools.get_conf_value('config.conf', 'mongodb', 'port'))
DB = tools.get_conf_value('config.conf', 'mongodb', 'db')


class Singleton(object):
    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, '_inst'):
            cls._inst = super(Singleton, cls).__new__(cls)  # object.__new__() takes no extra arguments

        return cls._inst

class MongoDB(Singleton):
    def __init__(self, ip = IP, port = PORT, db = DB):
        super(MongoDB, self).__init__()
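
The Singleton base class above caches the first instance on the subclass, so every later construction returns the same object; note that __init__ still runs on each call and can overwrite state. A throwaway illustration (hypothetical, for demonstration only, not part of the project):

class _Demo(Singleton):
    def __init__(self, value=0):
        self.value = value

a = _Demo(1)
b = _Demo(2)
assert a is b        # one shared instance
assert a.value == 2  # __init__ ran again and overwrote the state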
---------
@author: Boris
'''
import sys
sys.path.append('../')
import init
import cx_Oracle
import utils.tools as tools
from utils.log import log
import datetime
import os
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'  # prevent garbled Chinese characters in query results

STOP_ORCL = False  # disable oracle

IP = tools.get_conf_value('config.conf', 'oracledb', 'ip')
PORT = int(tools.get_conf_value('config.conf', 'oracledb', 'port'))
DB = tools.get_conf_value('config.conf', 'oracledb', 'db')
USER_NAME = tools.get_conf_value('config.conf', 'oracledb', 'user_name')
USER_PASS = tools.get_conf_value('config.conf', 'oracledb', 'user_pass')


class Singleton(object):
    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, '_inst'):
            cls._inst = super(Singleton, cls).__new__(cls)  # object.__new__() takes no extra arguments

        return cls._inst


class OracleDB(Singleton):
Example #14
'''
import sys
sys.path.append('..')
import init

import utils.tools as tools
from db.elastic_search import ES
from base.compare_keywords import CompareKeywords
from word_cloud.word_cloud import WordCloud
from base.hot_sync import HotSync
from base.vip_checked import VipChecked
from summary.summary import Summary
from emotion.emotion import Emotion
from utils.log import log

ADDRESS = tools.get_conf_value('config.conf', 'elasticsearch', 'data-pool')
SYNC_TIME_FILE = 'iopm_sync/sync_time.txt'
IOPM_SERVICE_ADDRESS = 'http://localhost:8080/'
SLEEP_TIME = int(tools.get_conf_value('config.conf', 'sync', 'sleep_time'))


class ArticleSync():
    def __init__(self, table):
        self._record_time = tools.get_json(
            tools.read_file(SYNC_TIME_FILE)) or {}
        self._compare_keywords = CompareKeywords()
        self._summary = Summary()
        self._emotion = Emotion()
        self._word_cloud = WordCloud()
        self._es = ES()
        self._hot_sync = HotSync()
Example #15
import sys
sys.path.append('../')
import init
import pid
pid.record_pid(__file__)
import utils.tools as tools
from utils.log import log
from base.spider import Spider
from utils.export_data import ExportData

# Needs to be configured
import news.task_status as task_status
from news.parsers import *

MASTER_ADDRESS = tools.get_conf_value('config.conf', 'master', 'address')
SEARCH_TASK_SLEEP_TIME = int(
    tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))


def main():
    while True:
        if task_status.is_doing:
            log.debug('A task is already in progress; not fetching a new one')
            tools.delay_time(SEARCH_TASK_SLEEP_TIME)
            continue

        task_status.is_doing = True

        # Look up a task
        get_task_url = MASTER_ADDRESS + '/task/get_task'
Example #16
HEADER = {
    "Query": "String Parameters",
    "view": "URL encoded",
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Cache-Control": "max-age=0",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Host": "qyapi.weixin.qq.com"
}

ORGANIZATION = tools.get_conf_value('config.conf', 'wechat', 'organization')


class WechatService():
    _depertment_id = None

    def __init__(self, corpid, send_msg_secret, sync_user_sercet, agentid):
        self._agentid = agentid
        self._send_msg_access_token = self.get_access_token(
            corpid, send_msg_secret)
        self._sync_user_access_token = self.get_access_token(
            corpid, sync_user_sercet)

        if not WechatService._depertment_id:
            WechatService._depertment_id = self.get_depertment_id(ORGANIZATION)
            if not WechatService._depertment_id:  # department not in the contact list; create it
Example #17
        } while ( i < a && n == - 1 );
        if (n == -1) break;
        u += String.fromCharCode((15 & r) << 4 | (60 & n) >> 2);
        do {
            if (o = 255 & e.charCodeAt(i++), 61 == o) return u;
            o = l[o]
        } while ( i < a && o == - 1 );
        if (o == -1) break;
        u += String.fromCharCode((3 & n) << 6 | o)
    }
    return u
}
'''

ONE_PAGE_TIME_INTERVAL = 3600
FILE_LOCAL_PATH = tools.get_conf_value('config.conf', 'files',
                                       'headlines_save_path')
NEWS_LOCAL = 1
VIDEO = 2
STORAGE_ID = 2


# Must be defined: add site info
@tools.run_safe_model(__name__)
def add_site_info():
    log.debug('Adding site info')
    site_id = SITE_ID
    name = NAME
    table = 'VAApp_site_info'
    url = 'http://sj.qq.com/myapp/detail.htm?apkName=com.ss.android.article.news'

    base_parser.add_website_info(table, site_id, url, name)
Example #18
---------
@summary: Sync the oracle database to Elasticsearch
---------
@author: Boris
'''

import sys

sys.path.append('../')
import init
import utils.tools as tools
from elasticsearch import Elasticsearch
import elasticsearch.helpers
from utils.log import log

ADDRESS = tools.get_conf_value('config.conf', 'elasticsearch', 'address')


class Singleton(object):
    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, '_inst'):
            cls._inst = super(Singleton, cls).__new__(cls)

        return cls._inst


class ES():
    def __init__(self, address=ADDRESS):
        try:
            print(address.split(','))
            self._es = Elasticsearch(address.split(','))
Example #19
import collections

from utils.log import log
import utils.tools as tools
import web
import json
import random
from service.wechat_service import WechatService

MIN_SLEEP_TIME = 30000  # interval between each history-list / article-detail request, in milliseconds
MAX_SLEEP_TIME = 65000
MIN_WAIT_TIME = 1000 * 60 * 60 * 6  # rest time after finishing all official accounts, before the next round
MAX_WAIT_TIME = 1000 * 60 * 60 * 8

ONLY_TODAY_MSG = int(
    tools.get_conf_value('config.conf', 'spider', 'only_today_msg'))
SPIDER_START_TIME = tools.get_conf_value('config.conf', 'spider',
                                         'spider_start_time')


class WechatAction():
    _wechat_service = WechatService()
    _todo_urls = collections.deque()  # urls still to be processed

    _article_info = {  # article cache: list info is stored first, then view/like counts; once comment info is also fetched, the record is written to the database
        "article_id": {
            "title": "",
            "content": "",
            #....
        }
    }
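
The sleep constants above are expressed in milliseconds; presumably a random delay in that range is used between requests. A small sketch of that idea (an assumption, since the scheduling code is not shown here):

import random
import time

def random_delay(min_ms=MIN_SLEEP_TIME, max_ms=MAX_SLEEP_TIME):
    # Hypothetical helper: sleep for a random interval within the configured bounds.
    time.sleep(random.randint(min_ms, max_ms) / 1000.0)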
Example #20
# -*- coding: utf-8 -*-
'''
Created on 2017-12-11 15:13
---------
@summary: Sync news
---------
@author: Administrator
'''
import sys
sys.path.append('..')
import init

import utils.tools as tools
from db.elastic_search import ES

ADDRESS = tools.get_conf_value('config.conf', 'elasticsearch', 'data-pool')
SYNC_TIME_FILE = 'iopm_sync/sync_time.txt'

class NewsSync():
    def __init__(self):
        self._record_time = tools.get_json(tools.read_file(SYNC_TIME_FILE)) or {}

    def _get_per_record_time(self):
        news_record_time = self._record_time.get('news_record_time')

        return news_record_time

    def _record_now_record_time(self, record_time):
        self._record_time['news_record_time'] = record_time
        tools.write_file(SYNC_TIME_FILE, tools.dumps_json(self._record_time))
Example #21
import collections

from utils.log import log
import utils.tools as tools
from db.oracledb import OracleDB
from db.elastic_search import ES
from base.wechat_sogou import WechatSogou
from base.wechat_public_platform import WechatPublicPlatform
from base import constance
import random

SIZE = 100
TIME_INTERVAL = 24 * 60 * 60

CHECK_NEW_ARTICLE = int(
    tools.get_conf_value('config.conf', 'spider',
                         'only_today_msg'))  # only crawl when new articles have been published


class WechatService():
    _db = OracleDB()
    _es = ES()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()

    _todo_accounts = collections.deque()
    _rownum = 1

    _is_done = False  # one full round finished
    _is_all_done = False  # messages published today by all accounts have been crawled

    # last time wechat_sogou was not blocked
import init
import base.constance as Constance
import base.base_parser as base_parser
import utils.tools as tools
from utils.log import log
from db.mongodb import MongoDB
from db.oracledb import OracleDB

SITE_ID = 10004
search_type = 102
NAME = '新浪微博'

db = MongoDB()
oracledb = OracleDB()
FILE_LOCAL_PATH = tools.get_conf_value('config.conf', 'files',
                                       'wwa_save_path') + 'weibo/'


def get_release_time(mblog):
    try:
        release_time = mblog['created_at']
        data = tools.time.time()
        ltime = tools.time.localtime(data)
        timeStr = tools.time.strftime("%Y-%m-%d", ltime)
        if tools.re.compile('今天').findall(release_time):
            release_time = release_time.replace('今天', '%s' % timeStr)
        elif tools.re.compile('小时前').findall(release_time):
            nhours = tools.re.compile('(\d+)小时前').findall(release_time)
            hours_ago = (tools.datetime.datetime.now() -
                         tools.datetime.timedelta(hours=int(nhours[0])))
            release_time = hours_ago.strftime("%Y-%m-%d %H:%M")
Example #23
sys.path.append('../../')

import base.base_parser as base_parser
import init
import utils.tools as tools
from utils.log import log
import base.constance as Constance
import re
import time

# Must be defined: site id
SITE_ID = 6
# Must be defined: site name
NAME = '酷6视频'

FILE_LOCAL_PATH = tools.get_conf_value('config.conf', 'files',
                                       'program_save_path')


# Must be defined: add site info
@tools.run_safe_model(__name__)
def add_site_info():
    log.debug('Adding site info')
    site_id = SITE_ID
    name = NAME
    table = 'PROGRAM_site_info'
    url = "http://news.v1.cn/V1make.shtml"
    base_parser.add_website_info(table, site_id, url, name)


# Must be defined: add root urls
@tools.run_safe_model(__name__)
Example #24
'''
Created on 2017-12-29 10:44
---------
@summary: Filter information belonging to the configured province
---------
@author: Boris
'''
import sys
sys.path.append('../')
import init

import utils.tools as tools
from utils.log import log
from db.oracledb import OracleDB

PROVINCE = tools.get_conf_value('config.conf', 'province', 'province')


class ProvinceFilter():
    def __init__(self, province_name=PROVINCE):
        self._province_airs = []
        self._db = OracleDB()
        if province_name:
            self._province_airs.append(province_name)
            province_id = self.load_province_id(province_name)
            if province_id:
                self._province_airs.extend(
                    air[0] for air in self.load_province_air(province_id))
                # self._province_airs.extend(town[0] for town in self.load_province_town(province_id))
        else:  # nationwide
            self._province_airs.extend(province[0]
Example #25
---------
@author: Yongxin_Yang
'''
import sys

sys.path.append("../")
from aiohttp_requests import requests
import time
from db.redisdb import RedisDB
from utils.log import log
from utils import tools
import os
import asyncio

config = os.path.join(os.path.dirname(__file__), '../config.conf')
redis_key = tools.get_conf_value(config, 'redis', 'redis_key')


class Detection(object):
    def __init__(self):
        self.redis = RedisDB()
        self.test_url = "https://movie.douban.com/"

    #@tools.debug
    async def get_html(self, root_url, proxy, semaphore):
        try:
            test_proxy = "http://" + proxy
            log.debug("正在测试代理:" + test_proxy)
            async with semaphore:
                response = await requests.get(root_url,
                                              proxy=test_proxy,
Example #26
sys.path.append('../../')

import base.base_parser as base_parser
import news.parsers.base_parser as self_base_parser
import init
import utils.tools as tools
from utils.log import log
import base.constance as Constance
from extractor.article_extractor import ArticleExtractor
# print(article_extractor.article_extractor)
# Must be defined: site id
SITE_ID = 1
# Must be defined: site name
NAME = '新闻正文提取'

DEPTH = int(tools.get_conf_value('config.conf', "collector", "depth"))


# Must be defined: add site info
@tools.run_safe_model(__name__)
def add_site_info():
    log.debug('Adding site info')
    pass


# Must be defined: add root urls
@tools.run_safe_model(__name__)
def add_root_url(parser_params={}):
    log.debug('''
        Adding root urls
        parser_params : %s
Example #27
---------
@summary: Sync news
---------
@author: Administrator
'''
import sys
sys.path.append('..')
import init
import pid
pid.record_pid(__file__)

import utils.tools as tools
from utils.log import log
from base.article_sync import ArticleSync

SLEEP_TIME = int(tools.get_conf_value('config.conf', 'sync', 'sleep_time'))


class VideoSync(ArticleSync):
    def __init__(self):
        super(VideoSync, self).__init__('video_news')

    @tools.log_function_time
    def deal_video_article(self, video_news_list):
        '''
        @summary: Process videos
        ---------
        @param video_news_list:
        # video_news:
           {
                "time_length":null,
Example #28
@author: Administrator
'''
import sys
sys.path.append('..')
import init

import utils.tools as tools
from db.elastic_search import ES
from cluster.compare_text import compare_text
from copy import deepcopy
from base.hot_week_sync import HotWeekSync
import random
from utils.cut_text import CutText

MIN_SIMILARITY = 0.5  # similarity threshold
IOPM_SERVICE_ADDRESS = tools.get_conf_value('config.conf', 'iopm_service', 'address')

INFO_WEIGHT = {
    1: 6,  # news
    2: 2,  # WeChat
    3: 1,  # Weibo
    8: 1,  # video
}

class HotSync():
    def __init__(self):
        self._es = ES()
        self._hot_week_sync = HotWeekSync()
        self._cut_text = CutText()
        self._cut_text.set_stop_words('utils/stop_words.txt')
Example #29
def main():
    search_task_sleep_time = int(
        tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))

    db = OracleDB()

    # Set tasks within their monitoring window to not-done
    sql = 'update tab_ivms_task_info t set t.task_status = 501 where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
    db.update(sql)

    # Set the keyword status to not-done
    sql = 'update tab_ivms_task_keyword k set k.finish_status = 601 where k.task_id in (select t.task_id from tab_ivms_task_info t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time)'
    db.update(sql)

    while True:
        # Look up a task
        log.debug('Looking up tasks...')

        sql = 'select t.task_id from TAB_IVMS_TASK_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time and t.task_status = 501'
        result = db.find(sql, fetch_one=True)
        if not result:
            break

        task_id = result[0]

        while True:
            # Check whether any keyword task is currently running
            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 602' % task_id
            do_task = db.find(sql, fetch_one=True)
            if do_task:
                time.sleep(search_task_sleep_time)
                continue

            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 601' % task_id
            result = db.find(sql, fetch_one=True)
            if not result:
                break

            keyword_id = result[0]
            task_id = result[1]
            search_keyword1 = []
            search_keyword2 = result[2].split(',') if result[2] else []
            search_keyword3 = result[3].split(',') if result[3] else []

            def begin_callback():
                log.info('\n********** VA begin **********')
                # Update the task status to in-progress
                sql = 'update TAB_IVMS_TASK_INFO set task_status = 502 where task_id = %d' % task_id
                db.update(sql)

                # Update the keyword status to in-progress
                sql = 'update tab_ivms_task_keyword set finish_status = 602 where id = %d' % keyword_id
                db.update(sql)

            def end_callback():
                # Update the keyword status to done
                sql = 'update tab_ivms_task_keyword set finish_status = 603 where id = %d' % keyword_id
                db.update(sql)

                # If all keywords for this task are done, mark the task itself as done
                sql = 'select t.* from tab_ivms_task_keyword t where task_id = %d and finish_status = 601' % task_id
                results = db.find(sql)
                if not results:
                    # Export data
                    key_map = {
                        'program_id': 'vint_sequence.nextval',
                        'search_type': 'int_search_type',
                        'program_name': 'str_title',
                        'program_url': 'str_url',
                        'release_date': 'date_release_time',
                        'image_url': 'str_image_url',
                        'program_content': 'str_content',
                        'task_id': 'vint_%d' % task_id,
                        'keyword': 'str_keyword',
                        'keyword_count': 'int_keyword_count',
                        'check_status': 'vint_202'
                    }

                    export = ExportData('VA_content_info',
                                        'tab_ivms_program_info', key_map,
                                        'program_url')
                    export.export_to_oracle()

                    # Update the task status to done
                    sql = 'update TAB_IVMS_TASK_INFO set task_status = 503 where task_id = %d' % task_id
                    db.update(sql)
                    log.info('\n********** VA end **********')

            # Configure the spider
            spider = Spider(tab_urls='VA_urls',
                            tab_site='VA_site_info',
                            tab_content='VA_content_info',
                            parser_count=1,
                            begin_callback=begin_callback,
                            end_callback=end_callback,
                            search_keyword1=search_keyword1,
                            search_keyword2=search_keyword2,
                            search_keyword3=search_keyword3)

            # Add parsers
            spider.add_parser(baidu_parser)
            spider.add_parser(magnet_parser)
            spider.add_parser(netdisk_parser)
            spider.add_parser(weibo_parser)
            spider.add_parser(wechat_parser)
            spider.add_parser(soubaidupan_parser)
            spider.add_parser(douban_parser)

            spider.start()

            time.sleep(search_task_sleep_time)
Example #30
sys.path.append('..')
import init

import utils.tools as tools
from db.elastic_search import ES
from base.compare_keywords import CompareKeywords
from word_cloud.word_cloud import WordCloud
from base.hot_sync import HotSync
from base.vip_checked import VipChecked
from summary.summary import Summary
from emotion.emotion import Emotion
from utils.log import log
from base.province_filter import ProvinceFilter
from base.event_filter import EventFilter  # tag event category; moved to weekly hot topics

DATA_POOL = tools.get_conf_value('config.conf', 'elasticsearch', 'data-pool')
YQTJ = tools.get_conf_value('config.conf', 'elasticsearch', 'yqtj')
PROVINCE = tools.get_conf_value('config.conf', 'province', 'province')
IOPM_SERVICE_ADDRESS = tools.get_conf_value('config.conf', 'iopm_service',
                                            'address')
SLEEP_TIME = int(tools.get_conf_value('config.conf', 'sync', 'sleep_time'))

SYNC_TIME_FILE = 'iopm_sync/sync_time/'


class ArticleSync():
    def __init__(self, table):
        self._sync_time_file = SYNC_TIME_FILE + table + '.txt'
        self._record_time = tools.get_json(
            tools.read_file(self._sync_time_file)) or {}
        self._compare_keywords = CompareKeywords()