class Collector(threading.Thread):
    def __init__(self, tab_urls):
        super(Collector, self).__init__()
        self._lock = threading.RLock()

        self._db = MongoDB()
        self._thread_stop = False
        self._urls = []
        self._null_times = 0
        self._read_pos = -1
        self._write_pos = -1
        self._tab_urls = tab_urls
        self._depth = int(
            tools.get_conf_value('config.conf', "collector", "depth"))
        self._max_size = int(
            tools.get_conf_value('config.conf', "collector", "max_size"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        # On startup, reset tasks that were in progress (DOING) back to TODO
        self._db.update(self._tab_urls, {'status': Constance.DOING},
                        {'status': Constance.TODO})

        self._finished_callback = None

    def run(self):
        while not self._thread_stop:
            self.__input_data()
            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True

        if self._finished_callback:
            self._finished_callback()

    @tools.log_function_time
    def __input_data(self):
        log.debug('read_pos %d, write_pos %d buffer size %d' %
                  (self._read_pos, self._write_pos, self.get_max_read_size()))
        log.debug('buffer can write size = %d' % self.get_max_write_size())
        if self.get_max_write_size() == 0:
            log.debug("collector 已满 size = %d" % self.get_max_read_size())
            return

        url_count = min(self._url_count, self.get_max_write_size())

        urls_list = []
        if self._depth:
            urls_list = self._db.find(self._tab_urls, {
                "status": Constance.TODO,
                "depth": {
                    "$lte": self._depth
                }
            },
                                      limit=url_count)
        else:
            urls_list = self._db.find(self._tab_urls,
                                      {"status": Constance.TODO},
                                      limit=url_count)

        # Mark the fetched urls as DOING
        for url in urls_list:
            self._db.update(self._tab_urls, url, {'status': Constance.DOING})

        # Buffer the urls
        self.put_urls(urls_list)

        if self.is_all_have_done():
            self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # No urls left to process
    def is_all_have_done(self):
        if self.get_max_read_size() == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times:
                # Check the database for urls that are still being processed
                urls_doing = self._db.find(self._tab_urls,
                                           {'status': Constance.DOING})
                if urls_doing:
                    self._null_times = 0
                    return False
                else:
                    return True
            else:
                return False
        else:
            self._null_times = 0
            return False

    def get_max_write_size(self):
        size = 0
        if self._read_pos == self._write_pos:
            size = self._max_size
        elif self._read_pos < self._write_pos:
            size = self._max_size - (self._write_pos - self._read_pos)
        else:
            size = self._read_pos - self._write_pos

        return size - 1

    def get_max_read_size(self):
        return self._max_size - 1 - self.get_max_write_size()
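
    # A small worked example of the ring-buffer bookkeeping above (illustrative
    # only, assuming max_size = 5):
    #   read_pos == write_pos (e.g. both -1)  -> writable = 5 - 1 = 4, readable = 0
    #   after writing 3 urls, write_pos = 2   -> writable = 5 - (2 - (-1)) - 1 = 1, readable = 3
    # One slot is always kept empty so that read_pos == write_pos unambiguously
    # means "buffer empty" rather than "buffer full".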

    @tools.log_function_time
    def put_urls(self, urls_list):
        if not urls_list:
            return

        # Append the urls to the _urls ring buffer
        url_count = len(urls_list)
        end_pos = url_count + self._write_pos + 1
        # If the write runs past the end of the buffer, the overflow wraps
        # around and is written from the beginning
        # Overflowing part
        overflow_end_pos = end_pos - self._max_size
        # Non-overflowing part
        in_pos = end_pos if end_pos <= self._max_size else self._max_size

        # Number of urls that fit before wrapping
        urls_list_cut_pos = in_pos - self._write_pos - 1

        self._lock.acquire()  # lock

        self._urls[self._write_pos + 1:in_pos] = urls_list[:urls_list_cut_pos]
        if overflow_end_pos > 0:
            self._urls[:overflow_end_pos] = urls_list[urls_list_cut_pos:]

        self._lock.release()

        self._write_pos += url_count
        self._write_pos %= self._max_size  # avoids the -1 modulo issue: -1 % 1000 = 999 would make the writable size 0; returning early when urls_list is empty sidesteps this

    @tools.log_function_time
    def get_urls(self, count):
        self._lock.acquire()  # lock
        urls = []

        count = min(count, self.get_max_read_size())
        end_pos = self._read_pos + count + 1
        if end_pos > self._max_size:
            urls.extend(self._urls[self._read_pos + 1:])
            urls.extend(self._urls[:end_pos % self._max_size])
        else:
            urls.extend(self._urls[self._read_pos + 1:end_pos])

        if urls:
            self._read_pos += len(urls)
            self._read_pos %= self._max_size

        self._lock.release()

        return urls
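
# A minimal usage sketch for the Collector above (illustrative only): it assumes
# a MongoDB url table named 'urls_news', a config.conf providing the [collector]
# settings read in __init__, and a hypothetical parse_url() consumer function
# that is not part of the original code.
def _collector_usage_example():
    collector = Collector('urls_news')
    collector.start()  # begins polling TODO urls into the ring buffer

    while not collector.is_finished():
        for url in collector.get_urls(10):  # a consumer drains up to 10 urls at a time
            parse_url(url)
        time.sleep(1)

    collector.stop()
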
class ExportData():
    def __init__(self, source_table, aim_table, key_map, unique_key=None):
        '''
        @summary: Initialization
        ---------
        @param source_table: source table
        @param aim_table:    target table
        @param key_map:      mapping between the target table keys and the source table keys
        eg: key_map = {
            'aim_key1' : 'str_source_key2',          # target key = value of the source key,   type str
            'aim_key2' : 'int_source_key3',          # target key = value of the source key,   type int
            'aim_key3' : 'date_source_key4',         # target key = value of the source key,   type date
            'aim_key4' : 'vint_id',                  # target key = literal value,             type int
            'aim_key5' : 'vstr_name',                # target key = literal value,             type str
            'aim_key6' : 'sint_select id from xxx'   # target key = result of the sql query,   type int
            'aim_key7' : 'sstr_select name from xxx' # target key = result of the sql query,   type str
        }

        @param unique_key:    unique key; the target database deduplicates on it
        ---------
        @result:
        '''

        super(ExportData, self).__init__()

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key

        self._mongodb = MongoDB()

        self._is_oracle = False
        self._export_count = 0

    def export_to_oracle(self):
        self._aim_db = OracleDB()
        self._is_oracle = True
        self.__export()

    def export_to_mysql(self):
        self._aim_db = MysqlDB()
        self.__export()

    # @tools.run_safe_model(__name__)
    def __export(self):
        if self._unique_key:
            self._aim_db.set_unique_key(self._aim_table, self._unique_key)

        aim_keys = tuple(self._key_map.keys())
        source_keys = tuple(self._key_map.values())

        # Extract the type and key from each source key (a source key encodes both type and key)
        keys = []
        value_types = []
        for source_key in source_keys:
            temp_var = source_key.split('_', 1)
            value_types.append(temp_var[0])
            keys.append(temp_var[1])

        datas = self._mongodb.find(self._source_table, {'read_status': 0})
        for data in datas:
            sql = 'insert into ' + self._aim_table + " (" + ', '.join(
                aim_keys) + ") values ("
            values = []
            for i in range(len(keys)):
                if value_types[i] == 'str':
                    values.append(data[keys[i]].replace(
                        "'", "''"))  # escape single quotes as two single quotes, otherwise the sql statement is malformed
                    sql += "'%s', "

                elif value_types[i] == 'int':
                    values.append(data[keys[i]])
                    if isinstance(data[keys[i]], int):
                        sql += '%d, '
                    else:
                        sql += '%s, '

                elif value_types[i] == 'date':
                    values.append(data[keys[i]])
                    if self._is_oracle:
                        sql += "to_date('%s','yyyy-mm-dd hh24:mi:ss'), "
                    else:
                        sql += "'%s', "

                elif value_types[i] == 'vint':
                    values.append(keys[i])
                    sql += '%s, '

                elif value_types[i] == 'vstr':
                    values.append(keys[i])
                    sql += "'%s', "

                elif value_types[i] == 'sint':
                    value = self._aim_db.find(keys[i], fetch_one=True)
                    values.append(value)
                    sql += '%d, '

                elif value_types[i] == 'sstr':
                    value = self._aim_db.find(keys[i], fetch_one=True)
                    values.append(value)
                    sql += "'%s', "

                else:
                    log.error('%s does not match the format required by key_map' % value_types[i])
                    return

            sql = sql[:-2] + ")"
            sql = sql % tuple(values)

            log.debug(sql)
            if self._aim_db.add(sql):
                self._export_count += 1
                self._mongodb.update(self._source_table, data,
                                     {'read_status': 1})

        # self._aim_db.close()
        log.debug('exported %d records in total' % self._export_count)
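
# A minimal usage sketch for ExportData above (illustrative only): the table
# names and key_map entries are hypothetical, and the source MongoDB documents
# are assumed to carry 'title', 'url' and 'release_time' fields plus the
# 'read_status' flag that __export filters on.
def _export_usage_example():
    key_map = {
        'title': 'str_title',                 # copy the source 'title' as a string
        'url': 'str_url',                     # copy the source 'url' as a string
        'publish_time': 'date_release_time',  # copy the source 'release_time' as a date
        'site_id': 'vint_1',                  # literal integer value 1
    }
    exporter = ExportData('news_article', 'tab_news', key_map, unique_key='url')
    exporter.export_to_mysql()
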
Example #3
def download(title=None):
    if title is not None:
        zhuishu = ZhuiShuSpider()
        # babadushu = BaBaDuShuSpider()
        # liewen = LieWenSpider()
        mongodb = MongoDB('zhuishu')
        # Pass in the spider implementation for whichever site you want to crawl; here the ZhuiShu (追书网) spider instance is used
        novel = spider.Spider(zhuishu, mongodb)
        novel.search(title)
    else:
        print('Please enter the name of the novel to download')


def make_txt(book):
    converter = ConvertToTxt(book['title'])
    g = generator.Generator(converter)
    g.make(book)


def make_epub(book):
    epub = ConvertToEpub(book['title'], book['author'], book['cover'],
                         book['intro'], book['chapters'])
    epub.make()


if __name__ == '__main__':
    mongodb = MongoDB('zhuishu')
    book = mongodb.find(title='天将夜').next()
    make_epub(book)
Example #4
File: dbshot.py  Project: lychlov/MPWatcher
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:     dbshot
   Description :
   Author :       Lychlov
   date:          2018/5/24
-------------------------------------------------
   Change Activity:
                   2018/5/24:
-------------------------------------------------
"""
from db.mongodb import MongoDB

temp_dict = {"title": "政变四周年,曼谷反军方大示威今日正式爆发!",
             "summary": 'jianjie',
             "cover": "http://sdfsdf",
             "receive_time": "2018-05-23 23:23:23",
             "account": '泰国网'}
temp_dict2 = {"title": "政变四周年,曼谷反军方大示威今日正式爆发!",
              "summary": 'jianjie',
              "account": '泰国网'}
mongodb = MongoDB()
add_res = mongodb.add('wechat_article', temp_dict)

res = mongodb.find('wechat_article', temp_dict2)
print(res)
Example #5
class ExportData():
    INSERT = 1
    UPDATE = 2
    EXCEPTION = 3

    def __init__(self,
                 source_table='',
                 aim_table='',
                 key_map='',
                 unique_key=None,
                 unique_key_mapping_source_key=None,
                 update_read_status=True,
                 condition={'read_status': 0},
                 datas=[],
                 callback='',
                 sync_to_es=False):
        '''
        @summary: Initialization
        ---------
        @param source_table: source table (MongoDB)
        @param aim_table:    target table
        @param key_map:      mapping between the target table keys and the source table keys
        eg: key_map = {
            'aim_key1' : 'str_source_key2',          # target key = value of the source key,   type str
            'aim_key2' : 'int_source_key3',          # target key = value of the source key,   type int
            'aim_key3' : 'date_source_key4',         # target key = value of the source key,   type date
            'aim_key4' : 'vint_id',                  # target key = literal value,             type int
            'aim_key5' : 'vstr_name',                # target key = literal value,             type str
            'aim_key6' : 'vdate_name',               # target key = literal value,             type date
            'aim_key7' : 'sint_select id from xxx'   # target key = result of the sql query,   type int
            'aim_key8' : 'sstr_select name from xxx' # target key = result of the sql query,   type str
            'aim_key9' : 'clob_key8'                 # target key = value of the source key,   type clob
            'aim_key10' : 'clob_key8'                # target key = value of the source key,   type str
        }

        @param unique_key:    unique key; the target database deduplicates on it
        @param unique_key_mapping_source_key: source-table key corresponding to the unique key in the target table; when not empty, existing rows in the target table are updated
         eg: unique_key_mapping_source_key = {
            'url':'str_url'                         # target key = value of the source key,   type str
         }
        @param condition:    condition the exported data must satisfy; defaults to read_status = 0
        @param datas:   data to export, either [{...},{...}] or {}, used to import a json array directly into the target table; when empty, the MongoDB data is exported instead
        @param callback: callback invoked once per exported record, callback(execute_type, sql, data_json), where execute_type is the execution type (ExportData.INSERT, ExportData.UPDATE, ExportData.EXCEPTION)
        and sql is the executed statement
        ---------
        @result:
        '''

        super(ExportData, self).__init__()

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._update_read_status = update_read_status
        self._condition = condition

        self._mongodb = MongoDB() if self._source_table else ''
        self._datas = datas
        self._sync_to_es = sync_to_es
        self._callback = callback

        self._is_oracle = False
        self._is_set_unique_key = False
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key

    def export_to_oracle(self,
                         source_table='',
                         aim_table='',
                         key_map='',
                         unique_key=None,
                         unique_key_mapping_source_key=None,
                         update_read_status=True,
                         condition={'read_status': 0},
                         datas=[],
                         callback='',
                         sync_to_es=False):
        if aim_table:
            if self._aim_table != aim_table:
                self._is_set_unique_key = False
                self._es = ES() if sync_to_es else ''
                self._mongodb = MongoDB() if source_table else ''

            self._source_table = source_table
            self._aim_table = aim_table
            self._key_map = key_map
            self._unique_key = unique_key
            self._export_count = 0
            self._update_count = 0
            self._unique_key_mapping_source_key = unique_key_mapping_source_key
            self._update_read_status = update_read_status if not datas else False
            self._condition = condition
            self._datas = datas
            self._callback = callback
            self._sync_to_es = sync_to_es
            self._es = ES() if sync_to_es else None

        self._aim_db = OracleDB()
        self._is_oracle = True

        return self.__export()

    def export_to_mysql(self,
                        source_table='',
                        aim_table='',
                        key_map='',
                        unique_key=None,
                        unique_key_mapping_source_key=None,
                        update_read_status=True,
                        condition={'read_status': 0},
                        datas=[],
                        callback=''):
        if self._aim_table != aim_table:
            self._is_set_unique_key = False

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key
        self._update_read_status = update_read_status if not datas else False
        self._condition = condition
        self._datas = datas
        self._callback = callback

        self._aim_db = MysqlDB()
        return self.__export()

    def make_sql(self, data):
        '''
        @summary:
        ---------
        @param data: data dict
        ---------
        @result: returns insert_sql, update_sql, data_json when unique_key_mapping_source_key is not empty, otherwise insert_sql, data_json
        '''
        aim_keys = tuple(self._key_map.keys())
        source_keys = tuple(self._key_map.values())

        # Extract the type and key from each source key (a source key encodes both type and key)
        keys = []
        value_types = []
        for source_key in source_keys:
            temp_var = source_key.split('_', 1)
            value_types.append(temp_var[0])
            keys.append(temp_var[1])

        insert_sql = 'insert into ' + self._aim_table + " (" + ', '.join(
            aim_keys) + ") values ("
        update_sql = 'update ' + self._aim_table + " set "
        data_json = {}  # used when syncing the record to ES
        values = []
        for i in range(len(keys)):
            if (value_types[i] != 'vint' and value_types[i] != 'vstr'
                    and value_types[i] != 'vdate' and value_types[i] != 'sint'
                    and value_types[i] != 'sstr') and (not data[keys[i]]
                                                       and data[keys[i]] != 0):
                values.append('null')
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]
                data_json[aim_keys[i].upper()] = None

            elif value_types[i] == 'str':
                values.append(
                    str(data[keys[i]]).replace("'", "''")
                )  # escape single quotes as two single quotes, otherwise the insert_sql statement is malformed
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % values[-1]
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'clob':
                text = str(data[keys[i]]).replace("'", "''")
                if not text:
                    insert_sql += "'%s', "
                    values.append(text)
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = None
                else:
                    values_ = tools.cut_string(text, 1000)

                    clob_text = ''
                    for value in values_:
                        clob_text += "to_clob('%s') || " % value

                    clob_text = clob_text[:-len(' || ')]
                    values.append(clob_text)
                    insert_sql += "%s, "
                    update_sql += aim_keys[i] + " = %s, " % values[-1]
                    data_json[aim_keys[i].upper()] = data[keys[i]]

            elif value_types[i] == 'int':
                if isinstance(data[keys[i]], int) or isinstance(
                        data[keys[i]], float) or isinstance(
                            data[keys[i]], str):
                    values.append(data[keys[i]])
                elif isinstance(data[keys[i]], bool):
                    values.append(data[keys[i]] and 1 or 0)
                else:  # _id
                    values.append(int(str(data[keys[i]])[-6:], 16))

                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]
                data_json[aim_keys[i].upper()] = eval(
                    values[-1]) if isinstance(values[-1], str) else values[-1]

            elif value_types[i] == 'date':
                values.append(data[keys[i]].replace('年', '-').replace(
                    '月', '-').replace('日', ''))
                if self._is_oracle:
                    format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(
                        values[-1]) if len(values[-1]) <= 10 else None]
                    insert_sql += "to_date('%s','{}'), ".format(format_date)
                    update_sql += aim_keys[i] + "= to_date('%s','%s'), " % (
                        values[-1], format_date)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    insert_sql += "'%s', "
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'vint':
                if tools.get_english_words(keys[i]):
                    sql = 'select %s from dual' % keys[i]
                    value = self._aim_db.find(sql)[0][0]
                    values.append(value)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    values.append(keys[i])
                    data_json[aim_keys[i].upper()] = eval(values[-1])

                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % values[-1]

            elif value_types[i] == 'vstr':
                values.append(keys[i])
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % values[-1]
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'vdate':
                values.append(keys[i])
                if self._is_oracle:
                    format_date = 'yyyy-mm-dd hh24:mi:ss'[:len(
                        values[-1]) if len(values[-1]) <= 10 else None]
                    insert_sql += "to_date('%s','{}'), ".format(format_date)
                    update_sql += aim_keys[i] + "= to_date('%s','%s'), " % (
                        values[-1], format_date)
                    data_json[aim_keys[i].upper()] = values[-1]
                else:
                    insert_sql += "'%s', "
                    update_sql += aim_keys[i] + " = '%s', " % values[-1]
                    data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'sint':
                value = self._aim_db.find(keys[i], fetch_one=True)[0]
                values.append(value)
                insert_sql += '%s, '
                update_sql += aim_keys[i] + " = %s, " % value
                data_json[aim_keys[i].upper()] = values[-1]

            elif value_types[i] == 'sstr':
                value = self._aim_db.find(keys[i], fetch_one=True)[0]
                values.append(value)
                insert_sql += "'%s', "
                update_sql += aim_keys[i] + " = '%s', " % value
                data_json[aim_keys[i].upper()] = values[-1]

            else:
                error_msg = '%s does not match the format required by key_map' % value_types[i]
                raise Exception(error_msg)

        insert_sql = insert_sql[:-2] + ")"
        insert_sql = insert_sql % tuple(values)
        # tools.print(data_json)

        # log.debug(insert_sql)
        if self._unique_key_mapping_source_key:
            # aim_key = tuple(self._unique_key_mapping_source_key.keys())[0]

            # value = tuple(self._unique_key_mapping_source_key.values())[0]
            # temp_var = value.split('_', 1)

            # source_key_types = temp_var[0]
            # source_key = temp_var[1]

            # if source_key_types == 'str':
            #     update_sql = update_sql[:-2] + " where %s = '%s'" %(aim_key, data[source_key])
            # elif source_key_types == 'int':
            #     update_sql = update_sql[:-2] + " where %s = %s" %(aim_key, data[source_key])

            # # log.debug(update_sql)

            return insert_sql, update_sql[:-2], data_json
        else:
            return insert_sql, data_json
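
    # A rough illustration of what make_sql builds (hypothetical table and keys,
    # assuming aim_table = 'tab_news' and key_map = {'title': 'str_title', 'cnt': 'int_num'}),
    # for data = {'title': "It's news", 'num': 3}:
    #   insert_sql: insert into tab_news (title, cnt) values ('It''s news', 3)
    #   update_sql (returned only when unique_key_mapping_source_key is set):
    #               update tab_news set title = 'It''s news', cnt = 3
    #   data_json:  {'TITLE': "It''s news", 'CNT': 3}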

    # @tools.run_safe_model(__name__)
    def __export(self):
        if self._unique_key and not self._is_set_unique_key:
            self._aim_db.set_unique_key(self._aim_table, self._unique_key)
            self._is_set_unique_key = True

        datas = self._mongodb.find(
            self._source_table,
            condition=self._condition) if self._mongodb else (
                self._datas
                if isinstance(self._datas, list) else [self._datas])
        for data in datas:
            if self._unique_key_mapping_source_key:
                insert_sql, update_sql, data_json = self.make_sql(data)
            else:
                insert_sql, data_json = self.make_sql(data)

            # tools.write_file(self._aim_table + '.txt', insert_sql, 'w+')
            def exception_callfunc(e):
                if 'ORA-00001' in str(e):
                    if self._update_read_status:
                        self._mongodb.update(self._source_table, data,
                                             {'read_status': 1})
                else:
                    log.error(insert_sql)

            execute_type = ExportData.EXCEPTION
            sql = ''
            # log.debug(insert_sql)
            if self._aim_db.add(insert_sql, exception_callfunc):
                self._export_count += 1
                sql = insert_sql
                execute_type = ExportData.INSERT

                if self._update_read_status:
                    self._mongodb.update(self._source_table, data,
                                         {'read_status': 1})

            elif self._unique_key_mapping_source_key:
                # get the id field
                aim_key = tuple(self._unique_key_mapping_source_key.keys())[0]

                value = tuple(self._unique_key_mapping_source_key.values())[0]
                temp_var = value.split('_', 1)

                source_key_types = temp_var[0]
                source_key = temp_var[1]

                select_sql = 'select id from ' + self._aim_table
                if source_key_types == 'str':
                    select_sql = select_sql + " where %s = '%s'" % (
                        aim_key, data[source_key])
                elif source_key_types == 'int':
                    select_sql = select_sql + " where %s = %s" % (
                        aim_key, data[source_key])

                data_id = self._aim_db.find(select_sql)
                if data_id:
                    data_id = data_id[0][0]
                else:
                    continue

                # build the update statement
                update_sql += " where id = %s" % data_id
                log.debug(update_sql)

                # remove the "id = xxx" assignment from the update so the record's ID stays unchanged after the update
                id_info = ''.join(
                    tools.get_info(update_sql, [' id .*?,', ' ID .*?,']))
                update_sql = update_sql.replace(id_info, '')

                # update the ID in data_json
                if "ID" in data_json.keys():
                    data_json["ID"] = data_id

                # update
                if self._aim_db.update(update_sql):
                    self._update_count += 1
                    sql = update_sql
                    execute_type = ExportData.UPDATE

                    if self._update_read_status:
                        self._mongodb.update(self._source_table, data,
                                             {'read_status': 1})

            # sync to ES
            if self._sync_to_es and execute_type != ExportData.EXCEPTION:
                self._es.add(table=self._aim_table,
                             data=data_json,
                             data_id=data_json.get('ID'))

            if self._callback:
                self._callback(execute_type, sql, data_json)

        log.debug('''
            exported %s records in total
            updated %s records in total
            ''' % (self._export_count, self._update_count))

        return self._export_count + self._update_count

    def close(self):
        self._aim_db.close()
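
# A minimal usage sketch for this ExportData variant (illustrative only): the
# table names, key_map and the on_exported callback are hypothetical, and the
# target table is assumed to have a unique 'url' column plus the 'id' column
# used by the update path.
def _export_update_usage_example():
    def on_exported(execute_type, sql, data_json):
        if execute_type == ExportData.EXCEPTION:
            log.error('export failed: %s' % sql)

    # passing source_table here makes __init__ open the MongoDB connection
    exporter = ExportData(source_table='news_article')
    exporter.export_to_mysql(
        source_table='news_article',
        aim_table='tab_news',
        key_map={
            'title': 'str_title',
            'url': 'str_url',
            'publish_time': 'date_release_time',
        },
        unique_key='url',
        unique_key_mapping_source_key={'url': 'str_url'},  # update rows matched on url
        callback=on_exported)
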
Example #6
class Collector(threading.Thread):
    def __init__(self, tab_urls):
        super(Collector, self).__init__()
        self._lock = threading.RLock()

        self._db = MongoDB()
        self._thread_stop = False
        self._urls = []
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = int(
            tools.get_conf_value('config.conf', "collector", "depth"))
        self._max_size = int(
            tools.get_conf_value('config.conf', "collector", "max_size"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        # On startup, reset tasks that were in progress (DOING) back to TODO
        self._db.update(self._tab_urls, {'status': Constance.DOING},
                        {'status': Constance.TODO})

        self._finished_callback = None

    def run(self):
        while not self._thread_stop:
            self.__input_data()
            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if len(self._urls) > self._url_count:
            return

        urls_list = []
        if self._depth:
            urls_list = self._db.find(self._tab_urls, {
                "status": Constance.TODO,
                "depth": {
                    "$lte": self._depth
                }
            },
                                      limit=self._url_count)
        else:
            urls_list = self._db.find(self._tab_urls,
                                      {"status": Constance.TODO},
                                      limit=self._url_count)

        # Mark the fetched urls as DOING
        for url in urls_list:
            self._db.update(self._tab_urls, url, {'status': Constance.DOING})

        # Buffer the urls
        self.put_urls(urls_list)

        if self.is_all_have_done():
            print('is_all_have_done')
            self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # No urls left to process
    def is_all_have_done(self):
        print('checking whether there are unfinished urls')
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times:
                # Check the database for urls that are still being processed
                urls_doing = self._db.find(self._tab_urls,
                                           {'status': Constance.DOING})
                if urls_doing:  # if there are unfinished urls and their count keeps changing, the crawler is not stuck
                    print('unfinished urls: %s' % len(urls_doing))
                    self._null_times = 0
                    return False
                else:
                    return True
            else:
                return False
        else:
            self._null_times = 0
            return False

    # @tools.log_function_time
    def put_urls(self, urls_list):
        self._urls.extend(urls_list)

    # @tools.log_function_time
    def get_urls(self, count):
        self._lock.acquire()  # lock

        urls = self._urls[:count]
        del self._urls[:count]

        self._lock.release()

        return urls