示例#1
0
 def Reload(self, filename):
     '''Rebuild the program and its schedule from *filename*.

     An empty string builds an empty ``Program`` together with a schedule
     that holds a single ``Processor(0, 0.9, 5)``. Any other value is
     passed to ``Program`` and ``LoadProcessors``; the schedule is then
     built from ``self.processors`` and reset with ``SetToDefault``.
     '''
     if filename != "":
         self.program = Program(filename)
         self.LoadProcessors(filename)
         # TODO: exception in case of any errors
         self.schedule = Schedule(self.program, self.processors)
         self.schedule.SetToDefault()
     else:
         # Nothing to load: start from an empty program.
         self.program = Program()
         default_processor = Processor(0, 0.9, 5)
         self.schedule = Schedule(self.program, [default_processor])
示例#2
0
    def __init__(self, v, k, m):
        '''Store the vertex *v* together with its version and processor.

        *k* is kept as is when its class is named ``Version``; otherwise it
        is passed to ``Version(v, k)``. Likewise *m* is kept as is when its
        class is named ``Processor``; otherwise it is passed to
        ``Processor(m)``.
        '''
        self.v = v
        # The checks compare the class *name*, not the class object itself.
        self.k = k if k.__class__.__name__ == "Version" else Version(v, k)
        self.m = m if m.__class__.__name__ == "Processor" else Processor(m)
示例#3
0
    def _getProcessor(self, proc=None):
        '''Move a processor into ``self.processors`` and return it.

        With *proc* given, the matching entry is taken out of
        ``self.emptyprocessors`` (``ValueError`` if it is not there).
        Without it, the first empty processor is reused; when none is left
        a new ``Processor`` is created, numbered
        ``GetProcessorsWithoutDoubles() + 1`` and copying the reliability
        and speed of ``self.availableProcessors[0]``.
        '''
        if proc is not None:
            # TODO: check errors
            position = self.emptyprocessors.index(proc)
            taken = self.emptyprocessors.pop(position)
            self.processors.append(taken)
            return taken

        if len(self.emptyprocessors) == 0:
            # Nothing to reuse: clone the characteristics of the first
            # available processor under a fresh number.
            number = self.GetProcessorsWithoutDoubles() + 1
            template = self.availableProcessors[0]
            chosen = Processor(number, template.reliability, template.speed)
        else:
            chosen = self.emptyprocessors[0]
            # Rebind to a new list rather than mutating the old one in place.
            self.emptyprocessors = self.emptyprocessors[1:]
        self.processors.append(chosen)
        return chosen
示例#4
0
    def LoadProcessors(self, filename):
        '''Read the limits and the processor specs from an XML file.

        For every top-level ``program`` node the ``tdir`` (int) and ``rdir``
        (float) attributes are stored on ``self``, and one
        ``Processor(0, reliability, speed)`` is appended to
        ``self.processors`` for each ``processor`` child, using its
        ``reliability`` (float) and ``speed`` (int) attributes.

        Raises ``ValueError`` if one of those attributes is missing or not
        numeric, and whatever ``open`` / ``xml.dom.minidom.parse`` raise for
        an unreadable or malformed file.

        .. warning:: Describe XML format here
        '''
        # The context manager closes the file even when parsing fails.
        with open(filename, "r") as f:
            dom = xml.dom.minidom.parse(f)

        for node in dom.childNodes:
            # nodeName exists on every DOM node, whereas tagName exists only
            # on elements; a comment or doctype next to the root element must
            # be skipped instead of raising AttributeError.
            if node.nodeName != "program":
                continue
            self.tdir = int(node.getAttribute("tdir"))
            self.rdir = float(node.getAttribute("rdir"))
            # Parse the processor descriptions
            for vertex in node.childNodes:
                if vertex.nodeName == "processor":
                    speed = int(vertex.getAttribute("speed"))
                    rel = float(vertex.getAttribute("reliability"))
                    self.processors.append(Processor(0, rel, speed))
示例#5
0
    def Randomize(self):
        '''Make a random schedule.

        Clears ``self.vertices``, ``self.processors`` and
        ``self.emptyprocessors``, takes a random number of processors
        (between 1 and the number of program vertices) and then inserts the
        program vertices one by one, in ``OrderedVertices()`` order, at a
        random position on a random processor. Each vertex always uses
        ``versions[0]``.

        While the vertices are inserted, ``self.program.vertices`` and
        ``self.program.edges`` are rebuilt incrementally so that they only
        contain what has been inserted so far.
        '''
        #TODO: randomize versions and processors too
        self.vertices = {}
        self.processors = []
        self.emptyprocessors = []
        count = random.randint(1, len(self.program.vertices))
        # NOTE(review): `keys` is filled below but never read in this method.
        keys = []
        procs = {}
        for i in range(count):
            p = self._getProcessor()
            self.vertices[p.number] = []
            keys.append(p.number)
            procs[p.number] = p

        # Use fictional processor number -1 to check correctness of the schedule.
        fict = Processor(-1)
        self.vertices[-1] = []
        verts = self.program.OrderedVertices()
        # Snapshot of the program graph; only the edge list (backup[1]) is
        # read afterwards.
        backup = [[v for v in self.program.vertices],
                  [e for e in self.program.edges]]
        self.program.vertices = []
        self.program.edges = []
        for v in verts:
            # Grow the program by one vertex and keep only the edges whose
            # both endpoints have already been added.
            self.program.vertices.append(v)
            self.program.edges = []
            for e in backup[1]:
                if e.source in self.program.vertices and e.destination in self.program.vertices:
                    self.program.edges.append(e)
            self.program._buildData()
            # Park the new vertex on the fictional processor first.
            s = ScheduleVertex(v, v.versions[0], fict)
            self.currentVersions[v.number] = [s]
            self.vertices[-1] = [s]
            self._succCache = {}
            # Draw (processor, position) pairs until TryMoveVertex accepts one.
            while True:
                # NOTE(review): assumes the processors obtained above are
                # numbered 1..count; otherwise procs[m] raises KeyError --
                # confirm against _getProcessor.
                m = random.randint(1, count)
                # randint is inclusive, so n may equal the current list length.
                n = random.randint(0, len(self.vertices[m]))
                if self.TryMoveVertex(s, 0, procs[m], n) == True:
                    self.MoveVertex(s, 0, procs[m], n)
                    break
        # NOTE(review): if _delEmptyProc removes entries from self.processors,
        # this loop mutates the list it iterates over and may skip
        # processors -- verify.
        for m in self.processors:
            self._delEmptyProc(m)
示例#6
0
    def start_spider_core(self):
        """Connect to the data stores, start every module and supervise them.

        Steps, in order: connect to Redis (up to three attempts) and MySQL,
        create the response buffer, log in through the account manager,
        start the downloader, scheduler, data-persistence and processor
        modules, load the initial tokens into the processor, optionally start
        the e-mail service, then loop forever calling ``check_and_restart``
        on the modules every 180 seconds.

        Returns ``None`` early when a database connection cannot be
        established or the login fails. Once the supervision loop has been
        entered it is only left through an exception.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('Spider 开始启动')

        try:
            # Create the Redis connection, allowing a few attempts.
            redis_connect_retry_times = 3
            last_error = None
            while redis_connect_retry_times > 0:
                self.redis_connection = redis.StrictRedis(
                    host=self.redis_host,
                    port=self.redis_port,
                    db=self.redis_db,
                    password=self.redis_password)
                try:
                    ping = self.redis_connection.ping()
                except redis.RedisError as err:
                    # redis-py signals an unreachable server by raising
                    # rather than returning False; count it as a failed
                    # attempt so that the loop really retries.
                    ping = False
                    last_error = err
                if ping is True:
                    if log.isEnabledFor(logging.INFO):
                        log.info('Redis 服务器连接成功')
                    break

                if log.isEnabledFor(logging.INFO):
                    log.info('Redis 服务器连接失败')
                redis_connect_retry_times -= 1
                # Waiting after the final attempt would only delay the exit.
                if redis_connect_retry_times > 0:
                    time.sleep(5)

            # Give up when every attempt failed; keep the last Redis error
            # as the cause so it shows up in the logged traceback.
            if redis_connect_retry_times <= 0:
                raise Exception() from last_error

            # Create the MySQL connection
            self.mysql_connection = pymysql.connect(host=self.mysql_host,
                                                    user=self.mysql_username,
                                                    passwd=self.mysql_password,
                                                    db=self.mysql_database,
                                                    charset=self.mysql_charset)

        except Exception as e:
            # NOTE: this handler also covers a failed MySQL connection,
            # although the logged message only mentions Redis.
            if log.isEnabledFor(logging.ERROR):
                log.error('Redis 启动失败')
                log.exception(e)
            return

        # Create the response buffer queue
        self.response_buffer = ResponseBuffer()

        # Start the account manager and log in
        self.account_manager = AccountManager(self.login_token, self.password,
                                              self.is_login_by_cookie,
                                              self.z_c0)
        is_login = self.account_manager.login()
        if not is_login:
            return

        # Start the Downloader
        self.downloader = Downloader(
            self.redis_connection, self.response_buffer, self.account_manager,
            self.is_proxy_service_enable, self.session_pool_size,
            self.download_thread_num, self.network_retry_times,
            self.connect_timeout, self.download_interval)
        self.downloader.start_downloader()

        # Start the Scheduler
        self.schedule = Scheduler(self.redis_connection, self.url_rate)
        self.schedule.start()

        # Start DataPersistent
        self.dataPersistent = DataPersistent(
            self.persistent_cache_size,
            self.follow_relation_persistent_cache_size, self.mysql_connection,
            self.redis_connection)
        self.dataPersistent.start_data_persistent()

        # Start the Processor
        self.processor = Processor(self.process_thread_num,
                                   self.is_parser_following_list,
                                   self.is_parser_follower_list,
                                   self.is_parser_follow_relation,
                                   self.redis_connection, self.response_buffer)
        self.processor.start_processor()
        self.processor.load_init_data(self.init_token)

        # Start the e-mail service
        if self.is_email_service_enable is True:
            self.email_service = EmailService(
                self.smtp_server_host, self.smtp_server_port,
                self.smtp_server_password, self.smtp_from_addr,
                self.smtp_to_addr, self.smtp_email_header,
                self.smtp_send_interval, self.dataPersistent)
            self.email_service.start_email_service()
            self.email_service.send_message('Spider 启动完毕')

        if log.isEnabledFor(logging.INFO):
            log.info('Spider 启动完毕')

        # Module health checks
        while True:
            # Downloader health check
            self.downloader.check_and_restart()
            # EmailService health check
            if self.is_email_service_enable is True:
                self.email_service.check_and_restart()
            # DataPersistent health check
            self.dataPersistent.check_and_restart()
            # Scheduler: no health check is performed here
            # Processor health check
            self.processor.check_and_restart()
            # Interval between checks
            time.sleep(180)
            gc.collect()
示例#7
0
class SpiderCore:
    """Entry point of the spider: holds the configuration and runs the modules.

    The constructor sets built-in defaults, overrides them from the
    configuration file (see ``load_config``) and leaves every module handle
    set to ``None`` until ``start_spider_core`` creates it.
    """

    def __init__(self):
        # Default configuration
        # Downloader module settings
        self.is_proxy_service_enable = False
        self.session_pool_size = 20
        self.download_thread_num = 10
        self.network_retry_times = 3
        self.connect_timeout = 30
        self.download_interval = 3

        # Processor module settings
        self.process_thread_num = 2
        self.is_parser_following_list = True
        self.is_parser_follower_list = False
        self.is_parser_follow_relation = False

        # Scheduler module settings
        self.url_rate = 8

        # DataPersistent module settings
        self.persistent_cache_size = 1000
        self.follow_relation_persistent_cache_size = 1000

        # E-mail service settings
        self.is_email_service_enable = False
        self.smtp_server_host = ''
        self.smtp_server_port = 25
        self.smtp_server_password = ''
        self.smtp_from_addr = ''
        self.smtp_to_addr = ''
        self.smtp_email_header = ''
        self.smtp_send_interval = 3600

        # Redis settings
        self.redis_host = ''
        self.redis_port = 6379
        self.redis_db = 0
        self.redis_password = ''

        # MySQL settings
        self.mysql_host = ''
        self.mysql_username = ''
        self.mysql_password = ''
        self.mysql_database = ''
        self.mysql_charset = 'utf8'

        # Zhihu account settings
        self.is_login_by_cookie = True
        self.z_c0 = ''
        self.login_token = ''
        self.password = ''

        # Initial tokens
        self.init_token = []

        # Load the user-defined configuration
        self.load_config()

        # Module instances (created by start_spider_core)
        self.redis_connection = None
        self.mysql_connection = None
        self.response_buffer = None
        self.account_manager = None
        self.downloader = None
        self.processor = None
        self.schedule = None
        self.dataPersistent = None
        self.email_service = None

    # Start the spider
    def start_spider_core(self):
        """Connect to the data stores, start every module and supervise them.

        Steps, in order: connect to Redis (up to three attempts) and MySQL,
        create the response buffer, log in through the account manager,
        start the downloader, scheduler, data-persistence and processor
        modules, load the initial tokens into the processor, optionally start
        the e-mail service, then loop forever calling ``check_and_restart``
        on the modules every 180 seconds.

        Returns ``None`` early when a database connection cannot be
        established or the login fails. Once the supervision loop has been
        entered it is only left through an exception.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('Spider 开始启动')

        try:
            # Create the Redis connection, allowing a few attempts.
            redis_connect_retry_times = 3
            last_error = None
            while redis_connect_retry_times > 0:
                self.redis_connection = redis.StrictRedis(
                    host=self.redis_host,
                    port=self.redis_port,
                    db=self.redis_db,
                    password=self.redis_password)
                try:
                    ping = self.redis_connection.ping()
                except redis.RedisError as err:
                    # redis-py signals an unreachable server by raising
                    # rather than returning False; count it as a failed
                    # attempt so that the loop really retries.
                    ping = False
                    last_error = err
                if ping is True:
                    if log.isEnabledFor(logging.INFO):
                        log.info('Redis 服务器连接成功')
                    break

                if log.isEnabledFor(logging.INFO):
                    log.info('Redis 服务器连接失败')
                redis_connect_retry_times -= 1
                # Waiting after the final attempt would only delay the exit.
                if redis_connect_retry_times > 0:
                    time.sleep(5)

            # Give up when every attempt failed; keep the last Redis error
            # as the cause so it shows up in the logged traceback.
            if redis_connect_retry_times <= 0:
                raise Exception() from last_error

            # Create the MySQL connection
            self.mysql_connection = pymysql.connect(host=self.mysql_host,
                                                    user=self.mysql_username,
                                                    passwd=self.mysql_password,
                                                    db=self.mysql_database,
                                                    charset=self.mysql_charset)

        except Exception as e:
            # NOTE: this handler also covers a failed MySQL connection,
            # although the logged message only mentions Redis.
            if log.isEnabledFor(logging.ERROR):
                log.error('Redis 启动失败')
                log.exception(e)
            return

        # Create the response buffer queue
        self.response_buffer = ResponseBuffer()

        # Start the account manager and log in
        self.account_manager = AccountManager(self.login_token, self.password,
                                              self.is_login_by_cookie,
                                              self.z_c0)
        is_login = self.account_manager.login()
        if not is_login:
            return

        # Start the Downloader
        self.downloader = Downloader(
            self.redis_connection, self.response_buffer, self.account_manager,
            self.is_proxy_service_enable, self.session_pool_size,
            self.download_thread_num, self.network_retry_times,
            self.connect_timeout, self.download_interval)
        self.downloader.start_downloader()

        # Start the Scheduler
        self.schedule = Scheduler(self.redis_connection, self.url_rate)
        self.schedule.start()

        # Start DataPersistent
        self.dataPersistent = DataPersistent(
            self.persistent_cache_size,
            self.follow_relation_persistent_cache_size, self.mysql_connection,
            self.redis_connection)
        self.dataPersistent.start_data_persistent()

        # Start the Processor
        self.processor = Processor(self.process_thread_num,
                                   self.is_parser_following_list,
                                   self.is_parser_follower_list,
                                   self.is_parser_follow_relation,
                                   self.redis_connection, self.response_buffer)
        self.processor.start_processor()
        self.processor.load_init_data(self.init_token)

        # Start the e-mail service
        if self.is_email_service_enable is True:
            self.email_service = EmailService(
                self.smtp_server_host, self.smtp_server_port,
                self.smtp_server_password, self.smtp_from_addr,
                self.smtp_to_addr, self.smtp_email_header,
                self.smtp_send_interval, self.dataPersistent)
            self.email_service.start_email_service()
            self.email_service.send_message('Spider 启动完毕')

        if log.isEnabledFor(logging.INFO):
            log.info('Spider 启动完毕')

        # Module health checks
        while True:
            # Downloader health check
            self.downloader.check_and_restart()
            # EmailService health check
            if self.is_email_service_enable is True:
                self.email_service.check_and_restart()
            # DataPersistent health check
            self.dataPersistent.check_and_restart()
            # Scheduler: no health check is performed here
            # Processor health check
            self.processor.check_and_restart()
            # Interval between checks
            time.sleep(180)
            gc.collect()

    # Load the user-defined configuration
    def load_config(self):
        """Overwrite the defaults with values from the configuration file.

        Reads the ``spider_core`` section of
        ``Core/Config/SpiderCoreConfig.conf`` (path relative to the current
        working directory). Every option is looked up without a fallback, so
        a missing file, section or option raises a ``configparser`` error,
        and a non-integer value for a numeric option raises ``ValueError``.
        The comma-separated ``initToken`` entries are stripped and appended
        to ``self.init_token``.
        """
        section = "spider_core"
        config = configparser.ConfigParser()
        config.read("Core/Config/SpiderCoreConfig.conf", encoding="utf8")

        def text(option):
            return config.get(section, option)

        def number(option):
            return config.getint(section, option)

        def flag(option):
            # Flags are stored as integers; only the value 1 enables them.
            return number(option) == 1

        # Downloader module settings
        self.is_proxy_service_enable = flag('isProxyServiceEnable')
        self.session_pool_size = number('sessionPoolSize')
        self.download_thread_num = number('downloadThreadNum')
        self.network_retry_times = number('networkRetryTimes')
        self.connect_timeout = number('connectTimeout')
        self.download_interval = number('downloadInterval')

        # Processor module settings
        self.process_thread_num = number('processThreadNum')
        self.is_parser_following_list = flag('isParserFollowingList')
        self.is_parser_follower_list = flag('isParserFollowerList')
        self.is_parser_follow_relation = flag('isParserFollowRelation')

        # Scheduler module settings
        self.url_rate = number('urlRate')

        # DataPersistent module settings
        self.persistent_cache_size = number('persistentCacheSize')
        self.follow_relation_persistent_cache_size = number(
            'followRelationPersistentCacheSize')

        # E-mail service settings
        self.is_email_service_enable = flag('isEmailServiceEnable')
        self.smtp_server_host = text('smtpServerHost')
        self.smtp_server_port = number('smtpServerPort')
        self.smtp_server_password = text('smtpServerPassword')
        self.smtp_from_addr = text('smtpFromAddr')
        self.smtp_to_addr = text('smtpToAddr')
        self.smtp_email_header = text('smtpEmailHeader')
        self.smtp_send_interval = number('smtpSendInterval')

        # Redis settings
        self.redis_host = text('redisHost')
        self.redis_port = number('redisPort')
        self.redis_db = number('redisDB')
        self.redis_password = text('redisPassword')

        # MySQL settings
        self.mysql_host = text('mysqlHost')
        self.mysql_username = text('mysqlUsername')
        self.mysql_password = text('mysqlPassword')
        self.mysql_database = text('mysqlDatabase')
        self.mysql_charset = text('mysqlCharset')

        # Zhihu account settings
        self.is_login_by_cookie = flag('isLoginByCookie')
        self.z_c0 = text('z_c0')
        self.login_token = text('loginToken')
        self.password = text('password')

        # Initial tokens
        self.init_token.extend(
            token.strip() for token in text('initToken').split(','))

        if log.isEnabledFor(logging.INFO):
            log.info('配置文件读取并配置完毕')
示例#8
0
    def ReplaceProcessor(self, tasks):
        '''Rebuild the schedule so that one processor holds exactly *tasks*.

        The schedule is emptied, a first processor receives one
        ``ScheduleVertex`` (using ``versions[0]``) per entry of *tasks*, and
        every other vertex of the previous schedule is re-inserted in
        ``OrderedVertices()`` order: preferably at its old index on a new
        processor that stands in for its old one, otherwise somewhere on a
        shared spare processor.
        Used for crossover in genetic algorithm.
        '''
        # Keep the previous placement; self.vertices is rebuilt from scratch.
        oldverts = self.vertices
        ordered = self.program.OrderedVertices()
        self.processors = []
        self.emptyprocessors = []
        self.vertices = {}
        self.currentVersions = {}
        # Processor that receives the given tasks.
        p = self._getProcessor()
        self.vertices[p.number] = []
        # NOTE(review): `backup` is only referenced by the commented-out code
        # below; nothing in the live code reads it.
        backup = [[v for v in self.program.vertices],
                  [e for e in self.program.edges]]
        self.program._buildData()
        #self.program.vertices = []
        #self.program.edges = []
        for t in tasks:
            s = ScheduleVertex(t.v, t.v.versions[0], p)
            self.vertices[p.number].append(s)
            self.currentVersions[t.v.number] = [s]
        # Vertices that are not part of `tasks` start with no scheduled version.
        for v in self.program.vertices:
            if not v.number in self.currentVersions:
                self.currentVersions[v.number] = []
        #    self.program.vertices.append(t.v)
        self.Consistency()
        #for e in backup[1]:
        #    if e.source in self.program.vertices and e.destination in self.program.vertices:
        #        self.program.edges.append(e)

        # Fictional processor -1 holds the vertex that is being placed.
        fict = Processor(-1)
        # Fallback processor for vertices that cannot return to their old slot.
        spare = self._getProcessor()
        self.vertices[-1] = []
        self.vertices[spare.number] = []
        # Maps an old processor (v.m) to the new processor replacing it.
        newprocs = {}
        allverts = []
        for m in oldverts.keys():
            allverts += oldverts[m]
        for vp in ordered:
            for v in [t for t in allverts if t.v == vp]:
                # Skip vertices that were already placed as part of `tasks`.
                if [t for t in tasks if t.v == v.v] == []:
                    if v.m in newprocs:
                        p = newprocs[v.m]
                    else:
                        p = self._getProcessor()
                        self.vertices[p.number] = []
                        newprocs[v.m] = p
                    # NOTE(review): oldverts is indexed by v.m here, while
                    # this method fills self.vertices with p.number keys --
                    # confirm that both forms address the same entry.
                    i = oldverts[v.m].index(v)
                    #self.program.vertices.append(v.v)
                    #self.program.edges = []
                    #for e in backup[1]:
                    #    if e.source in self.program.vertices and e.destination in self.program.vertices:
                    #        self.program.edges.append(e)
                    #self.program._buildData()
                    s = ScheduleVertex(v.v, v.v.versions[0], fict)
                    self.currentVersions[v.v.number] = [s]
                    self.vertices[-1] = [s]
                    self._succCache = {}
                    # First choice: the same index as in the old schedule.
                    if self.TryMoveVertex(s, 0, p, i) == True:
                        #print ("Applying operation 1", str(s), 0, p, i)
                        self.MoveVertex(s, 0, p, i)
                    else:
                        if len(self.vertices[spare.number]) == 0:
                            # An empty spare is used without a TryMoveVertex check.
                            #print ("Applying operation 2", str(s), 0, spare, 0)
                            self.MoveVertex(s, 0, spare, 0)
                        else:
                            # Take the first position on the spare that is accepted.
                            # NOTE(review): if no position is accepted the
                            # loop ends without moving `s`, which then stays
                            # in self.vertices[-1] -- verify this cannot happen.
                            for j in range(
                                    len(self.vertices[spare.number]) + 1):
                                if self.TryMoveVertex(s, 0, spare, j) == True:
                                    #print ("Applying operation 3", str(s), 0, spare, j)
                                    self.MoveVertex(s, 0, spare, j)
                                    break
                    self.emptyprocessors = []
                    #print(self)
                    #print("++++++++++++")
        # NOTE(review): if _delEmptyProc removes entries from self.processors,
        # this loop mutates the list it iterates over and may skip
        # processors -- verify.
        for m in self.processors:
            self._delEmptyProc(m)
        self.Consistency()