示例#1
0
 def from_seed(cls, seed):
     """Build a ``cls`` instance from a seed.

     The ``name`` is ``get_domain(seed.url)``; ``priority`` and
     ``crawl_interval`` are copied from the seed, and ``total_url_num``
     is initialised to 1.

     NOTE(review): the first parameter is ``cls``, so this is presumably
     decorated with ``@classmethod`` outside this view -- confirm.
     """
     return cls(
         name=get_domain(seed.url),
         priority=seed.priority,
         crawl_interval=seed.crawl_interval,
         total_url_num=1,
     )
示例#2
0
 def add_seed(self, seed):
     """Account for ``seed`` in the per-host table ``self.hosts``.

     The host key is ``get_domain(seed.url)``. If no (truthy) record is
     stored under that key, a new one is created with
     ``HostInfo.from_seed(seed)`` and the addition is logged; otherwise
     the existing record's ``total_url_num`` is incremented.
     """
     host_key = get_domain(seed.url)
     known = self.hosts.get(host_key)
     if not known:
         # First seed seen for this host: create its record and log it.
         self.hosts[host_key] = HostInfo.from_seed(seed)
         log.msg('add new host:%s' % host_key)
         return
     known.total_url_num += 1
示例#3
0
 def add_seed(self, seed):
     """Record one more URL for the host that ``seed`` belongs to.

     Looks up ``get_domain(seed.url)`` in ``self.hosts``. A record that
     is already present (and truthy) gets its ``total_url_num`` bumped by
     one; otherwise a fresh record is built via ``HostInfo.from_seed``,
     stored under the host name, and a log message is emitted.
     """
     name = get_domain(seed.url)
     record = self.hosts.get(name)
     if record:
         # Host already tracked: only the URL counter changes.
         record.total_url_num += 1
         return
     self.hosts[name] = HostInfo.from_seed(seed)
     log.msg('add new host:%s' % name)
示例#4
0
    def get_latency_time(self, url):
        """Return how many seconds to wait before crawling ``url``'s host.

        The host record is looked up in ``self.hosts`` under
        ``get_domain(url)``. The time since the record's
        ``last_crawl_time`` is compared with its ``crawl_interval``.

        Returns:
            0 when the host may be crawled now (it has never been crawled,
            or at least ``crawl_interval`` seconds have passed); in that
            case ``last_crawl_time`` is set to the current time.
            Otherwise the remaining wait in seconds (a positive number),
            and the record is left unchanged.

        Raises:
            Whatever ``self.hosts[...]`` raises for an unknown host
            (``KeyError`` if ``self.hosts`` is a dict).
        """
        domain = get_domain(url)
        hostinfo = self.hosts[domain]
        # Debug output, written in a form that parses on Python 2 and 3.
        print(hostinfo)

        # Read the clock once so the decision and the recorded crawl time
        # are based on the same instant.
        now = time.time()

        if hostinfo.last_crawl_time is None:
            hostinfo.last_crawl_time = now
            return 0

        # Seconds elapsed beyond the crawl interval; negative while the
        # host still has to wait.
        left_time = now - hostinfo.last_crawl_time - hostinfo.crawl_interval

        # Two spaces reproduce the output of the old ``print "left: ", x``.
        print("left:  %s" % left_time)

        # ``>=`` so that the exact boundary also records the crawl time,
        # like every other path that answers "no wait".
        if left_time >= 0:
            hostinfo.last_crawl_time = now
            return 0
        return -left_time
示例#5
0
    def get_latency_time(self, url):
        """Return the delay, in seconds, before ``url``'s host may be crawled.

        Uses the record stored in ``self.hosts`` under ``get_domain(url)``
        and its ``last_crawl_time`` / ``crawl_interval`` attributes.

        Returns:
            0 if the host has no ``last_crawl_time`` yet or its
            ``crawl_interval`` has fully elapsed; ``last_crawl_time`` is
            then updated to the current time. Otherwise the number of
            seconds still to wait (positive); nothing is updated.

        Raises:
            Whatever ``self.hosts[...]`` raises when the host is missing
            (``KeyError`` if ``self.hosts`` is a dict).
        """
        domain = get_domain(url)
        hostinfo = self.hosts[domain]
        # Debug output; the call form is valid on both Python 2 and 3.
        print(hostinfo)

        # A single clock reading is used for both the comparison and the
        # stored timestamp, so the two cannot drift apart.
        now = time.time()

        if hostinfo.last_crawl_time is None:
            hostinfo.last_crawl_time = now
            return 0
        else:
            # Time elapsed past the interval; negative means "still waiting".
            left_time = (now - hostinfo.last_crawl_time
                         - hostinfo.crawl_interval)

            # Same text as the former ``print "left: ", left_time``.
            print("left:  %s" % left_time)

            # Include the exact boundary: a zero wait must also record the
            # crawl time, as the other zero-returning paths do.
            if left_time >= 0:
                hostinfo.last_crawl_time = now
                return 0
            else:
                return -left_time
示例#6
0
 def from_seed(cls, seed):
     """Create a ``cls`` instance populated from ``seed``.

     ``name`` comes from ``get_domain(seed.url)``, ``priority`` and
     ``crawl_interval`` are taken from the seed unchanged, and
     ``total_url_num`` starts at 1.

     NOTE(review): receives ``cls``, so presumably a ``@classmethod``
     whose decorator is outside this view -- confirm.
     """
     fields = {
         'name': get_domain(seed.url),
         'priority': seed.priority,
         'crawl_interval': seed.crawl_interval,
         'total_url_num': 1,
     }
     return cls(**fields)