def get_url_timestamp(self, url):
    """
    Look up the timestamp of the last visit of <url>.

    The visit record of the url's site is fetched from
    self._visited_sites (keyed by get_netlock(url)); the value stored
    for <url> in that record's visited_urls mapping is returned.

    Returns -1 when the site has no (or a falsy) record, or when <url>
    is not present in the record.
    """
    record = self._visited_sites.get(get_netlock(url))
    if record:
        return record.visited_urls.get(url, -1)
    return -1
def is_visited(self, url):
    """
    Check whether visit history contains <url>.

    Returns the value stored for <url> in the visited_urls mapping of
    its site record (presumably the visit timestamp -- confirm against
    SiteVisitInfo.submit_url), or None when the site has no (or a
    falsy) record or <url> is not in it.
    """
    # A single .get() replaces the former has_key() + indexing pair:
    # has_key() no longer exists on Python 3 dicts, and the sibling
    # methods already query self._visited_sites this way.
    site_visit_info = self._visited_sites.get(get_netlock(url))
    if not site_visit_info:
        return None
    return site_visit_info.visited_urls.get(url)
def set_visited(self, url, weight=0, timestamp=None):
    """
    Add <url> to the page visit history.

    url       -- visited page url; its site key is get_netlock(url)
    weight    -- weight passed through to the site record (default 0)
    timestamp -- visit time; when None, the current time.time() is used

    The site's SiteVisitInfo record is created on first use, then the
    visit is registered through its submit_url(url, weight, timestamp).
    """
    # Only a missing timestamp is defaulted: an explicit 0 / 0.0 is a
    # legitimate value and must not be replaced by "now".
    if timestamp is None:
        timestamp = time.time()

    site_url = get_netlock(url)
    if site_url in self._visited_sites:
        site_visit_info = self._visited_sites[site_url]
    else:
        site_visit_info = SiteVisitInfo()
        self._visited_sites[site_url] = site_visit_info

    site_visit_info.submit_url(url, weight, timestamp)
def _max_avg_weight(self, link_queue):
    """
    Return the netloc of the site which has the highest average weight.

    link_queue._queue is iterated as (weight, link) pairs; links are
    grouped by get_netlock(link) and the mean weight of each group is
    compared.  On a tie the group encountered first while iterating the
    totals keeps the lead (strict '>' comparison).

    Returns None when the queue holds no entries.
    """
    # netloc -> (sum of weights, number of links)
    totals = {}
    for weight, link in link_queue._queue:
        netloc = get_netlock(link)
        weight_sum, count = totals.get(netloc, (0, 0))
        totals[netloc] = (weight_sum + weight, count + 1)

    best = None
    max_avg = None
    for netloc, (weight_sum, count) in totals.items():
        # float() forces true division, so integer weights are not
        # truncated; count is always >= 1 here, so no division by zero.
        avg = float(weight_sum) / count
        # Explicit None test instead of ordering a number against None,
        # which raises TypeError on Python 3.  With that gone, the old
        # blanket "except Exception" had nothing legitimate to catch.
        if max_avg is None or avg > max_avg:
            max_avg = avg
            best = netloc
    return best
# _main_proc: one scheduling step of the crawler (summary of the statements
# found on the line below, in the order they appear).
#
# NOTE(review): the original line breaks and indentation of this method are
# lost (the whole body sits on one physical line), so the nesting described
# here is inferred from statement order only -- TODO confirm against an
# intact copy before restructuring.
#
# Main stage (guarded by `self._stage_ttl >= 0`):
#   * self._result_filter.get_result() is queried with a predicate that
#     accepts tasks whose get_netlock(x.user_data) equals self._current_site,
#     or any task while self._current_site == ''.
#   * A truthy result unpacks into (page_url, weight, filtered_links).  If no
#     current site is set yet it becomes get_netlock(page_url); the page is
#     then recorded via self._visit_history.set_visited(page_url, weight).
#   * Each (link, weight) of filtered_links is put as (weight, link) into
#     self._current_site_queue when its netloc equals the current site
#     (skipped when is_visited(link) is truthy), otherwise into
#     self._future_site_queue (skipped when is_visited_site(link) is truthy).
#     The loop variable `weight` rebinds the page weight unpacked above.
#   * While self._page_loader.have_slots() and the current-site queue is not
#     empty, queue items are wrapped in PageLoaderTask and handed to
#     self._page_loader.add_task().
#
# Stage switch (`self._stage_ttl < 0`):
#   * purge_tasks() is called on the page loader, the ranker and the result
#     filter with predicates selecting tasks that do not match
#     self._current_site.
#   * self._max_avg_weight(self._future_site_queue) picks a netloc; when it is
#     truthy it becomes self._current_site and self._current_site_queue._queue
#     is replaced by the result of self._future_site_queue.filter(), called
#     with a predicate that is true for entries whose netloc differs from the
#     pick (what filter() keeps or returns is not visible here).
#   * self._stage_ttl is set to self._site_crawl_quota.
#
# self._continue_work(1) is the last statement.  Nothing in this method
# decreases self._stage_ttl.
#
# NOTE(review): the lone '#' before `if self._stage_ttl < 0:` and before
# `self._continue_work(1)` look like empty separator comments rather than
# commented-out code (the other comments have no space after '#') -- confirm.
# NOTE(review): the page-loader purge predicate compares x.user_data directly
# with self._current_site, while the ranker/result-filter predicates apply
# get_netlock() first -- verify which form PageLoaderTask.user_data needs.
# NOTE(review): `netloc != self._current_site` looks redundant if that test
# really sits in the else-branch of `netloc == self._current_site`.
def _main_proc(self): """ Main procedure: get [(link_url, weight)] from ResultFilter and pass best links to PageLoader """ if self._stage_ttl >= 0: #main stage #fill queues: match_function = lambda x: get_netlock(x.user_data) == self._current_site or self._current_site == '' res = self._result_filter.get_result(match_function) if res: page_url, weight, filtered_links = res if not self._current_site: self._current_site = get_netlock(page_url) self._visit_history.set_visited(page_url, weight) for link, weight in filtered_links: netloc = get_netlock(link) if netloc == self._current_site: if self._visit_history.is_visited(link): continue self._current_site_queue.put((weight,link), True) else: if netloc != self._current_site and self._visit_history.is_visited_site(link): continue self._future_site_queue.put((weight,link), True) #assign PL tasks while self._page_loader.have_slots() and not self._current_site_queue.is_empty(): task = PageLoaderTask(self._current_site_queue.get()) self._page_loader.add_task(task) # if self._stage_ttl < 0: #switching stages self._page_loader.purge_tasks(lambda x: x.user_data != self._current_site) self._ranker.purge_tasks(lambda x: get_netlock(x.user_data) != self._current_site) self._result_filter.purge_tasks(lambda x: get_netlock(x.user_data) != self._current_site) best_nl = self._max_avg_weight(self._future_site_queue) if best_nl: filter_function = lambda x: get_netlock(x[1]) != best_nl self._current_site = best_nl self._current_site_queue._queue = self._future_site_queue.filter(filter_function) self._stage_ttl = self._site_crawl_quota # self._continue_work(1)
def is_visited_site(self, url):
    """
    Check whether the site of <url> has an entry in the visit history.

    Returns the record stored in self._visited_sites under
    get_netlock(url), or None when there is no such entry.
    """
    site_key = get_netlock(url)
    return self._visited_sites.get(site_key)