# Module-level imports assumed by the functions below. `URLNode`, `URLTree`,
# `MAX_TIMEOUT_TIME` and `MAX_TIMEOUTS` are app-specific; their import paths
# here are assumptions.
import logging
import urllib2
import urlparse

from BeautifulSoup import BeautifulSoup
from django.utils.html import escape

from models import URLNode, URLTree                    # assumption: app models
from settings import MAX_TIMEOUT_TIME, MAX_TIMEOUTS   # assumption: app settings

logger = logging.getLogger(__name__)


def set_parents_for_nodes(urltree):
    """Find parents for checked orphan nodes by walking up the URL path."""
    for node in URLNode.objects.filter(urltree=urltree, checked=True, parent=None):
        candidate = node.url
        logger.debug('orphan-node: %s' % node.url)
        while True:
            # strip the last path segment off the candidate; possible caveat
            candidate = candidate[0:candidate.rstrip('/').rfind('/') + 1]
            logger.debug('ParentCandidate: ' + candidate)
            if candidate.lower() == 'http://':
                break
            try:
                parent = URLNode.objects.get(url=candidate, urltree=urltree)
            except URLNode.DoesNotExist:
                continue
            logger.info("found parent: %s for %s" % (parent, node.url))
            node.parent = parent
            node.save()
            break
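# Illustration (not part of the app): how the candidate-shrinking loop above
# walks up a URL path. Pure string manipulation, runnable without Django.
# The helper name `parent_candidates` is hypothetical.
def parent_candidates(url):
    """Yield successively shorter parent-candidate URLs for `url`."""
    candidate = url
    while True:
        candidate = candidate[0:candidate.rstrip('/').rfind('/') + 1]
        if candidate.lower() == 'http://':
            return
        yield candidate

# >>> list(parent_candidates('http://example.com/a/b/c/'))
# ['http://example.com/a/b/', 'http://example.com/a/', 'http://example.com/']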
def process_nodes(request, domain):
    """Crawl unchecked links for `domain`; progress is shown via simple AJAX polling."""
    # timeouts
    timeouts = 0
    logger.debug('timeouts: %s' % timeouts)
    # urltree
    try:
        urltree = URLTree.objects.get(domain=domain)  # one tree per domain
    except URLTree.DoesNotExist:
        logger.error('Missing URLTree. Try /urltree/build first.')
        return {'value': 'Missing URLTree. Try /urltree/build first or go to admin.'}
    # root
    root_url = 'http://%s' % domain
    root_url = root_url.rstrip('/') + '/'
    rootNode = URLNode.objects.get_or_create(urltree=urltree, url=root_url)[0]
    logger.debug("rootNode: %s" % rootNode)
    # links
    unchecked_links = list(URLNode.objects.filter(urltree=urltree, checked=False).values_list('url', flat=True))
    checked_links = URLNode.objects.filter(urltree=urltree, checked=True).values_list('url', flat=True)
    # crawl
    crawled_links = []  # links fetched in this run; must persist across iterations
    for link in unchecked_links:
        logger.info('processing: %s' % link)
        crawled_links.append(link)
        try:
            url = urllib2.urlopen(link, timeout=MAX_TIMEOUT_TIME)
        except urllib2.URLError as e:
            timeouts += 1
            logger.warning("%s: %s" % (str(timeouts), escape(str(e))))
            if timeouts > MAX_TIMEOUTS:
                logger.error("BREAK because of too many errors: try the remaining links later")
                break
            continue
        # check for media (no download - only inspect the Content-Type)
        urlinfo = url.info()
        if urlinfo.type != 'text/html':
            logger.info("media: %s" % link)
            continue
        # parse for links
        src = url.read()
        bs = BeautifulSoup(src)
        for a_elem in bs.findAll('a', {'href': True}):
            absUrl = urlparse.urljoin(link, a_elem['href'])
            parsedUrl = urlparse.urlparse(absUrl)
            logger.debug('href: %s | absURL: %s | parsedUrl: %s' % (a_elem['href'], absUrl, parsedUrl))
            hostUrl = get_host(absUrl)
            absUrl = urlparse.urlunparse((parsedUrl.scheme, hostUrl, parsedUrl.path,
                                          parsedUrl.params, parsedUrl.query, parsedUrl.fragment))
            logger.debug('absURL: %s | parsedUrl: %s' % (absUrl, parsedUrl))
            # http only for crawling, and the URL must be internal
            # (same domain or a subdomain of it)
            if parsedUrl.scheme == 'http' and (
                    parsedUrl.netloc == domain or
                    parsedUrl.netloc.endswith('.' + domain)):
                if absUrl not in crawled_links and (
                        absUrl not in unchecked_links) and (
                        absUrl not in checked_links):
                    unchecked_links.append(absUrl)  # link is new: queue it for crawling
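# Illustration (not part of the app): the link filter from process_nodes,
# extracted as a standalone predicate. `is_internal_http_link` is a
# hypothetical name; it shows the intended precedence: http AND
# (same domain OR subdomain). Uses the module-level urlparse import.
def is_internal_http_link(absUrl, domain):
    parsedUrl = urlparse.urlparse(absUrl)
    return parsedUrl.scheme == 'http' and (
        parsedUrl.netloc == domain or
        parsedUrl.netloc.endswith('.' + domain))

# >>> is_internal_http_link('http://blog.example.com/post/', 'example.com')
# True
# >>> is_internal_http_link('http://other.org/', 'example.com')
# False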
def get_host(url):
    """Return the network location (host) part of `url`."""
    parsedUrl = urlparse.urlparse(url)
    hostUrl = parsedUrl.netloc
    logger.debug('hostUrl: %s' % hostUrl)
    return hostUrl