def __get_anchors(self):
    """Return the page's anchors as absolute URLs.

    When ``self.context`` is falsy the raw anchors are returned as-is;
    otherwise each raw anchor is resolved against the context URL.  The
    result is cached on the instance so repeated calls do not re-resolve.
    """
    # BUG FIX: the original guard ``hasattr(self, '__anchors')`` could
    # never succeed inside a class — the attribute assigned below is
    # name-mangled to ``_ClassName__anchors`` while the literal string is
    # not — so the cache was dead and every call recomputed the list.
    # EAFP on the (consistently mangled) attribute fixes the caching.
    try:
        return self.__anchors
    except AttributeError:
        pass
    if not self.context:
        # No context to resolve against: expose the raw list directly.
        self.__anchors = self.raw_anchors
        return self.__anchors
    resolved = list()
    for anchor in self.raw_anchors:
        # Already absolute — keep unchanged.
        if anchor.startswith('http://') or anchor.startswith('https://'):
            resolved.append(anchor)
            continue
        if '../' in anchor:
            # TODO Process relative anchor and continue
            continue
        uri_scheme, authority, port, path = normalize_url(self.context)
        if anchor.startswith('/'):
            # Host-relative anchor: scheme + authority + anchor.
            resolved.append("{0}{1}{2}".format(uri_scheme, authority, anchor))
            continue
        # Document-relative anchor: resolve against the context URL's
        # directory (the context path minus its final segment).
        pieces = path[1:].split('/')
        pieces.pop()  # drop the document name, keep its directory
        if pieces:
            # BUG FIX: the original only appended '/' when more than one
            # segment remained, so a single-directory context produced
            # fused URLs like 'http://host/dirpage.html'.  A trailing
            # separator is needed whenever any directory path remains.
            path = '/'.join(pieces) + '/'
        else:
            path = ''
        resolved.append(
            "{0}{1}/{2}{3}".format(uri_scheme, authority, path, anchor))
    self.__anchors = resolved
    return self.__anchors
def __get_anchors(self):
    """Return the page's anchors as absolute URLs, resolving relative
    ones against ``self.context`` when a context is set.

    The computed list is cached on the instance.

    NOTE(review): this file appears to define ``__get_anchors`` twice
    with identical bodies; the later definition wins at class-creation
    time — consolidate to a single definition.
    """
    # BUG FIX: ``hasattr(self, '__anchors')`` never matched — the
    # attribute assigned below is name-mangled (``_ClassName__anchors``)
    # but the literal string is not — so the cache never took effect.
    # An EAFP read of the mangled attribute restores the caching.
    try:
        return self.__anchors
    except AttributeError:
        pass
    if not self.context:
        self.__anchors = self.raw_anchors
        return self.__anchors
    absolute = []
    for anchor in self.raw_anchors:
        if anchor.startswith('http://') or anchor.startswith('https://'):
            # Already absolute — keep unchanged.
            absolute.append(anchor)
            continue
        if '../' in anchor:
            # TODO Process relative anchor and continue
            continue
        uri_scheme, authority, port, path = normalize_url(self.context)
        if anchor.startswith('/'):
            # Host-relative: prepend scheme and authority only.
            absolute.append("{0}{1}{2}".format(uri_scheme, authority, anchor))
        else:
            # Document-relative: resolve against the directory of the
            # context path (context path minus its final segment).
            pieces = path[1:].split('/')
            pieces.pop()  # discard the document name
            if pieces:
                # BUG FIX: '/' must be appended for ANY non-empty
                # directory path; the original skipped it when exactly
                # one segment remained, fusing directory and anchor.
                path = '/'.join(pieces) + '/'
            else:
                path = ''
            absolute.append(
                "{0}{1}/{2}{3}".format(uri_scheme, authority, path, anchor))
    self.__anchors = absolute
    return self.__anchors
def can_access(url):
    """Return whether the robots policy of *url*'s host permits fetching
    its path, as reported by ``Robot.can_access``."""
    from crawler.connection import normalize_url

    _, authority, _, path = normalize_url(url)
    return Robot(authority).can_access(path)