import logging

import url_normalization


def check(self, url):
    self.rejectreason = None
    # Normalize the URL.
    norm_url = url_normalization.get_canonical_url(url)
    # Coerce the normalized URL to a UTF-8 byte string and verify that
    # it decodes cleanly; otherwise reject it.
    if isinstance(norm_url, unicode):
        norm_url = norm_url.encode('utf8')
    try:
        norm_url.decode('utf8')
    except UnicodeDecodeError:
        logging.warning('cannot decode norm_url: %s' % norm_url)
        self.rejectreason = 'failtodecode'
        return False
    # Apply the URL-length rule.
    if not self.urllengthrule(url):
        self.rejectreason = 'urllengthrule'
        return False
    # Apply the blacklist rule.
    if not self.blacklistrule(url):
        self.rejectreason = 'blacklistrule'
        return False
    # All rules passed.
    return True
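# The rule hooks that check() consults -- urllengthrule() and
# blacklistrule() -- are not shown in this snippet, and neither is the
# enclosing filter class. The UrlFilter class below is a minimal,
# hypothetical sketch of what they might look like; the class name,
# the 2048-character limit, and the substring blacklist are assumptions,
# not the project's actual code.
class UrlFilter(object):

    MAX_URL_LENGTH = 2048  # assumed limit

    def __init__(self, blacklist=()):
        self.blacklist = tuple(blacklist)
        self.rejectreason = None

    def urllengthrule(self, url):
        # Accept the URL only if it is not unreasonably long.
        return len(url) <= self.MAX_URL_LENGTH

    def blacklistrule(self, url):
        # Accept the URL only if it contains no blacklisted substring.
        return not any(bad in url for bad in self.blacklist)

# Attach the check() method defined above and exercise the filter.
UrlFilter.check = check

f = UrlFilter(blacklist=('logout',))
print f.check('http://www.cse.psu.edu/')                     # True
print f.check('http://example.com/logout'), f.rejectreason   # False blacklistrule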
import hashlib
import logging
import urlparse
from datetime import datetime

import url_normalization

# BadResourceError is the project's own exception, defined elsewhere.


def __init__(self, code, parent_url, url, is_seed, hop, batch=0, anchor_text=''):
    self.code = code
    # Normalize and validate the parent URL, if one was given.
    if parent_url is not None:
        norm_parent_url = url_normalization.get_canonical_url(parent_url)
        if norm_parent_url == '':
            logging.warning('parent_url=%s' % parent_url)
            raise BadResourceError('norm_parent_url is an empty string: invalid parent URL')
        if isinstance(norm_parent_url, unicode):
            norm_parent_url = norm_parent_url.encode('utf8')
        try:
            norm_parent_url.decode('utf8')
        except UnicodeDecodeError:
            logging.warning('resource.py >> cannot decode norm_parent_url: %s' % norm_parent_url)
        self.parent_md5 = hashlib.md5(norm_parent_url).hexdigest()
    else:
        norm_parent_url = None
        self.parent_md5 = None
    # Normalize and validate the resource URL itself.
    norm_url = url_normalization.get_canonical_url(url)
    if norm_url == '':
        raise BadResourceError('Invalid URL')
    if isinstance(norm_url, unicode):
        norm_url = norm_url.encode('utf8')
    try:
        norm_url.decode('utf8')
    except UnicodeDecodeError:
        logging.warning('resource.py >> cannot decode norm_url: %s' % norm_url)
    self.parent_url = norm_parent_url
    self.url = norm_url
    self.is_seed = is_seed
    self.hop = hop
    self.batch = batch
    # Split the normalized URL into its components and separate an
    # explicit port from the host, if one is present.
    self.scheme, self.host, self.path, self.query, fragment = urlparse.urlsplit(norm_url)
    if self.host.find(':') != -1:
        self.host, self.port = self.host.split(':', 1)
    else:
        self.port = None
    # self.path always starts with '/' since http://www.aaa.com is
    # always normalized to http://www.aaa.com/
    self.segments = self.path.split('/')[1:]
    # Depth examples:
    #   http://www.cse.psu.edu/                    depth 1
    #   http://www.cse.psu.edu/index.php           depth 1
    #   http://www.cse.psu.edu/~shzheng/index.htm  depth 2
    self.depth = len(self.segments)
    self.md5 = hashlib.md5(norm_url).hexdigest()
    # If no code was assigned, fall back to the URL's MD5 as the code.
    if code is None or code == '':
        self.code = self.md5
    self.anchor_text = anchor_text
    self.crawl_date = datetime.now()  # default is the time the record is added to the db
    self.last_modified = None
    self.content_type = None
    self.content_length = -1
    self.charset = None
    self.from_cache = False
    self.html = None
    # Default content hash: SHA-1 of the URL (no content fetched yet).
    self.content_sha1 = hashlib.sha1(self.url).hexdigest()
    self.filtered_by = None
    self.no_fetch = False
    self.ext = '.pdf'  # by default, the extension is .pdf
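# A short usage sketch for the constructor above. The class name
# Resource is assumed from the file name (resource.py) and the argument
# values are illustrative; url_normalization must be importable for
# this to run.
r = Resource(code=None,
             parent_url='http://www.cse.psu.edu/',
             url='http://www.cse.psu.edu/~shzheng/index.htm',
             is_seed=False,
             hop=1)
print r.host    # www.cse.psu.edu
print r.depth   # 2 (two path segments: ~shzheng, index.htm)
print r.code    # falls back to the URL's MD5 because code was None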