def isCrawlable(url):
    """Return True when the site's robots.txt lets any agent ("*") fetch url.

    The robots.txt location is derived naively from the first three
    '/'-separated pieces of the validated URL ("scheme:", "", "host").
    Parsers are looked up in, and after a successful fetch stored into,
    the module-level robotHash mapping keyed by the robots.txt URL, so each
    site's robots.txt is downloaded at most once per successful fetch.

    :param url: URL to test; it is normalised with CheckUrl.validifyUrl first.
    :return: True if allowed; False if the URL is rejected by validifyUrl,
             if robots.txt cannot be fetched (urllib2.URLError), or if the
             rules disallow the URL.
    """
    url = CheckUrl.validifyUrl(url)
    if url == -1:
        # processUrl in this file treats -1 from validifyUrl as "unusable
        # URL"; splitting an int below would raise, so refuse it up front.
        return False

    # Naive root extraction: "http://host/a/b".split('/') gives
    # ['http:', '', 'host', 'a', 'b'], so pieces 0 and 2 rebuild the root.
    pieces = url.split('/')
    if len(pieces) > 2:
        root = pieces[0] + "//" + pieces[2]
    else:
        root = url
    robotUrl = root + "/robots.txt"

    rerp = robotHash.get(robotUrl)
    if rerp is None:
        rerp = RobotExclusionRulesParser.RobotExclusionRulesParser()
        try:
            # Second argument is presumably a timeout in seconds -- confirm
            # against the RobotExclusionRulesParser.fetch signature.
            rerp.fetch(robotUrl, 3)
        except urllib2.URLError:
            # Unreachable robots.txt is treated as "do not crawl".
            return False
        # Remember the parsed rules so later URLs on this site reuse them.
        robotHash[robotUrl] = rerp

    # Test the full URL rather than the site root: robots.txt rules are
    # path-specific, so the root being allowed says nothing about this page.
    return bool(rerp.is_allowed("*", url))
def isCrawlable(url):
    """Return True when the site's robots.txt lets any agent ("*") fetch url.

    The robots.txt location is derived naively from the first three
    '/'-separated pieces of the validated URL ("scheme:", "", "host").
    Parsers are looked up in, and after a successful fetch stored into,
    the module-level robotHash mapping keyed by the robots.txt URL, so each
    site's robots.txt is downloaded at most once per successful fetch.

    :param url: URL to test; it is normalised with CheckUrl.validifyUrl first.
    :return: True if allowed; False if the URL is rejected by validifyUrl,
             if robots.txt cannot be fetched (urllib2.URLError), or if the
             rules disallow the URL.
    """
    url = CheckUrl.validifyUrl(url)
    if url == -1:
        # processUrl in this file treats -1 from validifyUrl as "unusable
        # URL"; splitting an int below would raise, so refuse it up front.
        return False

    # Naive root extraction: "http://host/a/b".split('/') gives
    # ['http:', '', 'host', 'a', 'b'], so pieces 0 and 2 rebuild the root.
    pieces = url.split('/')
    if len(pieces) > 2:
        root = pieces[0] + "//" + pieces[2]
    else:
        root = url
    robotUrl = root + "/robots.txt"

    rerp = robotHash.get(robotUrl)
    if rerp is None:
        rerp = RobotExclusionRulesParser.RobotExclusionRulesParser()
        try:
            # Second argument is presumably a timeout in seconds -- confirm
            # against the RobotExclusionRulesParser.fetch signature.
            rerp.fetch(robotUrl, 3)
        except urllib2.URLError:
            # Unreachable robots.txt is treated as "do not crawl".
            return False
        # Remember the parsed rules so later URLs on this site reuse them.
        robotHash[robotUrl] = rerp

    # Test the full URL rather than the site root: robots.txt rules are
    # path-specific, so the root being allowed says nothing about this page.
    return bool(rerp.is_allowed("*", url))
def processUrl(self, href):
    """
    Resolve a link against self.baseUrl and record it for crawling.

    :param href: link target to process; it is joined onto self.baseUrl
                 and then normalised with CheckUrl.validifyUrl.

    A link that validifyUrl rejects (returns -1) is ignored.
    A link not yet present in the module-level ``dict`` is entered into
    both ``queue`` and ``dict`` with this page's score.
    A link already present in ``dict`` has this page's score added to its
    ``queue`` entry, but only if it still has an entry in ``queue``.
    """
    target = CheckUrl.validifyUrl(urlparse.urljoin(self.baseUrl, href))
    if target == -1:
        return

    if dict.get(target) is None:
        # First sighting: enqueue it and remember that it has been seen.
        queue[target] = self.score
        dict[target] = self.score
        return

    # Seen before: accumulate the score while it is still waiting in queue.
    if queue.get(target) is not None:
        queue[target] += self.score
this is used to computer priority score of that page, use naive method compute the number of keywords in the content of that page """ content = content.lower() content = content.split() priorityScore = 0 for keyword in keywords: for word in content: if keyword == word: priorityScore += 1 return priorityScore queryUrl = "https://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=" + urllib.quote(query) response1 = urllib2.urlopen(queryUrl + "&rsz=8") for result in json.load(response1)['responseData']['results']: url = CheckUrl.validifyUrl(result['unescapedUrl']) queue[url] = -1000 # put them into heap, [-1000, url] represents the score is -1000, and url is url. dict[url] = -1000 response1.close() response2 = urllib2.urlopen(queryUrl + "&rsz=2&start=8") for result in json.load(response2)['responseData']['results']: url = CheckUrl.validifyUrl(result['unescapedUrl']) #heapq.heappush(queue, [-1000,url]) queue[url] = -1000 """ because heapq in python is small root based, so using negative num can make it big-root-based. """ dict[url] = -1000