def isCrawlable(url):
    # Use a naive method to get the root for the given url.
    url = CheckUrl.validifyUrl(url)
    strs = url.split('/')
    if len(strs) > 2:
        url = strs[0] + "//" + strs[2]
    robotUrl = url + "/robots.txt"
    if robotHash.get(robotUrl) is None:
        # robots.txt for this host has not been fetched yet.
        rerp = RobotExclusionRulesParser.RobotExclusionRulesParser()
        try:
            rerp.fetch(robotUrl, 3)
        except urllib2.URLError:
            return False
        robotHash[robotUrl] = rerp  # cache the parser so robots.txt is fetched only once per host
    else:
        rerp = robotHash[robotUrl]
    return rerp.is_allowed("*", url)
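# For comparison, a minimal sketch of the same robots.txt check built only on the
# Python 2 standard library (urlparse + robotparser). The cache dict below is a
# local stand-in for robotHash, and is_allowed_stdlib is an illustrative name,
# not part of the original code.
import urlparse
import robotparser

_robot_cache = {}

def is_allowed_stdlib(url, agent="*"):
    parts = urlparse.urlparse(url)
    robot_url = parts.scheme + "://" + parts.netloc + "/robots.txt"
    parser = _robot_cache.get(robot_url)
    if parser is None:
        parser = robotparser.RobotFileParser()
        parser.set_url(robot_url)
        parser.read()                  # fetch and parse robots.txt
        _robot_cache[robot_url] = parser
    return parser.can_fetch(agent, url)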
def Queue_Check_Push_Front(page):
    """Check the url and push it into the queue.
    If the check is not needed, push it into the queue directly.

    :param page: {
        url: the url of the page,
        depth: the depth of the page, i.e. its minimum distance from one of
               the 10 start pages
    }
    """
    global number_collected_url
    global pagesNumber
    # Stop enqueueing once enough candidate pages have been gathered.
    if len(queue) + number_collected_url > pagesNumber * 1.5:
        return
    href = page["url"]
    global hash_table
    global number_visited_url
    href = CheckUrl.checkUrl(href)
    if href != -1:
        if CheckSite.checkSite_Visitable(href) == 1:
            if not hash_table.has_key(href):
                print "queue push front: " + str(len(queue)) + " " + href
                queue.append(page)
                hash_table[href] = number_visited_url
                number_visited_url += 1
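# Hypothetical seeding loop (start_pages is an assumed list of seed URLs, not a
# name from the project): each of the 10 start pages enters the queue at depth 0,
# matching the page structure described in the docstring above.
for start_url in start_pages:
    Queue_Check_Push_Front({"url": start_url, "depth": 0})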
def main(argv):
    try:
        print "Loading PKGs list ..."
        if argv[1] == "-Syu":
            # List the packages a full upgrade would download (URLs only).
            p = subprocess.Popen(["pacman -Syup | tac | head -n -5"], shell=True,
                                 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        elif argv[1] == "-S" and len(argv) > 2:
            # Print the download URLs for the requested package.
            p = subprocess.Popen(["pacman -Sp " + argv[2]], shell=True,
                                 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        elif len(argv) > 2:
            # Any other operation with a target: hand it straight to pacman.
            p = subprocess.Popen(["pacman " + argv[1] + " " + argv[2]], shell=True,
                                 stderr=subprocess.STDOUT)
            exit()
        else:
            p = subprocess.Popen(["pacman " + argv[1]], shell=True,
                                 stderr=subprocess.STDOUT)
            exit()

        for line in p.stdout.readlines():
            if CheckUrl.checkURL(line) == 0:
                return

        print "You must download : " + convert_bytes(CheckUrl.DownloadSize[0])
        if YesNoQ.query_yes_no("Proceed with installation? "):
            # Download in parallel, then wait for all worker threads to finish.
            Runner(CheckUrl.DownloadList)
            while threading.activeCount() > 1:
                time.sleep(1)
        else:
            print "BYE BYE :D"
            exit(1)

        # Let pacman install from the already-downloaded packages.
        if len(sys.argv) == 2:
            pacman = subprocess.Popen(["pacman " + argv[1]], shell=True,
                                      stdout=subprocess.PIPE, stdin=subprocess.PIPE)
        elif len(sys.argv) > 2:
            pacman = subprocess.Popen(["pacman -S " + argv[2]], shell=True,
                                      stdout=subprocess.PIPE, stdin=subprocess.PIPE)
        else:
            print bcolors.FAIL + "invalid args!" + bcolors.ENDC
            exit(1)
        stdout_data = pacman.communicate(input='')[0]
        print bcolors.OKBLUE + "FIN! ;) (PA|< Man)" + bcolors.ENDC
    except ValueError:
        print bcolors.FAIL + "What happened? Report it ([email protected])" + bcolors.ENDC
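# convert_bytes is referenced above but not defined in this snippet; a plausible
# sketch (an assumption, not the project's actual helper) that renders a byte
# count as a human-readable string:
def convert_bytes(n):
    n = float(n)
    for unit in ["B", "KiB", "MiB", "GiB", "TiB"]:
        if n < 1024.0:
            return "%.2f %s" % (n, unit)
        n /= 1024.0
    return "%.2f PiB" % n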
def processUrl(self, href):
    """
    :param href: current url to be processed.

    If this href is not yet in dict, push it in with the current score;
    otherwise it is already in dict, so accumulate the current score onto
    its existing queue entry.
    """
    href = urlparse.urljoin(self.baseUrl, href)
    href = CheckUrl.validifyUrl(href)
    if href != -1:
        if dict.get(href) is not None:
            # Already seen: add this page's score to the queued entry.
            # (Previously: heapq.heappush(queue, [self.score, href]))
            if queue.get(href) is not None:
                queue[href] = queue[href] + self.score
        else:
            # First time this url is seen: record it in both queue and dict.
            queue[href] = self.score
            dict[href] = self.score
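# A minimal sketch of how the next url could be popped from the dict-based queue,
# assuming the convention used below that more negative scores mean higher
# priority (pop_best is an illustrative helper, not part of the original code).
def pop_best(queue):
    best_url = min(queue, key=queue.get)   # smallest stored score = highest priority
    return best_url, queue.pop(best_url)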
def computePriorityScore(content, keywords):  # name assumed; the def line is missing from this fragment
    """
    This is used to compute the priority score of a page: a naive method that
    counts the number of keyword occurrences in the page content.
    """
    content = content.lower()
    content = content.split()
    priorityScore = 0
    for keyword in keywords:
        for word in content:
            if keyword == word:
                priorityScore += 1
    return priorityScore


queryUrl = "https://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=" + urllib.quote(query)

# Seed the frontier with the top 10 Google results (8 + 2).
response1 = urllib2.urlopen(queryUrl + "&rsz=8")
for result in json.load(response1)['responseData']['results']:
    url = CheckUrl.validifyUrl(result['unescapedUrl'])
    # A score of -1000 marks these seed urls: heapq in Python is a min-heap,
    # so storing negative numbers makes it behave like a max-heap.
    queue[url] = -1000
    dict[url] = -1000
response1.close()

response2 = urllib2.urlopen(queryUrl + "&rsz=2&start=8")
for result in json.load(response2)['responseData']['results']:
    url = CheckUrl.validifyUrl(result['unescapedUrl'])
    # (Previously: heapq.heappush(queue, [-1000, url]))
    queue[url] = -1000
    dict[url] = -1000
response2.close()
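# A small illustrative check of the keyword counter above (computePriorityScore is
# the name assumed in the reconstruction): only exact, case-insensitive word
# matches are counted.
if __name__ == "__main__":
    sample = "Cheap cheap flights to New York"
    print computePriorityScore(sample, ["cheap", "hotel"])   # prints 2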