def __init__(self, resultDictionary, saveData): threading.Thread.__init__(self) self.resultDictionary = resultDictionary self.url = resultDictionary['url'] # Find where we expect this data to be cached self.savePath = str(os.getcwd()) self.savePath = self.savePath[:self.savePath.find('EntityQuerier') + len('EntityQuerier')] + '/dmoz/' # Whether or not we should be saving the data to disk self.saveData = saveData # A cache for computing PR self.prCache = PRCache()
class DMOZCrawlerThread(threading.Thread): def __init__(self, resultDictionary, saveData): threading.Thread.__init__(self) self.resultDictionary = resultDictionary self.url = resultDictionary['url'] # Find where we expect this data to be cached self.savePath = str(os.getcwd()) self.savePath = self.savePath[:self.savePath.find('EntityQuerier') + len('EntityQuerier')] + '/dmoz/' # Whether or not we should be saving the data to disk self.saveData = saveData # A cache for computing PR self.prCache = PRCache() def run(self): """ Parse the content of this page, and update the given dictionary for this thread """ try: # Get the content from this page print "Getting page content for '%s'" % self.url.strip() filename = self.__encodeCacheFilename(self.url) if not os.path.exists(filename): try: content = loadFromUrl(self.url) except ValueError: content = None print "Error with URL: " + self.url # Extract the content from this page if content is not None and isHTML(content): self.resultDictionary['content'] = content # Get the information about this url content = content.lower() if self.saveData: try: title, keywords, description = parseMetaDataFromContent(content) pageRank = self.prCache.getPageRank(self.url) headers = parseHeaderInformationFromContent(content) # Get the YQL keywords for this DMOZ document try: yqlKeywordsExtension = YQLKeywordExtension() yqlKeywords = yqlKeywordsExtension.getKeywordsFromContent(content) except Exception: yqlKeywords = [] # Store the extra data self.resultDictionary['keywords'] = keywords self.resultDictionary['headers'] = headers self.resultDictionary['description'] = description self.resultDictionary['yqlKeywords'] = yqlKeywords self.resultDictionary['pageRank'] = pageRank self.resultDictionary['title'] = title # Save the result file dump(self.resultDictionary, open(filename, 'w')) except UnicodeDecodeError: print "Failed to save DMOZ document: " + self.url except URLError: print("Error accessing '%s', %s" % (self.url.strip(), str(sys.exc_info()[1]).strip())) def __encodeCacheFilename(self, url): """ Encode the URL to a filename to be stored in the cache """ try: hashedUrl = hashlib.sha256(url).hexdigest() except UnicodeDecodeError: hashedUrl = hashlib.sha256(url.decode(errors='ignore')).hexdigest() filename = self.savePath + hashedUrl return filename