def extractKeywords(self, entityDescription):
    """
    Retrieve the set of keywords for the entity description.

    @param entityDescription The text substituted into the API URL template.
    @return A list of the keywords found (possibly empty).
    """
    keywords = set()

    # Build the request URL from the template
    requestUrl = self.apiUrl.replace("####", entityDescription)
    requestUrl = requestUrl.replace(" ", "%20")
    requestUrl = requestUrl.replace("AT&T", "")  # Blows up Yahoo API for some reason...

    # Prefer the cached response; fall back to a live fetch and cache it
    responseJson = self.cache.read(requestUrl)
    if responseJson is None:
        responseJson = loadFromUrl(requestUrl)
        self.cache.write(requestUrl, responseJson)

    parsedResponse = loads(responseJson)
    try:
        resultValues = parsedResponse["query"]["results"].values()
        keywords |= set(resultValues[0])
    except AttributeError:
        # "results" was null in the response -- no keywords for this entity
        pass

    return list(keywords)
def getKeywordsFromContent(self, content):
    """
    Retrieve the Yahoo keywords from some content, by splitting the content
    and joining the results if the content is too long to be sent via a URL.

    @param content The content from which to retrieve the keyword information.
    @return A list of the keywords found across all chunks (possibly empty).
    """
    # Clean the content and break it into URL-sized chunks
    cleanedContent = self.__cleanResults(content)
    chunks = self.__group(cleanedContent, self.contentSize)

    keywords = set()
    for piece in chunks:

        # Build the request URL for this chunk
        chunkUrl = self.apiUrl.replace('####', piece)
        chunkUrl = chunkUrl.replace(' ', '%20')

        # Prefer the cached response; fall back to a live fetch and cache it
        chunkJson = self.cache.read(chunkUrl)
        if chunkJson is None:
            chunkJson = loadFromUrl(chunkUrl)
            self.cache.write(chunkUrl, chunkJson)

        chunkData = loads(chunkJson)
        try:
            resultValues = chunkData['query']['results'].values()
            keywords |= set(resultValues[0])
        except AttributeError:
            # 'results' was null in the response -- no keywords for this chunk
            pass

    return list(keywords)
def run(self): """ Parse the content of this page, and update the given dictionary for this thread """ try: # Get the content from this page print "Getting page content for '%s'" % self.url.strip() filename = self.__encodeCacheFilename(self.url) if not os.path.exists(filename): try: content = loadFromUrl(self.url) except ValueError: content = None print "Error with URL: " + self.url # Extract the content from this page if content is not None and isHTML(content): self.resultDictionary['content'] = content # Get the information about this url content = content.lower() if self.saveData: try: title, keywords, description = parseMetaDataFromContent(content) pageRank = self.prCache.getPageRank(self.url) headers = parseHeaderInformationFromContent(content) # Get the YQL keywords for this DMOZ document try: yqlKeywordsExtension = YQLKeywordExtension() yqlKeywords = yqlKeywordsExtension.getKeywordsFromContent(content) except Exception: yqlKeywords = [] # Store the extra data self.resultDictionary['keywords'] = keywords self.resultDictionary['headers'] = headers self.resultDictionary['description'] = description self.resultDictionary['yqlKeywords'] = yqlKeywords self.resultDictionary['pageRank'] = pageRank self.resultDictionary['title'] = title # Save the result file dump(self.resultDictionary, open(filename, 'w')) except UnicodeDecodeError: print "Failed to save DMOZ document: " + self.url except URLError: print("Error accessing '%s', %s" % (self.url.strip(), str(sys.exc_info()[1]).strip()))