def getTextTokens(self, removeSplitter=True, lemmatize=True):
    ''' Get the words from the text '''
    if not self.tokens:
        fileStorage = FileStorageFactory.getFileStorage(SeoDocument.CACHE_PATH)
        key = u'tokens_%s_%s_%s' % (self.link, self.language, self.country)
        self.tokens = fileStorage.get(key)
        if not self.tokens or not self.cache or settings.SCRAPER_RELOAD_CONTENT:
            self.tokens = self._getTextRawTokens()
            if self.cache:
                fileStorage.set(key, self.tokens)
    if lemmatize:
        tokens = self._getTextLemmaTokens(self.tokens, ngrade=1)
    else:
        tokens = self.tokens
    if removeSplitter:
        try:
            return [token for token in tokens if token not in SPLITTER]
        except Exception:
            # Dump the offending tokens for debugging, then propagate the
            # error instead of silently returning None.
            print(tokens)
            raise
    else:
        return tokens
def textReadabilityScore(self, text):
    md5Text = hashlib.md5(force_bytes(text)).hexdigest()
    fileStorage = FileStorageFactory.getFileStorage(ReadabilityText.CACHE_PATH)
    key = u'textReadabilityScore_%s_%s' % (self.language, md5Text)
    result = fileStorage.get(key)
    if not result or not settings.CACHE:
        result = self._textReadabilityScore(text)
        # Write the freshly computed score back so later calls hit the cache.
        fileStorage.set(key, result)
    return result
def getProxies(self):
    fileStorage = FileStorageFactory.getFileStorage(ProxyBuyProxies.CACHE_PATH)
    key = u'_proxy'
    self.proxies = fileStorage.get(key)
    if not self.proxies or not settings.CACHE:
        self.proxies = self._getProxies()
    random.shuffle(self.proxies)
    self._saveProxies()
    return self.proxies
def getTokens(self, contexLimit=5, display=DISPLAY):
    if not self.details:
        fileStorage = FileStorageFactory.getFileStorage(DetailedTerms.CACHE_PATH)
        key = u'detailedTerms_%s_%s_%s' % (self.query, self.language,
                                           self.country)
        self.details = fileStorage.get(key)
        if not self.details or not settings.CACHE:
            self.details = self._getTerms(contexLimit)
            fileStorage.set(key, self.details)
    return self.details
def getBadWords(self, text):
    '''
    Returns the result list, and also the uri for next page
    (returned_list, next_uri)
    '''
    md5Text = hashlib.md5(force_bytes(text)).hexdigest()
    fileStorage = FileStorageFactory.getFileStorage(
        LaguageToolChecker.LANGUAGETOOLCHECKER_CACHE_PATH)
    key = u'languageToolChecker__%s_%s' % (md5Text, self.language_country)
    result = fileStorage.get(key)
    if not result or not settings.CACHE:
        result = self._getBadWords(text)
        fileStorage.set(key, result)
    return result
def getDataDocument(self):
    fileCache = FileStorageFactory.getFileStorage(Scraper.CACHE_PATH)
    dataDocument = fileCache.get(self.url)
    if dataDocument and settings.SCRAPER_RELOAD_CONTENT:
        # Reload: reuse the cached raw HTML and rebuild the document.
        self.rawHtml = dataDocument.rawHtml
        dataDocument = self._getDataDocument()
        fileCache.set(self.url, dataDocument)
    if not dataDocument or not settings.CACHE:
        dataDocument = self._getDataDocument()
        fileCache.set(self.url, dataDocument)
    return dataDocument
def getSentences(self):
    if not self.sentences:
        fileStorage = FileStorageFactory.getFileStorage(SeoDocument.CACHE_PATH)
        key = u'sentences_%s_%s_%s' % (self.link, self.language, self.country)
        self.sentences = fileStorage.get(key)
        if not self.sentences or not self.cache:
            self.sentences = nltk_utils.sentenceTokenizer(
                self.dataDocument.text.replace(SPLITTER_TAG, '.'),
                self.language)
            if self.cache:
                fileStorage.set(key, self.sentences)
    return self.sentences
def getTokens(self, window=2, minCount=5, ntotal=30, display=False):
    if not self.terms:
        fileStorage = FileStorageFactory.getFileStorage(ScoredTerms.CACHE_PATH)
        key = u'scoredTerms_%s_%s_%s' % (self.seoLibrary.query,
                                         self.seoLibrary.language,
                                         self.seoLibrary.country)
        self.terms = fileStorage.get(key)
        if not self.terms or not settings.CACHE:
            self.terms = self._getTerms(window, minCount, ntotal)
            fileStorage.set(key, self.terms)
    if display:
        for word, metric in self.terms.items():
            app_logger.debug(u'%s --> %s' % (word, metric))
    return self.terms
def search(self):
    fileStorage = FileStorageFactory.getFileStorage(
        GoogleScraperRelatedSelenium.CACHE_PATH)
    key = '%s.%s.%s.%s' % (self.query, self.language, self.country,
                           self.max_results)
    related = fileStorage.get(key)
    if not related:
        related = []
        try:
            related.extend(self._search(0))
        except Exception as ex:
            app_logger.error(u"%s" % ex)
        if not related:
            raise Exception('Google Selenium Related Error')
        related = list(set(related))
        fileStorage.set(key, related)
    return related
def snapshot(self):
    browser = None
    result = {}
    initTime = time.time()
    try:
        # ------------------------------------------------------
        key = u'%s.%s.%s' % (self.url, self.width, self.height)
        imageUrl = self.imageFileStorage.get(key)
        if not imageUrl:
            screen_path = self.imageFileStorage._key_to_file(key)
            screen_path = screen_path.replace('djcache', 'jpg')
            self.imageFileStorage._createSubFolder()  # create cache subfolder
            ret, browser = self._snapshot(screen_path)
            if ret:
                imageUrl = screen_path.replace(settings.SCREENCAPTURE_PATH,
                                               settings.SCREENCAPTURE_DOMAIN)
                # store in cache
                self.imageFileStorage.set(key, imageUrl)
        # ------------------------------------------------------
        if self.processSelenium:
            fileStorage = FileStorageFactory.getFileStorage(
                ScreenCapture.SELENIUM_CACHE)
            key = u'seleniumCache_%s' % (self.url, )
            result = fileStorage.get(key)
            if not result or not settings.CACHE:
                result, browser = self._processSelenium(browser)
                fileStorage.set(key, result)
        elapsedTime = time.time() - initTime
        result.update({
            'imageUrl': imageUrl,
            'elapsedTime': elapsedTime,
        })
        return result
    finally:
        if browser:
            browser.close()
def getData(topics, initLevel=settings.TRAINER_INIT_LEVEL, language='es',
            country='ES'):
    fileStorage = FileStorageFactory.getFileStorage(CLASSIFIER_DATA_PATH)
    key = 'trainerData_%s_%s_%s_%s_%s_%s_%s' % (
        language,
        country,
        initLevel,
        settings.TRAINER_DOWNLOAD_PERCENTAGE,
        settings.TRAINER_DOWNLOADER_INTERVAL,
        settings.TRAINER_DOWNLOADER_PARTS,
        settings.TRAINER_TREE_TYPE,
    )
    result = fileStorage.get(key)
    if (not result or not settings.CACHE
            or not settings.TRAINER_DOWNLOAD_DOCUMENTS
            or settings.SCRAPER_RELOAD_CONTENT):
        print('NO CACHE --- Generating trainer data... %s' % key)
        result = _getData(topics, initLevel, language, country)
        if settings.TRAINER_DOWNLOAD_DOCUMENTS:
            fileStorage.set(key, result)
    return result
def search(self, jump=True):
    fileStorage = FileStorageFactory.getFileStorage(GoogleSeleniumPlus.CACHE_PATH)
    key = '%s.%s.%s.%s' % (self.query, self.language, self.country,
                           self.max_results)
    links = fileStorage.get(key)
    if not links:
        pages = int(math.ceil(self.max_results * 1.0 /
                              GoogleSeleniumPlus.PAGE_LIMIT))
        links = []
        try:
            for start in range(pages):
                links.extend(self._search(start * GoogleSeleniumPlus.PAGE_LIMIT))
        except Exception as ex:
            app_logger.error(u"%s" % ex)
        if not links and jump:
            from data_mining.search_engines.google.google_api_search import GoogleSearchEngine
            app_logger.error(u"Google Selenium Failed. Trying with SearchEngine")
            searchEngine = GoogleSearchEngine(self.query, self.language,
                                              self.country, self.googleHost,
                                              max_results=self.max_results)
            links = searchEngine.search(jump=False)
        if not links:
            raise Exception('Google Selenium Error')
        # Keep the first occurrence of each link and drop forbidden URLs.
        uniqueLinks = []
        forbidden_regex = re.compile(settings.FORBIDDEN_URLS)
        for link in links:
            if link not in uniqueLinks:
                if not forbidden_regex.search(link):
                    uniqueLinks.append(link)
        links = uniqueLinks[0:self.max_results]
        fileStorage.set(key, links)
    return links
def search(self):
    fileStorage = FileStorageFactory.getFileStorage(GoogleScraperRelated.CACHE_PATH)
    key = '%s.%s.%s' % (self.query, self.language, self.country)
    related = fileStorage.get(key)
    if not related:
        related = []
        try:
            related = self._search(retries=settings.GOOGLE_SCRAPER_RETRIES)
        except Exception as ex:
            app_logger.error(u"_googleRelated %s" % ex)
        if not related:
            raise Exception('Google Scraper Related Empty')
        fileStorage.set(key, related)
    return related
def search(self, jump=True, exactSearch=False):
    fileStorage = FileStorageFactory.getFileStorage(GoogleSearchEngine.CACHE_PATH)
    key = '%s.%s.%s.%s' % (self.query, self.language, self.country,
                           self.max_results)
    links = fileStorage.get(key)
    if not links or not settings.CACHE:
        app_error_logger.error(80 * '-')
        app_error_logger.error('Heads up! We are using the paid method $$$$$$')
        app_error_logger.error(80 * '-')
        try:
            self._search(self.dateRestrict, 1)
            links = [item.link for item in self.items]
        except Exception as ex:
            app_error_logger.error('%s' % ex)
        if not links and jump:
            app_error_logger.error(
                u"GoogleSearchEngine failed. Trying with Google Scraper")
            from data_mining.search_engines.google.google_scraper import GoogleScraper
            googleScrapper = GoogleScraper(query=self.query,
                                           language=self.language,
                                           country=self.country,
                                           googleHost=self.googleHost,
                                           max_results=self.max_results)
            links = googleScrapper.search(jump=False, exactSearch=exactSearch)
        if not links:
            raise Exception('Google Download Error')
        # Deduplicate while preserving ranking order.
        uniqueLinks = []
        for link in links:
            if link not in uniqueLinks:
                uniqueLinks.append(link)
        links = uniqueLinks
        fileStorage.set(key, links)
    return links
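# The two search() methods above deduplicate links with a linear membership
# test on a list, which is O(n^2) over the result set. A minimal sketch of an
# order-preserving alternative; the helper name `uniqueInOrder` is
# hypothetical, not part of this codebase:
def uniqueInOrder(links):
    # dict.fromkeys keeps insertion order (guaranteed since Python 3.7), so
    # only the first occurrence of each link survives.
    return list(dict.fromkeys(links))

# Usage sketch: links = uniqueInOrder(links)[0:self.max_results]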
def getTokens(self, window=3, minCount=5, lowerLimit=0.45,
              positiveQueries=None, numTotal=40, display=False):
    # Use None instead of a mutable default argument.
    if positiveQueries is None:
        positiveQueries = []
    if not self.terms:
        fileStorage = FileStorageFactory.getFileStorage(RelatedTerms.CACHE_PATH)
        key = u'relatedTerms_%s_%s_%s' % (self.seoLibrary.query,
                                          self.seoLibrary.language,
                                          self.seoLibrary.country)
        self.terms = fileStorage.get(key)
        if not self.terms or not settings.CACHE:
            self.terms = self._getTerms(window, minCount, lowerLimit,
                                        positiveQueries, numTotal)
            fileStorage.set(key, self.terms)
    if display:
        for word, metric in self.terms.items():
            app_logger.debug(u'%s --> %s' % (word, metric))
    return self.terms
def _saveProxies(self):
    fileStorage = FileStorageFactory.getFileStorage(ProxyBuyProxies.CACHE_PATH)
    key = u'_proxy'
    # Cache the proxy list for 24 hours.
    fileStorage.set(key, self.proxies, timeout=24 * 60 * 60)
def getSeoDocuments(self):
    '''
    seoDocuments is a dictionary so that failing links can be removed
    without having to wait for the pool timeout to fire.
    https://docs.python.org/2/library/multiprocessing.html
    http://stackoverflow.com/questions/3160909/how-do-i-deal-with-certificates-using-curl-while-trying-to-access-an-https-url
    '''
    fileStorage = FileStorageFactory.getFileStorage(
        SeoDocumentDownloader.CACHE_PATH)
    key = 'seoDocumentDownloader_%s_%s_%s_%s' % (
        self.query, self.language, self.country, self.downloadLimit)
    seoDocumentDict = fileStorage.get(key, default={})
    if not seoDocumentDict or not settings.CACHE or settings.SCRAPER_RELOAD_CONTENT:
        self.getLinks()
        downloadPool = WorkersPoolFactory.getPool()
        app_download_logger.info('Urls to download: ')
        app_download_logger.info(self.links)
        results = []
        regex = re.compile(settings.FORBIDDEN_URLS)
        for order, link in enumerate(self.links):
            if not regex.search(link):
                result = downloadPool.apply_async(
                    getSeoDocumentConcurrence,
                    args=(link, order, self.language, self.country,
                          self.sameOrigin, self.useProxy))
                results.append(result)
        seoDocumentDict = {}
        for result in results:
            try:
                seoDocument = result.get(
                    timeout=settings.SEO_TERMS_DOWNLOADER_TIMEOUT)
                if seoDocument:
                    # Keying by link guarantees there are no duplicate urls.
                    seoDocumentDict[seoDocument.link] = seoDocument
                else:
                    app_download_logger.error(u"No seoDocument or timeout")
            except Exception as ex:
                app_download_logger.error(u"%s" % ex)
        app_download_logger.info('Number of documents downloaded (AFTER): %s'
                                 % len(seoDocumentDict))
        app_download_logger.info('Max to download: %s' % len(self.links))
        fileStorage.set(key, seoDocumentDict)
    return sorted(seoDocumentDict.values(),
                  key=lambda x: x.order,
                  reverse=False)[0:self.downloadLimit]
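# Nearly every method above repeats the same cache-aside pattern:
# fileStorage.get(key), recompute on a miss (or when settings.CACHE is off),
# then fileStorage.set(key, value). A minimal sketch of a shared helper under
# those assumptions; `cachedOrCompute` is a hypothetical name, not an existing
# API of FileStorageFactory:
def cachedOrCompute(cachePath, key, computeFunc, useCache=True):
    fileStorage = FileStorageFactory.getFileStorage(cachePath)
    result = fileStorage.get(key)
    if not result or not useCache:
        # Cache miss, or caching disabled: recompute and write back.
        result = computeFunc()
        fileStorage.set(key, result)
    return result

# Usage sketch, mirroring textReadabilityScore() above:
# result = cachedOrCompute(ReadabilityText.CACHE_PATH, key,
#                          lambda: self._textReadabilityScore(text),
#                          useCache=settings.CACHE)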