#!/usr/bin/python
# -*- coding: utf-8 -*-

from utils.translation import ugettext as _
from utils.level import Level
from utils.code import Code
from seo.containers.ratings import TextAuditRating
from seo.containers.seo_document import _getMandatoryBlockTokens
from utils.median_distribution import getMedianDistributionInfo

DISPLAY = False

from utils.logger import LoggerFactory
app_logger = LoggerFactory.getInstance('app')


class MandatoryBlockLengthRating(object):

    RATING_NAME_TEMPLATE = u'%s-LENGTH-SCORE'

    def __init__(self, mandatoryField, seoLibrary, textSeoDocument):
        self.mandatoryField = mandatoryField
        self.seoLibrary = seoLibrary
        self.textSeoDocument = textSeoDocument
        self.RATING_NAME = (MandatoryBlockLengthRating.RATING_NAME_TEMPLATE
                            % self.mandatoryField.upper().replace(u'TOKENS', u''))

    def _getScore(self):
        lenList = []
        for seoDocument in self.seoLibrary.seoDocuments:
            tokens = _getMandatoryBlockTokens(seoDocument,
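# Illustrative usage (not from the original source): how RATING_NAME is built
# from the mandatory field name. The field name u'h1Tokens' is a hypothetical
# example value.
#
#   rating = MandatoryBlockLengthRating(u'h1Tokens', seoLibrary, textSeoDocument)
#   rating.RATING_NAME  # -> u'H1-LENGTH-SCORE'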
    def __init__(self):
        # BaseDAO.__init__(self)
        # super(BaseDAO, self).__init__()
        self.logger = LoggerFactory.create_daily_logger(
            __name__, PathMgr.get_log_path())
from config import settings
from utils.persistence.file_storage_factory import FileStorageFactory
from seo.containers.seo_document import DataDocument
from bs4.element import NavigableString
from utils.concurrence.urllib3_pool_factory import Urllib3PoolFactory
from urllib.parse import urlparse
from data_mining.web_pages.scrapers.readability import Readability
from utils.concurrence.request_factory import RequestFactory

try:
    import magic
except ImportError:
    magic = None  # python-magic is not available on Windows

from utils.logger import LoggerFactory
app_download_logger = LoggerFactory.getInstance('downloader')


class DownloadException(Exception):

    def __str__(self):
        return 'Download url error'


class UserAgent(object):
    chrome = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    firefox = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'
    safari = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A'
    seologies = 'Seologies/0.1 Bot'
    old = 'Mozilla/11.0'
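# Illustrative sketch (not from the original source): using one of the
# UserAgent strings on a request. Plain urllib3 is shown here; the project
# itself routes requests through RequestFactory / Urllib3PoolFactory, whose
# exact APIs are not visible in this fragment.
#
#   import urllib3
#
#   http = urllib3.PoolManager()
#   response = http.request('GET', 'https://example.com',
#                           headers={'User-Agent': UserAgent.chrome})
#   if response.status != 200:
#       raise DownloadException()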
filter (string): Controls turning on or off the duplicate content filter.
    "1": Turns on the duplicate content filter.
sort (string): The sort expression to apply to the results.
"""

import urllib
import json
import time

from config import settings
from utils.persistence.file_storage_factory import FileStorageFactory
from utils.concurrence.urllib3_pool_factory import Urllib3PoolFactory
from urllib3.util.retry import Retry
from utils.logger import LoggerFactory
from data_mining.search_engines.google.base_search import BaseSearchEngine

app_download_logger = LoggerFactory.getInstance('downloader')
app_error_logger = LoggerFactory.getInstance('app')


class GoogleSearchEngine(BaseSearchEngine):

    URL_TEMPLATE = 'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s'
    CACHE_PATH = '/googleSearchEngine'

    def __init__(self, query, language='es', country='ES', googleHost='google.es',
                 dateRestrict=None, max_results=settings.SEO_TERMS_DOWNLOAD_LIMIT,
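# Illustrative sketch (not from the original source): how a request URL is
# assembled from URL_TEMPLATE. The key/cx values are placeholders.
#
#   url = GoogleSearchEngine.URL_TEMPLATE % ('<API_KEY>', '<CX_ID>')
#   # -> 'https://www.googleapis.com/customsearch/v1?key=<API_KEY>&cx=<CX_ID>'
#   # Query parameters such as q, filter and sort would then be appended,
#   # e.g. via urllib.parse.urlencode({'q': query, 'filter': '1'}).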
    message = str(sys_error)
    status_code = 500
    error_info = traceback.format_exc()
    log_all_exceptions(logger)
    # This should be skipped depending on config
    if CFG_OBJ.get('DEBUG') == True:
        print(error_info)
    return make_response(jsonify(message=message), status_code)


# Register the handler for every default HTTP status code. Legacy Flask's
# error_handler_spec is keyed by the status code; the original also stored
# the full (code, exception) tuple as a key, which was redundant.
for error_code, _exception in default_exceptions.items():
    app.error_handler_spec[None][error_code] = system_error_handler


if __name__ == '__main__':
    # setup logger
    log_type = CFG_OBJ.get('LOGGING.TYPE')
    log_path = CFG_OBJ.get('LOGGING.PATH')
    logger = LoggerFactory.create(log_type, log_path)

    # log the app start message with a timestamp
    separator = '*' * 80
    logger.log("\n" + separator)
    logger.log("Application Starting :: {}".format(datetime.now()))
    logger.log("\n" + separator)

    # run the app
    app.run(**CFG_OBJ.FLASK_RUN_OPTS)
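# Note on the registration loop above (assumption, not from the original
# source): writing into app.error_handler_spec relies on a legacy Flask
# internal. On current Flask versions the supported equivalent is the
# public API:
#
#   for code in default_exceptions:
#       app.register_error_handler(code, system_error_handler)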
    def logger(self):
        return LoggerFactory.create_daily_logger(
            __name__, PathMgr.get_log_path('realtime'))
#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib3
import random

from config import settings
from utils.persistence.file_storage_factory import FileStorageFactory
from multiprocessing import RLock

from utils.logger import LoggerFactory
app_logger = LoggerFactory.getInstance('proxy')


class ProxyBase(object):
    '''
    Base proxy. Subclasses must implement:
        getProxies()
        _saveProxies()
    '''
    proxy_basic_auth = None

    COUNTER = 0
    SELECTED = 0
    LOCK = RLock()

    def __init__(self):
        self.proxies = self.getProxies()
        self.recoverInvalidatedProxies()

    def getNextProxy(self):
        with ProxyBase.LOCK:
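# Illustrative sketch (assumption, not from the original source): the
# COUNTER / SELECTED / LOCK class attributes suggest that getNextProxy()
# rotates through self.proxies round-robin under the lock, roughly:
#
#   ProxyBase.COUNTER += 1
#   ProxyBase.SELECTED = ProxyBase.COUNTER % len(self.proxies)
#   return self.proxies[ProxyBase.SELECTED]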
import tempfile
import zlib
import hashlib

try:
    from .utils.six import cPickle as pickle
except ImportError:
    import pickle

from utils.persistence.utils.base import DEFAULT_TIMEOUT
from utils.persistence.utils.filebased import FileBasedCache
from utils.persistence.utils.move import file_move_safe
from utils.persistence.utils.encoding import force_bytes

from utils.logger import LoggerFactory
app_cache_logger = LoggerFactory.getInstance('SeoAppCache')


def keyFunction(key, key_prefix, version):
    """
    Default function to generate keys.

    Constructs the key used by all other methods. By default it prepends
    the `key_prefix`. KEY_FUNCTION can be used to specify an alternate
    function with custom key-making behavior.
    """
    key = hashlib.md5(force_bytes(key)).hexdigest()
    return '%s:%s:%s' % (key_prefix, version, key)


class FileStorage(FileBasedCache):
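# Illustrative usage (not from the original source): raw keys are hashed and
# namespaced by prefix and version, so arbitrary key strings never reach the
# filesystem directly.
#
#   keyFunction('seo-document:https://example.com', 'cache', 1)
#   # -> 'cache:1:<md5 hex digest of the raw key>'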
#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import math

from bs4 import BeautifulSoup
from bs4 import Comment
from bs4.element import NavigableString

from utils.logger import LoggerFactory
from data_mining.web_pages import scrapping_rules as rules
from data_mining.web_pages.scrapers.base import ScraperBase

SIGMA_MULTIPLIER = 1
SEPARATOR = ' '

app_logger = LoggerFactory.getInstance('SeoAppScrapper')


def cleanHtml(rawHtml):
    # rawHtml = re.sub(rules.REPLACE_BR_REGEX, "</p><p>", rawHtml)
    rawHtml = re.sub(rules.REPLACE_DOT_REGEX, ". ", rawHtml)
    # rawHtml = re.sub(re.compile("<script.*>.*</script>"), "", rawHtml)
    return rawHtml


def removeEmptyTags(soup):
    # Collect first, then extract, so the tree is not mutated mid-iteration.
    empty_tags = [tag for tag in soup.find_all() if not tag.contents]
    for tag in empty_tags:
        tag.extract()
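# Illustrative usage (not from the original source):
#
#   soup = BeautifulSoup('<div><p></p><p>text</p></div>', 'html.parser')
#   removeEmptyTags(soup)
#   str(soup)  # -> '<div><p>text</p></div>'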