#!/usr/bin/python
# -*- coding: utf-8 -*-
from utils.translation import ugettext as _
from utils.level import Level
from utils.code import Code
from seo.containers.ratings import TextAuditRating
from seo.containers.seo_document import _getMandatoryBlockTokens
from utils.median_distribution import getMedianDistributionInfo

DISPLAY = False

from utils.logger import LoggerFactory
app_logger = LoggerFactory.getInstance('app')


class MandatoryBlockLengthRating(object):

    RATING_NAME_TEMPLATE = u'%s-LENGTH-SCORE'

    def __init__(self, mandatoryField, seoLibrary, textSeoDocument):
        self.mandatoryField = mandatoryField
        self.seoLibrary = seoLibrary
        self.textSeoDocument = textSeoDocument
        # e.g. 'fooTokens' -> 'FOO' -> u'FOO-LENGTH-SCORE'
        self.RATING_NAME = MandatoryBlockLengthRating.RATING_NAME_TEMPLATE % \
            self.mandatoryField.upper().replace(u'TOKENS', u'')

    def _getScore(self):
        # Collect the mandatory-block length of every document in the library.
        lenList = []
        for seoDocument in self.seoLibrary.seoDocuments:
            tokens = _getMandatoryBlockTokens(seoDocument, self.mandatoryField)
from config import settings
from utils.persistence.file_storage_factory import FileStorageFactory
from seo.containers.seo_document import DataDocument
from bs4.element import NavigableString
from utils.concurrence.urllib3_pool_factory import Urllib3PoolFactory
from urllib.parse import urlparse
from data_mining.web_pages.scrapers.readability import Readability
from utils.concurrence.request_factory import RequestFactory

try:
    import magic
except ImportError:
    magic = None  # python-magic is not available on Windows

from utils.logger import LoggerFactory
app_download_logger = LoggerFactory.getInstance('downloader')


class DownloadException(Exception):

    def __str__(self):
        return 'Download url error'


class UserAgent(object):
    chrome = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    firefox = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'
    safari = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A'
    seologies = 'Seologies/0.1 Bot'
    old = 'Mozilla/11.0'
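
# A minimal usage sketch for the UserAgent constants above, using plain
# urllib3 for illustration. The project's own RequestFactory and
# Urllib3PoolFactory APIs are not shown in this excerpt, so the call path
# below is an assumption, not the module's actual download routine.

if __name__ == '__main__':
    import urllib3
    http = urllib3.PoolManager()
    response = http.request('GET', 'https://example.com',
                            headers={'User-Agent': UserAgent.chrome})
    print(response.status, len(response.data))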
    filter       string   Controls turning on or off the duplicate content filter.
                          "1": Turns on duplicate content filter.
    sort         string   The sort expression to apply to the results.
"""
import urllib
import json
import time

from config import settings
from utils.persistence.file_storage_factory import FileStorageFactory
from utils.concurrence.urllib3_pool_factory import Urllib3PoolFactory
from urllib3.util.retry import Retry
from utils.logger import LoggerFactory
from data_mining.search_engines.google.base_search import BaseSearchEngine

app_download_logger = LoggerFactory.getInstance('downloader')
app_error_logger = LoggerFactory.getInstance('app')


class GoogleSearchEngine(BaseSearchEngine):

    URL_TEMPLATE = 'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s'
    CACHE_PATH = '/googleSearchEngine'

    def __init__(self, query, language='es', country='ES', googleHost='google.es',
                 dateRestrict=None, max_results=settings.SEO_TERMS_DOWNLOAD_LIMIT,
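
# A hedged sketch of the request URL this class presumably builds: the
# Custom Search endpoint above plus the `filter` and `sort` parameters
# documented in the module docstring. 'API_KEY' and 'CX_ID' are
# placeholders, and the exact query assembly is an assumption since the
# constructor body is not part of this excerpt.
#
#     from urllib.parse import quote
#     url = GoogleSearchEngine.URL_TEMPLATE % ('API_KEY', 'CX_ID')
#     url += '&q=%s&filter=1&sort=date' % quote('link building')
#     # https://www.googleapis.com/customsearch/v1?key=API_KEY&cx=CX_ID&q=link%20building&filter=1&sort=date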
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib3
import random

from config import settings
from utils.persistence.file_storage_factory import FileStorageFactory
from multiprocessing import RLock
from utils.logger import LoggerFactory

app_logger = LoggerFactory.getInstance('proxy')


class ProxyBase(object):
    '''
    Base proxy. Subclasses must implement:
        getProxies()
        _saveProxies()
    '''

    proxy_basic_auth = None
    COUNTER = 0
    SELECTED = 0
    LOCK = RLock()

    def __init__(self):
        self.proxies = self.getProxies()
        self.recoverInvalidatedProxies()

    def getNextProxy(self):
        with ProxyBase.LOCK:
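
# A minimal sketch of the subclass contract described in the docstring
# above. StaticProxyList and its hard-coded address are hypothetical
# illustrations, not classes from this codebase.
#
#     class StaticProxyList(ProxyBase):
#
#         def getProxies(self):
#             return ['http://127.0.0.1:8080']
#
#         def _saveProxies(self):
#             pass  # nothing to persist for a fixed list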
import tempfile
import zlib
import hashlib

try:
    from .utils.six import cPickle as pickle
except ImportError:
    import pickle

from utils.persistence.utils.base import DEFAULT_TIMEOUT
from utils.persistence.utils.filebased import FileBasedCache
from utils.persistence.utils.move import file_move_safe
from utils.persistence.utils.encoding import force_bytes
from utils.logger import LoggerFactory

app_cache_logger = LoggerFactory.getInstance('SeoAppCache')


def keyFunction(key, key_prefix, version):
    """
    Default function to generate keys.

    Constructs the key used by all other methods. By default it prepends
    the `key_prefix`. KEY_FUNCTION can be used to specify an alternate
    function with custom key-making behavior.
    """
    key = hashlib.md5(force_bytes(key)).hexdigest()
    return '%s:%s:%s' % (key_prefix, version, key)


class FileStorage(FileBasedCache):
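
# A worked example of the key scheme above: the raw key is md5-hashed and
# joined with the prefix and version. The inputs are illustrative values.
#
#     keyFunction('https://example.com/page', 'seo', 1)
#     # -> 'seo:1:' + hashlib.md5(b'https://example.com/page').hexdigest()
#     #    i.e. 'seo:1:<32 hex chars>'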
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import math

from bs4 import BeautifulSoup
from bs4 import Comment
from bs4.element import NavigableString

from utils.logger import LoggerFactory
from data_mining.web_pages import scrapping_rules as rules
from data_mining.web_pages.scrapers.base import ScraperBase

SIGMA_MULTIPLIER = 1
SEPARATOR = ' '

app_logger = LoggerFactory.getInstance('SeoAppScrapper')


def cleanHtml(rawHtml):
    # Normalize sentence boundaries so later text splitting works reliably.
    # rawHtml = re.sub(rules.REPLACE_BR_REGEX, "</p><p>", rawHtml)
    rawHtml = re.sub(rules.REPLACE_DOT_REGEX, ". ", rawHtml)
    # rawHtml = re.sub(re.compile("<script.*>.*</script>"), "", rawHtml)
    return rawHtml


def removeEmptyTags(soup):
    # Collect first, then extract, so the tree is not mutated mid-iteration.
    empty_tags = [tag for tag in soup.find_all() if not tag.contents]
    for tag in empty_tags:
        tag.extract()
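
# A self-contained demonstration of removeEmptyTags; the markup is a
# hypothetical example. cleanHtml is not exercised here because it depends
# on rules.REPLACE_DOT_REGEX, which is defined outside this excerpt.

if __name__ == '__main__':
    soup = BeautifulSoup('<p>text</p><p></p><span></span>', 'html.parser')
    removeEmptyTags(soup)
    print(soup)  # -> <p>text</p>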