class ShoppingExtractor:
    # Shopping: OSCommerce, Prestashop, Magento, Open Cart
    def __init__(self, proxy):
        self.fetcher = Fetcher(proxy=proxy)
        # Shopping engines identifiable via a specific cookie
        self.cookies = {"oscommerce": {"cookie": "osCsid"}}
        # Shopping engines identifiable via a specific pattern in HTML
        self.patterns = {
            "prestashop": {
                "tag": "meta",
                "attribute": "content",
                "expression": "prestashop"
            },
            "magento": {
                "tag": "link",
                "attribute": "href",
                "expression": "/skin/frontend/"
            },
            "opencart": {
                "tag": "a",
                "attribute": "href",
                "expression": "route=checkout/cart"
            }
        }

    def extract(self, dom, raw_txt, relevant_txt, url, firstpage, country="FR", lang="FR"):
        results = {"ecommerce": []}

        # Shopping engines identifiable via a specific cookie.
        # This needs a fetch, so we only do it for the first page of the crawl.
        if firstpage:
            for shop in self.cookies:
                fr = self.fetcher.fetch(url)
                if fr is not None and self.cookies[shop]["cookie"] in fr.cookies.keys():
                    results["ecommerce"].append({"type": shop, "url": url})
                    return results

        # Shopping engines identifiable via a specific pattern in HTML
        for shop in self.patterns:
            tags = dom(self.patterns[shop]["tag"] + "[" + self.patterns[shop]["attribute"] + "]")
            for tag in tags.items():
                if self.patterns[shop]["expression"] in (tag.attr[self.patterns[shop]["attribute"]] or "").lower():
                    results["ecommerce"].append({"type": shop, "url": url})
                    return results

        return results
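
# Hypothetical usage sketch (not part of the original source): it only shows how the
# extract() method above is meant to be driven. The HTML string and URL parameters are
# placeholders; only ShoppingExtractor.extract() and HTML5Wrapper.pq() used above are assumed.
def _example_shopping_extraction(html, url):
    extractor = ShoppingExtractor(proxy=None)
    dom = HTML5Wrapper().pq(html)
    # firstpage=True enables the cookie-based detection, which performs an extra fetch
    return extractor.extract(dom, html, html, url, firstpage=True)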
class YahooFetcher:
    """
    Fetches Yahoo results for a given query
    """

    def __init__(self, tld="fr", proxy=None):
        self.fetcher = Fetcher(proxy=proxy)
        # No tld-based differences in Yahoo, the tld will be ignored
        # http://fr.search.yahoo.com/search?p=LE+PAC+DECOUPE+PORTET+SUR+GARONNE&toggle=1&cop=mss&ei=UTF-8&fr=yfp-t-703
        self.base_url = "http://fr.search.yahoo.com/search?toggle=1&cop=mss&ei=UTF-8&fr=yfp-t-703&"
        # Logging initialization
        self.logger = logging.getLogger("webmining:yahoo_fetcher")
        self.logger.setLevel(logging.INFO)
        self.wrapper = HTML5Wrapper()

    def parse(self, webpage, bresults, limit):
        webpage = self.wrapper.pq(webpage)
        if webpage is not None:
            for r in webpage(".res").items():
                gr = YahooResult(r, self.wrapper)
                bresults.append(gr)
                if len(bresults) >= limit:
                    break
        return webpage

    def fetch(self, q, limit=10, start=0):
        """
        Fetches Yahoo with the query q and sends back a list of results.
        param: q: a query, as a string
        param: limit: the amount of results needed (1 to 10)
        param: start: the starting offset
        return: a list of YahooResult
        """
        bresults = []
        # NB: the parameter to augment the amount of results is 'count'
        query = urllib.parse.urlencode({"p": q})
        url = self.base_url + query
        fr = self.fetcher.fetch(url, debug=True)
        self.logger.debug("Fetched url [%s]" % url)

        if fr is None or fr.webpage is None:
            self.logger.warn("Got nothing from [%s]" % url)
            return bresults

        self.logger.debug("Returned result - " + str(fr.fetched_url))

        # Dump the raw result page locally for debugging
        f = open("index.html", "w")
        f.write(fr.webpage)
        f.close()

        fr.webpage = self.parse(fr.webpage, bresults, limit)
        self.logger.info("Fetched [%s] with %d results" % (url, len(bresults)))
        return bresults
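
# Hypothetical usage sketch (not part of the original source): the query string is a
# placeholder; only the YahooFetcher.fetch() signature defined above is assumed.
def _example_yahoo_search():
    fetcher = YahooFetcher(proxy=None)
    # Returns at most `limit` YahooResult objects parsed from the ".res" result blocks
    return fetcher.fetch("data publica", limit=10)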
class BingFetcher:
    """
    Fetches Bing results for a given query
    """

    def __init__(self, tld="fr", proxy=None):
        self.fetcher = Fetcher(proxy=proxy)
        self.cookie = {"_FS": "NU=1&mkt=fr-FR&ui=fr-FR"}
        # No tld-based differences in Bing, the tld will be ignored
        self.base_url = "http://www.bing.com/search?qs=n&form=QBLH&filt=all&sc=0-13&sp=-1&sk=&pq="
        # Logging initialization
        self.logger = logging.getLogger("webmining:bing_fetcher")
        self.logger.setLevel(logging.INFO)
        self.wrapper = HTML5Wrapper()

    def parse(self, webpage, bresults, limit):
        webpage = self.wrapper.pq(webpage)
        if webpage is not None:
            for r in webpage("li.sa_wr").items():
                gr = BingResult(r, self.wrapper)
                bresults.append(gr)
                if len(bresults) >= limit:
                    break
        return webpage

    def fetch(self, q, limit=10, start=0):
        """
        Fetches Bing with the query q and sends back a list of results.
        param: q: a query, as a string
        param: limit: the amount of results needed (1 to 10)
        param: start: the starting offset
        return: a list of BingResult
        """
        bresults = []
        # NB: the parameter to augment the amount of results is 'count'
        query = urllib.parse.urlencode({"q": q, "first": start})
        url = self.base_url + query
        fr = self.fetcher.fetch(url, debug=True, cookies=self.cookie)
        self.logger.debug("Fetched url [%s]" % url)

        if fr is None or fr.webpage is None:
            self.logger.warn("Got nothing from [%s]" % url)
            return bresults

        self.logger.debug("Returned result - " + str(fr.fetched_url))
        fr.webpage = self.parse(fr.webpage, bresults, limit)
        self.logger.info("Fetched [%s] with %d results" % (url, len(bresults)))
        return bresults
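
# Hypothetical usage sketch (not part of the original source): the query string is a
# placeholder; only the BingFetcher.fetch() signature defined above is assumed.
def _example_bing_search():
    fetcher = BingFetcher(tld="fr", proxy=None)
    # Results are BingResult objects built from the "li.sa_wr" blocks of the result page
    return fetcher.fetch("data publica", limit=10, start=0)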
class GoogleFetcher:
    """
    Fetches Google results for a given query
    """

    def __init__(self, tld="fr", proxy=None):
        self.fetcher = Fetcher(proxy=proxy)
        self.wrapper = HTML5Wrapper()
        self.base_url = "http://www.google.%s/search?rls=en&ie=UTF-8&oe=UTF-8&" % tld

    def parse(self, webpage, gresults, limit):
        webpage = self.wrapper.pq(webpage)
        if webpage is not None:
            for r in webpage(".g").items():
                gr = GoogleResult(r, self.wrapper)
                gresults.append(gr)
                if len(gresults) >= limit:
                    break
        return webpage

    def fetch(self, q, limit=10, start=0):
        """
        Fetches Google with the query q and sends back a list of results.
        param: q: a query, as a string
        param: limit: the amount of results needed (1 to 10)
        param: start: the starting offset
        return: a list of GoogleResult
        """
        gresults = []
        query = urllib.parse.urlencode({"q": q, "start": start})
        url = self.base_url + query
        fr = self.fetcher.fetch(url, debug=True)

        if fr is None or fr.webpage is None:
            return gresults
        if fr.fetched_url.startswith("http://www.google.fr/sorry/"):
            raise GoogleBlacklistingError()

        fr.webpage = self.parse(fr.webpage, gresults, limit)
        return gresults
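
# Hypothetical usage sketch (not part of the original source): the query string is a
# placeholder; only GoogleFetcher.fetch() and GoogleBlacklistingError used above are assumed.
def _example_google_search():
    fetcher = GoogleFetcher(tld="fr", proxy=None)
    try:
        return fetcher.fetch("data publica", limit=10, start=0)
    except GoogleBlacklistingError:
        # Google redirected us to its /sorry/ page: requests are being blocked
        return []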
class CMSExtractor:
    # CMS: Wordpress, Drupal, Typo3, IsoTools, Joomla, Spip, EzPublish
    def __init__(self, proxy):
        self.fetcher = Fetcher(proxy=proxy)
        # CMS identifiable via a specific URL
        self.paths = {
            "wordpress": {"path": "wp-login.php", "expression": "wordpress"},
            "drupal": {"path": "user", "expression": "user-login"},
            "isotools": {"path": "identification.aspx", "expression": "isotools"},
            "joomla": {"path": "administrator", "expression": "joomla"},
            "spip": {"path": "?page=login", "expression": "spip"}
        }
        # CMS identifiable via a specific pattern in HTML
        self.patterns = {
            "typo3": {"expression": "this website is powered by typo3"},
            "ezpublish": {"expression": "/content/advancedsearch"}
        }

    def extract(self, dom, raw_txt, relevant_txt, url, firstpage, country="FR", lang="FR"):
        results = {"cms": []}
        found = set()

        # CMS identifiable via a specific URL.
        # This needs a fetch, so we only do it for the first page of the crawl.
        if firstpage:
            for cms in self.paths:
                up = urlparse(url)
                domain = up.scheme + "://" + up.netloc
                link = urljoin(domain, self.paths[cms]["path"])
                fr = self.fetcher.fetch(link)
                if fr is not None and fr.webpage is not None and \
                   fr.content_type is not None and "text/html" in fr.content_type.lower() and \
                   self.paths[cms]["expression"] in fr.webpage.lower():
                    if cms not in found:
                        results["cms"].append({"type": cms, "url": link})
                        found.add(cms)
                    # return results

        # CMS identifiable via a specific pattern in HTML
        for cms in self.patterns:
            if self.patterns[cms]["expression"] in raw_txt.lower():
                if cms not in found:
                    results["cms"].append({"type": cms, "url": url})
                    found.add(cms)
                # return results

        # Detect typo3 via its meta generator tag as well
        if "typo3" not in found and len(dom("meta[name='generator'][content*='TYPO3']")) > 0:
            cms = "typo3"
            results["cms"].append({"type": cms, "url": url})
            found.add(cms)

        return results
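
# Hypothetical usage sketch (not part of the original source): the HTML string and URL
# are placeholders; only CMSExtractor.extract() and HTML5Wrapper.pq() used above are assumed.
def _example_cms_detection(html, url):
    extractor = CMSExtractor(proxy=None)
    dom = HTML5Wrapper().pq(html)
    # firstpage=True enables the URL-probing detection (wp-login.php, /administrator, ...)
    return extractor.extract(dom, html, html, url, firstpage=True)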
class LinkedinFetcher:
    def __init__(self, proxy=None):
        self.fetcher = Fetcher(proxy=proxy)
        self.normalizer = normalizer.Normalizer()
        self.wrapper = HTML5Wrapper()
        self._rdomain = re.compile("^[a-z]{2,3}\\.linkedin\\.com$")
        self._rpath1 = re.compile("^\\/pub\\/[^\\/]+(\\/[0-9abcdef]{1,3}){3}(\\/[a-zA-Z]+)?$")
        self._rpath2 = re.compile("^\\/in\\/[^\\/]+")
        self._rtitle = re.compile("^(.+) - ([^\\|]+) \\| LinkedIn$")

    def validate_url(self, domain, path):
        """
        Validates whether a URL is a LinkedIn profile or not.
        param: domain: The URL domain
        param: path: The URL path
        return: true/false
        """
        # Valid domain and profile path
        return self._rdomain.match(domain) is not None and \
            (self._rpath1.match(path) is not None or self._rpath2.match(path) is not None)

    def validate_contact(self, title, firstname, lastname):
        """
        Validates whether the profile page corresponds to the specified contact.
        param: title: The page title
        param: firstname: The contact first name
        param: lastname: The contact last name
        return: True if the page corresponds to the specified contact, False otherwise
        """
        # Extract name from title
        m = self._rtitle.search(title)
        # Matching title
        if m is not None:
            return self.normalize_name(m.group(1)) == self.normalize_name(firstname + lastname)
        # Invalid
        return False

    def normalize_name(self, name):
        """
        Normalizes a name for comparison.
        param: name: The name to normalize
        return: The normalized name for comparison (firstname + lastname, lowercase ASCII, without separators)
        """
        text = re.sub('[\-0-9\s]+', '', name)
        text = self.normalizer.normalize_text(text)
        return text

    def parse(self, fr):
        html = self.wrapper.pq(fr.webpage)
        lr = LinkedinResult(html, url=fr.fetched_url, wrapper=self.wrapper)
        return lr

    def extract_profile(self, url):
        """
        Fetches profile URL and cleans html.
        """
        fr = self.fetcher.fetch(url, debug=False)
        if fr is None or fr.webpage is None or fr.http_status >= 400:
            return None
        lr = self.parse(fr)
        return lr
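
# Hypothetical usage sketch (not part of the original source): the profile domain, path
# and URL are placeholders; only the LinkedinFetcher methods defined above are assumed.
def _example_linkedin_profile():
    fetcher = LinkedinFetcher(proxy=None)
    # validate_url() expects the domain and the path separately
    if fetcher.validate_url("fr.linkedin.com", "/in/someone"):
        return fetcher.extract_profile("https://fr.linkedin.com/in/someone")
    return None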
class FBAPI:
    """
    Queries the Facebook API to get various pieces of information from pages.
    """

    def __init__(self, token):
        self.fetcher = Fetcher()
        self.logger = logging.getLogger("fbapi")
        self.token = "%s|%s" % (token["app_id"], token["secret_id"])
        self.logger.setLevel(logging.INFO)

    def get_graph(self, fburl):
        """
        Gets the Graph API json from a facebook url
        param: fburl: fb page as an URL
        :returns: a string, or None if nothing found.
        """
        # Building graph URL from company page url
        # https://graph.facebook.com/datapublica
        account = get_facebook_account(fburl)
        if account is None:
            return None

        # See bug https://data-publica.atlassian.net/browse/RAD-265
        if b"\x85" in account.encode():
            return None

        url = FB_API_URL + account + "?access_token=" + self.token
        data = self.fetcher.fetch(url)
        jdata = json.loads(data.webpage)

        if "error" in jdata.keys():
            if jdata["error"]["code"] == 4:
                self.logger.warn("Rate limit exceeded")
                raise RateLimitExceeded()
            if jdata["error"]["code"] == 803:
                self.logger.warn("Couldn't find company FB page for URL %s, built into %s" % (fburl, url))
                return None
            elif jdata["error"]["code"] == 100:
                self.logger.warn("Couldn't access FB page for URL %s, badly built into %s" % (fburl, url))
                return None
            elif jdata["error"]["code"] == 104:
                self.logger.warn("Authentication required for FB page with URL %s, built into %s" % (fburl, url))
                return None
            elif jdata["error"]["code"] == 2500:
                self.logger.warn("Unknown path to FB page with URL %s, built into %s" % (fburl, url))
                return None
            elif jdata["error"]["code"] == 12:
                self.logger.warn("Call to deprecated FB endpoint with URL %s, built into %s" % (fburl, url))
                return None
            elif jdata["error"]["code"] == 21:
                m = re.search("to page ID (\d+).", jdata["error"]["message"])
                self.logger.info("FB page with URL %s, was migrated into %s" % (fburl, m.group(1)))
                return self.get_graph("https://graph.facebook.com/" + m.group(1))
            else:
                raise Exception("Unknown error %d : %s" % (jdata["error"]["code"], jdata["error"]["message"]))

        return jdata

    def get_company(self, fburl):
        """
        Gets a company overview from a company facebook page.
        param: fburl: fb page as an URL
        :returns: a string, or None if nothing found.
        """
        graph = self.get_graph(fburl)
        return self.get_company_from_data(graph)

    @staticmethod
    def get_company_from_data(fbdata):
        if fbdata is None:
            return None
        return FBCompany(fbdata)

    def get_picture(self, account):
        account = get_facebook_account(account)
        if account is None:
            return None
        url = "https://graph.facebook.com/%s/picture?redirect=false&type=large" % account
        data = self.fetcher.fetch(url)
        jdata = json.loads(data.webpage)
        if "error" in jdata:
            return None
        if "data" not in jdata:
            return None
        return jdata["data"]
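
# Hypothetical usage sketch (not part of the original source): the app credentials and
# the page URL are placeholders; only the FBAPI methods and RateLimitExceeded used above
# are assumed.
def _example_fb_company():
    api = FBAPI({"app_id": "APP_ID", "secret_id": "APP_SECRET"})  # placeholder credentials
    try:
        # Returns an FBCompany built from the Graph API data, or None if nothing was found
        return api.get_company("https://www.facebook.com/datapublica")
    except RateLimitExceeded:
        # Error code 4: too many Graph API calls for this token
        return None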
class CommunicationExtractor:
    def __init__(self, tld_file, proxy, wrapper, check_supported=True):
        # Logging initialization
        self.logger = logging.getLogger("webmining:communication_extractor")
        self.logger.setLevel(logging.INFO)
        self.check_supported = check_supported
        # Communication: SIREN, phone, contact form, emails, RSS, RSS/week, legal mention,
        # mobile site, responsive site
        self.metas = [
            "localId", "phone", "email", "contact", "contactform", "legal",
            "useterms", "rss", "mobile", "responsive", "capital", "description",
            "addresses"
        ]

        # Loading all the localized resources
        resources_dir = os.path.join(LIB_PATH, "resources/localization")
        if not os.path.exists(resources_dir):
            raise NotImplementedError("No resources")

        # Cache where country specific resources are cached
        self.localization_cache = {}
        # Cache containing the current domain's fetched rss links
        self.rss_cache = set()
        # Cache containing information for email filtering
        self.email_filtering_data = None
        with open(os.path.join(LIB_PATH, "resources", "email_filtering.json"), "r") as f:
            self.email_filtering_data = json.load(f)

        # Iterating over country specific resources
        for path in os.listdir(resources_dir):
            # We consider that all directories in the resources_dir represent a country
            if os.path.isdir(os.path.join(resources_dir, path)):
                country_name = path
                country_path = os.path.join(resources_dir, path)
                country = namedtuple("country", ["legals", "useterms", "identification", "generic_emails"])
                with open(os.path.join(country_path, "generic_emails.txt"), "r") as f:
                    country.generic_emails = set(map(str.strip, f.readlines()))
                with open(os.path.join(country_path, "legals.txt"), "r") as f:
                    country.legals = set(map(str.strip, f.readlines()))
                with open(os.path.join(country_path, "useterms.txt"), "r") as f:
                    country.useterms = set(map(str.strip, f.readlines()))
                with open(os.path.join(country_path, "identification.txt"), "r") as f:
                    country.identification = set(map(lambda x: re.compile(x.strip()), f.readlines()))
                self.localization_cache[country_name] = country

        self.contacter = ContactDetecter()
        self.extor = Extractor()
        self.ad = address_detecter.AddressDetecter(cache_results=True, check_supported=check_supported)
        self.tlds = set()
        self.tel = PhoneDetecter()
        self.fetcher = Fetcher(proxy=proxy)
        self.iosfetcher = Fetcher(
            proxy=proxy,
            user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3"
        )
        self.wrapper = wrapper
        # Used to tell when to empty the address detecter's results cache,
        # i.e. when we change website
        self.previous_domain = None
        # Allow countries to specify another country code for phone detection
        self.phone_country = {'UK': 'GB'}
        # TLDs in this file come from the IANA organisation
        with open(tld_file) as f:
            for tld in f:
                self.tlds.add(tld.strip())

    def get_country(self, country):
        if country not in self.localization_cache:
            if self.check_supported:
                raise NotImplementedError("No resource for country %s" % country)
            else:
                self.logger.warn("Unsupported country %s" % country)
                return None
        return self.localization_cache[country]

    def extract(self, dom, raw_txt, relevant_txt, url, firstpage, country="FR", lang="FR"):
        results = {}
        domain = urlparse(url).hostname
        if self.previous_domain is None or domain != self.previous_domain:
            self.previous_domain = domain
            self.ad.empty_cache()
        if firstpage:
            self.rss_cache = set()

        results["localId"] = self.extract_id(raw_txt, country=country)
        results["phone"], results["fax"] = self.extract_phone(raw_txt, country=country)
        results["email"] = self.extract_email(dom, raw_txt, domain, country=country)
        results["contact"] = self.extract_contacts(raw_txt)
        results["legal"] = self.extract_legal(raw_txt)
        results["useterms"] = self.extract_useterms(raw_txt)
        results["rss"] = self.extract_rss(dom, url)
        results["responsive"] = self.extract_responsive(dom)
        results["description"] = self.extract_description(dom)
        results["capital"] = self.extor.extract_capital(raw_txt)
        results["addresses"] = self.ad.detect_addresses(raw_txt, html=False, country=country)

        # This extraction does an extra fetch, we only do it for the first page
        if firstpage:
            results["mobile"] = self.extract_mobile(url)

        if self.extract_contactform(dom):
            results["contactform"] = url
        else:
            results["contactform"] = None

        return results

    def extract_mobile(self, url):
        """
        Detects a mobile version of the site by fetching it with an iOS user agent.
        Examples of mobile URLs a site may redirect to:
          http://www.cabinetnardi.com/mobile/
          http://le-choix-funeraire.mobi/
          http://iphone.revision-et-finance-cogefor.fr
          http://m.agencecomtesse.com
        """
        up = urlparse(url)
        domain = up.scheme + "://" + up.netloc
        fr = self.iosfetcher.fetch(domain)
        if fr is not None and fr.fetched_url != domain:
            if "mobile" in fr.fetched_url or \
               ".mobi" in fr.fetched_url or \
               "iphone" in fr.fetched_url or \
               "//m." in fr.fetched_url:
                return True
        return None

    def extract_responsive(self, dom):
        return len(dom("meta[name='viewport']")) > 0

    def extract_description(self, dom):
        """
        Extracts content from meta description in headers
        param: dom: the dom where to apply extraction
        """
        description = None
        desc = dom("meta[name='description']")
        # TODO: manage og
        # desc = dom("meta[name='og:description']")
        if desc.length > 0:
            description = ""
            for d in desc.items():
                if d is not None and d.attr is not None and \
                   d.attr.content is not None:
                    description += d.attr.content + ' '

        if description is None or not self._validate_description(description):
            return None

        # Remove HTML tags if present, but keep newline tags as newlines
        for tag in HTML5Wrapper.newline_tags:
            regex = "</?%s.*?>" % tag
            description = re.sub(regex, "\n", description, flags=re.I | re.M)
        # Remove remaining tags
        description = re.sub("<.+?>", " ", description, flags=re.M)
        # Remove supernumerary newlines and spaces
        description = re.sub(r"\n{2,}", "\n", description)
        description = re.sub(" {2,}", " ", description)

        return description.strip()

    def _validate_description(self, desc):
        """
        Determines if an extracted description seems to be a quality one.
        """
        badstart = ("site", "bienvenue", "joomla", "wordpress")
        badend = ("...")
        normed = desc.lower().strip()
        if normed.startswith(badstart):
            return False
        if normed.endswith(badend):
            return False
        wf = WFHistogram(normed)
        if len(wf.freq) < 5:
            return False
        return True

    def _find_rss(self, dom, url):
        domain = urlparse(url).netloc
        rsslink = None

        # First looking into head links
        # supports "rss+xml"
        for link in dom("head link[type*='application/rss'][href]").items():
            rsslink = urljoin(url, link.attr.href)
            break

        if rsslink is None:
            for node in dom("a[href]").items():
                href = node.attr.href
                # If this link could be a rss one
                if "rss" in href.lower():
                    rsslink = ""
                    if href.startswith("http"):
                        if domain in url:
                            rsslink = href
                        else:
                            continue
                    # Build absolute link from relative link
                    else:
                        rsslink = urljoin(url, href)
                    break

        # replace feed:// with http://
        if rsslink is not None and rsslink.startswith("feed:"):
            rsslink = rsslink[5:]
            if rsslink.startswith("//"):
                rsslink = "http:" + rsslink  # supports feed:https:// as well!

        # If the rss feed is unknown, we return it
        if rsslink not in self.rss_cache:
            self.rss_cache.add(rsslink)
            return rsslink
        else:
            return None

    def extract_rss(self, dom, url):
        rsslink = self._find_rss(dom, url)

        # no rss found
        if rsslink is None:
            return (None, None)

        # Once a potential RSS link has been found, let's check it out
        return self._compute_rss_stats(rsslink, self.fetcher.fetch(rsslink, debug=True))

    def _compute_rss_stats(self, rsslink, fr):
        if fr is None or not ("application/xml" in fr.headers["content-type"] or
                              "text/xml" in fr.headers["content-type"] or
                              "application/rss+xml" in fr.headers["content-type"]):
            return (None, None)

        try:
            rss = self.wrapper.pq(fr.webpage)
        except (lxml.etree.XMLSyntaxError, lxml.etree.ParserError):
            return (rsslink, 0)

        # Now let's get the most recent and oldest item dates in the stream
        first = last = None
        count = 0
        for entry in rss("item").items():
            count += 1
            date = feedparser._parse_date(entry("pubDate").text())
            if date is not None:
                publication = time.mktime(date)
                if first is None or first < publication:
                    first = publication
                if last is None or last > publication:
                    last = publication

        # Compute ratio items per week
        if first is not None and last is not None:
            timedelta = first - last
            if timedelta > 0:
                weekratio = count / (timedelta / (7 * 24 * 60 * 60))
                return (rsslink, weekratio)

        return (rsslink, 0)

    def extract_legal(self, raw_txt, country="FR"):
        country = self.get_country(country)
        if country is None:
            return None
        low = raw_txt.lower()
        for i in country.legals:
            if i in low:
                return True
        return None

    def extract_useterms(self, raw_txt, country="FR"):
        country = self.get_country(country)
        if country is None:
            return None
        low = raw_txt.lower()
        for i in country.useterms:
            if i in low:
                return True
        return None

    def extract_contactform(self, dom):
        """
        Searches for a contact form in the page. Uses a linear classifier.
        """
        c = ContactFormExtractor(dom)
        if c.predict():
            return True
        else:
            return None

    def extract_id(self, txt, country="FR"):
        """
        Tries to extract an ID (siren, siret, TVA, KBO, etc.) from page text
        """
        re_country = self.get_country(country)
        if re_country is None:
            return None
        lower_txt = txt.lower()
        for regex in re_country.identification:
            m = re.search(regex, lower_txt)
            if m is not None:
                ide = re.sub('[^\d]', '', m.group(1))
                # Checking extraction quality
                if country == "BE":
                    if len(ide) < 10:
                        ide = "0" + ide
                    if len(ide) != 10:
                        return None
                elif country == "FR":
                    if len(ide) != 9:
                        return None
                elif country == 'UK':
                    if len(ide) == 7:
                        ide = "0" + ide
                return ide
        return None

    def extract_contacts(self, raw_txt):
        return self.contacter.detect(raw_txt)

    def extract_phone(self, raw_txt, country="FR"):
        """
        Returns a tuple containing:
          - a list of detected phones
          - a list of detected faxes
        """
        phone_country_ = country
        if country in self.phone_country:
            phone_country_ = self.phone_country[country]
        results = self.tel.detect(raw_txt, country=phone_country_)
        phones = [r[1] for r in results if r[0] == "phone"]
        faxes = [r[1] for r in results if r[0] == "fax"]
        return (phones, faxes)

    def _validate_email(self, email, domain, country="FR"):
        """
        Checks that the email is valid and usable.
        Sorts emails between generic ones and direct contacts.
        param: email: a str believed to be an email
        param: domain: the domain of the analyzed website; used to determine
               if an email address is really related to the website
        return: a tuple (email, is_contact) where is_contact in [True, False]
                False is for generic contact emails such as [email protected]
        """
        if self.check_supported:
            country = self.get_country(country)
        email = email.strip().lower()
        # We accept at maximum 3 sub-domains of mail
        m = re.search("([\w\.\-]+@[\w\-]+(\.[\w\-]+){1,3})", email)
        if m is not None:
            # email is validated, but let's check it's not a generic email
            email = m.group(1)
            prefix, suffix = email.split('@')

            # Bad suffix (domain.com, example.com...)
            if suffix in self.email_filtering_data["domains_blacklist"]:
                return None

            # Bad tld in extracted email
            if suffix.split(".")[-1] not in self.tlds:
                self.logger.info(">>> TLD refused : %s" % email)
                return None

            # Email prefix in blacklist (CNIL...)
            if prefix in self.email_filtering_data["prefixes_blacklist"]:
                self.logger.info(">>> Blacklisted email prefix found: %s" % email)
                return None

            # Fuzzy match between the suffix and the domain
            fuzzy_match = fuzz.token_sort_ratio(suffix, domain)
            # This value should be tested against a real database of examples
            fuzzy_threshold = 70
            if fuzzy_match < fuzzy_threshold:
                # Test email providers domains: if we find an email @wanadoo.fr,
                # we can't be sure it's not a real one
                if not any([fuzz.token_sort_ratio(suffix, d) >= fuzzy_threshold
                            for d in self.email_filtering_data["email_providers"]]):
                    return None

            self.logger.info("> found [" + email + "]")
            for pattern in country.generic_emails:
                if re.match(pattern, prefix) is not None:
                    return (email, False)
            return (email, True)
        else:
            self.logger.warning("WARNING>> unvalidated email : " + email)
            return None

    def extract_email(self, dom, raw_txt, domain, country="FR"):
        """
        Tries to extract an email address from a mailto structure.
        If nothing is found, tries a detection from raw text.
        """
        for node in dom("a[href^='mailto:']").items():
            # <a href="mailto:[email protected]">Clique ici pour m'envoyer un e-mail</a>
            mail = node.attr.href[7:]
            clear = mail.lower().split('?')
            if len(clear) > 0:
                return self._validate_email(clear[0], domain, country)
            else:
                continue

        # If no mailto found, let's try to extract an email from raw text
        # Not a findall for performance reasons
        m = re.search("[\s:]([\w\.\-]+@[\w\.\-]+)[\s\"<]", raw_txt + " ")
        if m is not None:
            return self._validate_email(m.group(1), domain, country)

        return None
class ViadeoFetcher:
    LANG = "(com|de|en|es|fr|gb|it)"

    def __init__(self, proxy=None):
        self.fetcher = Fetcher(proxy=proxy)
        self.wrapper = HTML5Wrapper()
        self.normalizer = normalizer.Normalizer()
        self.valid_domains = re.compile("^.*.viadeo." + self.LANG + "$")
        self._rpath1 = re.compile("^\\/" + self.LANG + "\\/profile\\/([^\\/]+).*$")
        self._rpath2 = re.compile("^\\/r\\/profile\\/([^\\/]+)\\/" + self.LANG + "\\/public(\\/.*)?$")
        self._rtitle = re.compile("^([^,]+).*$")

    def validate_url(self, domain, path):
        """
        Validates whether a URL is a Viadeo profile or not.
        param: domain: The URL domain
        param: path: The URL path
        return: true/false
        """
        # Valid domain and profile path
        return self.valid_domains.match(domain) is not None and \
            (self._rpath1.match(path) is not None or self._rpath2.match(path) is not None)

    def validate_contact(self, title, firstname, lastname):
        """
        Validates whether the profile page corresponds to the specified contact.
        param: title: The page title
        param: firstname: The contact first name
        param: lastname: The contact last name
        return: True if the page corresponds to the specified contact, False otherwise
        """
        # Extract name from title
        title = title.replace("<b>", "").replace("</b>", "")
        m = self._rtitle.search(title)
        # Matching title
        if m is not None:
            return self.normalize_name(m.group(1)) == self.normalize_name(firstname + lastname)
        # Invalid
        return False

    def normalize_name(self, name):
        """
        Normalizes a name for comparison.
        param: name: The name to normalize
        return: The normalized name for comparison (firstname + lastname, lowercase ASCII, without separators)
        """
        text = re.sub('[\-0-9\s]+', '', name)
        text = self.normalizer.normalize_text(text)
        return text

    def parse(self, fr):
        html = self.wrapper.pq(fr.webpage)
        lr = ViadeoResult(html, url=fr.fetched_url, wrapper=self.wrapper)
        return lr

    def extract_profile(self, url):
        """
        Fetches profile URL and cleans html.
        """
        # Extract profile
        fr = self.fetcher.fetch(url, debug=False)
        if fr is None or fr.webpage is None:
            return None
        return self.parse(fr)
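
# Hypothetical usage sketch (not part of the original source): the profile domain, path
# and URL are placeholders; only the ViadeoFetcher methods defined above are assumed.
def _example_viadeo_profile():
    fetcher = ViadeoFetcher(proxy=None)
    # validate_url() expects the domain and the path separately
    if fetcher.validate_url("www.viadeo.com", "/fr/profile/someone"):
        return fetcher.extract_profile("http://www.viadeo.com/fr/profile/someone")
    return None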
class BingAPIFetcher:
    """
    Fetches Bing results for a given query, through the Azure Datamarket API
    """

    def __init__(self, key, proxy=None):
        self.base_url = "https://api.datamarket.azure.com/Bing/SearchWeb/v1/Web?$format=json&"
        # Building authentication from key
        s = '%s:%s' % (key, key)
        credentials = base64.b64encode(s.encode('utf-8'))
        self.auth = 'Basic %s' % credentials.decode('utf-8')
        # Markets for localized and more accurate search
        self.markets = {
            "FR": "fr-FR",
            "BE": "fr-BE",
            "GB": "en-GB",
            "US": "en-US",
            "DE": "de-DE",
            "UK": "en-GB"
        }
        # Fetcher initialization
        self.fetcher = Fetcher(proxy=proxy)
        self.fetcher.headers["Authorization"] = self.auth
        # Logging initialization
        self.logger = logging.getLogger("webmining:bingapi_fetcher")
        self.logger.setLevel(logging.INFO)
        self.wrapper = HTML5Wrapper()

    def parse(self, webpage, bresults):
        # We check that our API account is not empty. This is tricky, as in this case
        # the Bing API answers in plain text and no longer in json.
        if webpage.strip() == "Insufficient balance for the subscribed offer in user's account":
            raise EmptyBingAPIAccount("Insufficient balance for the subscribed offer in user's account")

        json_result = json.loads(webpage)
        result_list = json_result['d']['results']
        if webpage is not None:
            for r in result_list:
                br = BingAPIResult(r)
                bresults.append(br)
        return webpage

    def fetch(self, q, start=0, country="FR"):
        """
        Fetches Bing with the query q and sends back a list of results.
        param: q: a query, as a string
        param: start: the starting offset (first 50 results are start=0, next 50 start=1, ...)
        param: country: country of the searched company
        return: a list of BingAPIResult
        """
        bresults = []
        # Simple quotes confuse the Bing query parser, so we strip them
        q = q.replace("'", "")
        query = "'%s'" % q
        query = urllib.parse.urlencode({
            "Query": query,
            "$top": "50",
            '$skip': "%i" % (start * 50),
            'Market': "'%s'" % self.markets[country],
            'Options': "'DisableLocationDetection'",
        })
        url = self.base_url + query
        fr = self.fetcher.fetch(url, debug=False, force_encoding="utf-8")
        self.logger.debug("Fetched url [%s] start=%s" % (url, start))

        if fr is None or fr.webpage is None:
            self.logger.warn("Got nothing from [%s]" % url)
            return bresults

        self.logger.debug("Returned result - " + str(fr.fetched_url))
        self.parse(fr.webpage, bresults)
        self.logger.info("Fetched [%s] with %d results" % (url, len(bresults)))
        return bresults
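
# Hypothetical usage sketch (not part of the original source): the API key and the query
# are placeholders; only BingAPIFetcher.fetch() and EmptyBingAPIAccount used above are assumed.
def _example_bing_api_search():
    fetcher = BingAPIFetcher(key="AZURE_MARKETPLACE_KEY")  # placeholder key
    try:
        # start=1 would return results 51-100 for the same query
        return fetcher.fetch("data publica", start=0, country="FR")
    except EmptyBingAPIAccount:
        # The subscribed offer has no balance left; the API answers in plain text
        return []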
def __init__(self, tld="fr", proxy=None): self.fetcher = Fetcher(proxy=proxy) self.wrapper = HTML5Wrapper() self.base_url = "http://www.google.%s/search?rls=en&ie=UTF-8&oe=UTF-8&" % tld
class Crawler:
    """
    A generic crawler.
    """

    def __init__(self, filename=None, seedlist=None, debug=False, proxy=None, multiproc=True,
                 mode=CrawlMode.entire, max_page_size=PAGE_SIZE_LIMIT):
        """
        :param filename: path to the seed file
        :param mode: crawling mode, either "entire", "single", "subpath"
        """
        self.seed = None
        self.debug = debug
        # init the fetcher with a download limit size
        self.fetcher = Fetcher(proxy, max_page_size=max_page_size)
        self.htmltools = HTML5Wrapper()
        self.crawl_depth = 0  # Do we crawl domains outside the seed
        self.domain_depth = 0  # At which depth each seed element must be crawled
        self.page_limit = 0  # Max amount of pages to be crawled
        self.max_page_size = max_page_size
        self.website = Website()
        self.me = MetaExtractor(proxy=proxy)
        self.badextensions = set(["pdf", "xls", "doc", "ppt", "rtf", "odt", "zip", "tar.gz", "tar", "exe",
                                  "jpg", "png", "jpeg", "bmp", "gif", "mp3", "flv", "rar", "ogv", "avi", "mp4",
                                  "mkg", "ps", "ogg", "webm", "ogm", "pps", "pptx", "docx", "xlsx", "mpg", "mov",
                                  "mkv", "mpeg", "m4v", "iso"])
        self.crawling_process_over = False

        # Logging initialization
        self.logger = logging.getLogger("webmining:crawler")
        self.logger.setLevel(logging.INFO)
        if debug:
            self.logger.setLevel(logging.DEBUG)

        self.filename = filename
        self.seedlist = seedlist
        self.mode = mode
        self.authorized_domains = set()

    def _monitore_processes(self, processes):
        """
        Checks if subcrawling processes are over.
        This method is meant to be used wrapped into a Thread.
        """
        for p in processes:
            p["event"].wait()
        self.crawling_process_over = True

    def spawn_crawl_processes(self, html2txt, metas, proc, wait_courtesy):
        processes = []
        for i in range(0, proc):
            e = Event()
            p = Process(None, self._sub_crawl, None, (),
                        {"queue": self.seed.q, "storage": self.storage, "end_event": e,
                         "wait": wait_courtesy, "html2txt": html2txt, "metas": metas})
            p.start()
            processes.append({"proc": p, "event": e, "id": i})

        monitor = Thread(group=None, target=self._monitore_processes, name=None,
                         args=(), kwargs={"processes": processes})
        monitor.start()

        while not self.crawling_process_over:
            # If all processes are over, or if getting an element from the queue takes
            # more than timeout seconds (which seems empirically abnormal),
            # then the crawl is finished.
            c = 0
            for p in processes:
                if not p["proc"].is_alive():
                    c += 1
            if c >= len(processes):
                self.logger.warning("All processes are dead !")
                break

            try:
                el = self.storage.get(block=True, timeout=5)
                yield el
            except Empty:
                if self.storage.empty():
                    pass

        self.logger.debug("joining processes...")
        for p in processes:
            if p["proc"].is_alive():
                p["proc"].terminate()
            p["proc"].join()

        # Finally, joining monitoring thread
        monitor.join(3)
        if monitor.is_alive():
            monitor._stop()

    def crawl(self, proc=None, domain_depth=0, crawl_depth=0, page_limit=None,
              wait_courtesy=0, html2txt=False, metas=None):
        """
        :param proc: amount of processes to spawn, 0 or None can be used to exploit the current process
        :param domain_depth: crawling depth for each seed element (inside original domain)
        :param crawl_depth: crawling depth for each seed element (outside original domain)
        :param page_limit: max amount of pages to crawl
        :param wait_courtesy: time in seconds between each fetch
        :param html2txt: resulting pages must be raw html (default), or cleaned txt
        :param metas: metas we want to extract during crawling
        """
        self.domain_depth = domain_depth
        self.crawl_depth = crawl_depth
        self.page_limit = page_limit

        # lazy loading, to know if we need to implement seeds with multiproc or not
        if self.seed is None:
            if self.filename is not None:
                self.seed = Seed(f=self.filename, multiproc=not (proc is None or proc == 0))
            elif self.seedlist is not None:
                self.seed = Seed(s=self.seedlist, multiproc=not (proc is None or proc == 0))

        if proc is None or proc == 0:
            self.storage = Queue()  # Will contain shared crawl results
            self._sub_crawl(self.seed.q, self.storage, Event(), wait_courtesy, html2txt, metas, None)
            while True:
                try:
                    el = self.storage.get(block=False)
                    yield el
                except Empty:
                    break
        else:
            self.storage = MPQueue()  # Will contain shared crawl results
            yield from self.spawn_crawl_processes(html2txt, metas, proc, wait_courtesy)

    def _sub_crawl(self, queue, storage, end_event, wait, html2txt, metas, block_timeout=5):
        """
        This private method will be wrapped into a process, and is in charge of dequeuing
        seed elements, and recording results into the storage.
        """
        while True:
            se = None
            pages = []

            try:
                se = queue.get(block=block_timeout is not None, timeout=block_timeout)
            except Empty:
                end_event.set()
                return

            self.logger.info("Launched crawl [%s]" % se.url)
            start_url = se.url  # Need to keep it as it may change due to redirect
            pages = self.crawl_domain(se, self.domain_depth, wait, html2txt, self.page_limit, self.mode)
            self.logger.info("Crawl over with %d pages [%s]" %
                             (len(pages), (se.url if start_url in se.url else '%s -> %s' % (start_url, se.url))))

            first = True
            for url in pages:
                se = pages[url]
                ext_metas = {}

                # Extract asked metas from page
                if metas is not None:
                    try:
                        ext_metas = self.me.extract(metas, se.html, se.relevant_txt,
                                                    url=url, firstpage=first)
                        first = False
                    except MetaExtractionException as e:
                        self.logger.warning("Impossible to extract metas in [%s]: " % url)
                        self.logger.warning(e)
                        continue

                for m in ext_metas:
                    if ext_metas[m] is not None:
                        if m not in se.metas.keys():
                            if m in ["contact", "phone", "fax"]:
                                se.metas[m] = []
                            else:
                                se.metas[m] = set()
                        if m in ["contact", "phone", "fax"]:
                            se.metas[m].extend(ext_metas[m])
                        else:
                            se.metas[m].add(ext_metas[m])

                storage.put(se)

            # Let's save memory
            del pages

            if self.crawl_depth > 0:
                # TODO: create new seed elements to put in queue when crawl deeper than 0
                # with an updated depth, domain, etc...
                raise Exception("Not implemented")

    def _check_first_page(self, dom, url):
        """
        Checks if the domain first page is
          - a html redirection
          - a frameset
        Returns an url to follow, or None if nothing detected.
        """
        # we check out if it contains a <meta http-equiv="refresh"
        # ex. <meta http-equiv="Refresh" content="0; URL=corporate-finance/corporate-finance-presentation.html">
        metas = dom("meta[http-equiv='refresh'][content], meta[http-equiv='Refresh'][content], meta[http-equiv='REFRESH'][content]")
        base_url = self._get_base_url(dom, url)

        for m in metas.items():
            content = m.attr.content
            m = re.search("url\s?=\s?(.*?)\s", content + ' ', flags=re.I)
            if m is not None:
                rurl = m.group(1).strip()
                rurl = urllib.parse.urljoin(base_url, rurl)
                self.logger.info("HTTP redirection to [%s]" % rurl)
                return rurl

        # We check out if it contains a <frame src="..."
        # and only return first found url if true
        # TODO: is it relevant to return only the first frame?
        frames = dom("frame[src]")
        for f in frames.items():
            rurl = urllib.parse.urljoin(base_url, f.attr.src)
            self.logger.info("FRAME redirection to [%s]" % rurl)
            return rurl

        # We check out if it contains a JS redirection document.location.href=
        # and only return first found url if true
        scripts = dom("script")
        for s in scripts.items():
            js = s.text()
            if js is not None:
                m = re.search("document.location.href\s?=\s?[\"']([^\"]*?)[\"']\s*[^+]", js + " ", flags=re.I)
                if m is not None:
                    rurl = urllib.parse.urljoin(base_url, m.group(1).strip())
                    self.logger.info("JavaScript redirection to [%s]" % rurl)
                    return rurl

        return None

    def _verify_and_parse_result(self, fresult, seed_el):
        """
        Verifies if a fetch result is valid for parsing.
        If so, builds the pq element that corresponds to the webpage.
        :param fresult: FetchResult object
        :param seed_el: SeedElement object
        :return: The corresponding pq element
        """
        if fresult is None:
            return None

        html = fresult.webpage
        content_type = fresult.content_type
        # in case of 300/302 we use final url given by fetcher
        seed_el.url = fresult.fetched_url

        if fresult.http_status is None or fresult.http_status != 200:
            self.logger.warning("Bad HTTP Status (%s) for [%s]" % (str(fresult.http_status), seed_el.url))
            return None

        if html is None:
            self.logger.warning("Impossible to crawl [%s]" % seed_el.url)
            # Missed page not ignored, as this kind of websites can be dangerous
            return None

        # We only want to compute text/html webpages
        if content_type is not None and "text/html" not in content_type.lower():
            self.logger.info("Content Type ignored : " + str(content_type) + " [" + seed_el.url + "]")
            return None

        # Too large file
        self.logger.debug("Page size of %d characters" % len(html))
        if len(html) > self.max_page_size:
            self.logger.warning("Page ignored, too big (%d characters) in %s" % (len(html), seed_el.url))
            return None

        # Is an attachment, so we must ignore it
        if fresult.attachment is not None:
            self.logger.warning("Page ignored, because it corresponds to the attachment %s [%s]" %
                                (fresult.attachment, seed_el.url))
            return None

        if len(html) == 0:
            self.logger.warning("Page ignored because it is empty [%s]" % seed_el.url)
            return None

        try:
            dom = self.htmltools.pq(html)
        except Exception as e:
            self.logger.warning("Impossible to parse html url=%s : %s" % (fresult.fetched_url, str(e)))
            return None

        # DEACTIVATED FEATURE
        # Test to see if the root node is a html node
        # if dom[0].tag.lower() != 'html':
        #     self.logger.warning("Page is not a valid html [%s]" % seed_el.url)
        #     return None

        return dom

    @staticmethod
    def _generate_authorized_domains(domain):
        domain = domain.lower()  # Force lower case
        auth = set([domain])
        if "www." in domain:
            auth.add(domain.replace("www.", ""))
        else:
            auth.add("www." + domain)

        comdom = {dom.rsplit(".", maxsplit=1)[0] + ".com" for dom in auth if ".com" not in dom}
        auth.update(comdom)
        return auth

    def _is_authorized_subpath(self, init_url, target_url):
        # Force lower case
        init_url = init_url.lower() if init_url is not None else init_url
        target_url = target_url.lower() if target_url is not None else target_url

        init_path = urllib.parse.urlparse(init_url).path
        target_url_parsed = urllib.parse.urlparse(target_url)
        target_domain, target_path = target_url_parsed.netloc, target_url_parsed.path

        if target_domain in self.authorized_domains and target_path.startswith(init_path):
            return True
        return False

    def crawl_domain(self, init_seed_el, max_dom_depth, wait, html2txt, limit=None, mode=CrawlMode.entire):
        """
        Fetches a domain, and then crawls its internal pages until given depth.
        Returns a dictionary of url -> html code.
        """
        pages = {}
        visited = set()  # Already visited URLs
        found_links = [init_seed_el]  # List of found links as SeedElements, waiting to be fetched

        # overrides the limit to crawl only one page
        if mode == CrawlMode.single:
            limit = 1
            max_dom_depth = 1

        self.logger.info("Launching crawl in the %s mode" % mode.value)

        # -- Managing authorized domains for this crawl --
        domain = urllib.parse.urlparse(init_seed_el.url).netloc
        self.authorized_domains = self._generate_authorized_domains(domain)
        self.logger.info("Authorized domains for this crawl : %s" % str(self.authorized_domains))

        # Looping through found urls
        while True:
            if limit is not None and len(visited) > limit:
                self.logger.info("Max amount of pages reached ! (%d)" % limit)
                return pages

            self.logger.debug("%d url visited so far" % len(visited))
            seed_el = None  # Current element being computed, in while loop
            try:
                while True:
                    seed_el = found_links.pop(0)
                    if seed_el.url not in visited:
                        break
                visited.add(seed_el.url)  # A popped element is considered visited
            except IndexError:
                self.logger.info("No more links to visit for this website.")
                return pages

            # Fetching URL given in seed element in param
            self.logger.debug("Fetching " + seed_el.url)
            fresult = None
            retry = 0
            max_retry = 2  # TODO - VYS - Make this configurable
            while fresult is None and retry <= max_retry:
                try:
                    fresult = self.fetcher.fetch(seed_el.url, self.debug, timeout=10)
                    # If we're here it means that no more retries are needed, disable them
                    retry = max_retry + 1
                except Timeout:
                    self.logger.warning("Timeout while fetching %s%s" %
                                        (seed_el.url, (", lets retry (max retry %s)" % max_retry)
                                         if retry == 0 else (" - retry %s/%s" % (retry, max_retry))))
                    retry += 1
                    continue

            if fresult is None:
                continue

            if wait > 0:
                time.sleep(wait)

            # Let's do a quick check that we didn't get a redirect
            rurl30X = None
            if fresult.fetched_url != seed_el.url:
                rurl30X = fresult.fetched_url
                self.logger.warning("Got a redirect to %s when fetching %s" % (fresult.fetched_url, seed_el.url))

            dom = self._verify_and_parse_result(fresult, seed_el)
            if dom is None:
                self.logger.warning("Found no DOM for %s" % seed_el.url)
                continue

            # normalize root urls to avoid a double visit at http://www.example.com/ and http://www.example.com
            path = urllib.parse.urlparse(seed_el.url).path
            if path == '':
                seed_el.url += '/'

            self.logger.debug("Fetched [%s] " % seed_el.url)

            # If this page is the first one for this domain,
            # we check out if it contains a <meta http-equiv="refresh".
            # The same if this page is the second one,
            # because sometimes a redirection is followed by a frame.
            if len(visited) < 2:
                rurl = self._check_first_page(dom, seed_el.url)
                rurl = rurl if rurl is not None else rurl30X

                if rurl is not None:
                    domain = urllib.parse.urlparse(rurl).netloc
                    domain = domain.lower()
                    # If we are following a redirect, we also add it to the set of authorized domains
                    # to be able to follow next urls.
                    self.authorized_domains.add(domain)
                    if "www." in domain:
                        self.authorized_domains.add(domain.replace("www.", ""))
                    else:
                        self.authorized_domains.add("www." + domain)
                    self.logger.info("New authorized domains for this crawl : %s" % str(self.authorized_domains))

                    if seed_el.url not in visited:
                        visited.add(seed_el.url)

                    # Adding detected url to follow
                    ser = SeedElement(rurl, seed_el.groupid)
                    ser.depth = seed_el.depth + 1
                    found_links.append(ser)

            # If the new page url, after redirections, is outside authorized domains, don't use it
            if urllib.parse.urlparse(seed_el.url).netloc.lower() not in self.authorized_domains:
                self.logger.warning("redirection to %s exits the authorized domains, page not analyzed" % seed_el.url)
                continue

            if mode == CrawlMode.subpath and not self._is_authorized_subpath(init_seed_el.url, seed_el.url):
                self.logger.warning("subpath mode: redirection to %s exits the authorized subpath, page not analyzed" % seed_el.url)
                continue

            # ---
            # HTML computing
            # ---

            # Converting html into "clean" and interesting text
            relevant_txt = self.website.extract_meaningful_text(dom)

            # Builds a new Seed Element from popped element
            se = SeedElement(seed_el.url, seed_el.groupid)
            se.depth = seed_el.depth
            se.relevant_txt = relevant_txt

            if fresult is not None:
                se.html = fresult.webpage
                se.content_type = fresult.content_type
                se.charset = fresult.charset
                se.http_status = fresult.http_status
                se.headers = fresult.headers

            # Sometimes DOM is too deep to extract title properly
            se.title = self.website.extract_title(dom)
            pages[seed_el.url] = se
            visited.add(seed_el.url)  # May be different from original, because of redirections

            # This page has been computed, let's now extract its links
            # if max depth not reached
            if seed_el.depth + 1 > max_dom_depth:
                continue

            if mode != CrawlMode.single:
                found_links.extend(self._extract_links(dom, init_seed_el, seed_el, visited, mode))

        self.logger.debug("Out of while loop.")
        return pages

    def _get_base_url(self, dom, url):
        # check if there is a 'base' tag for link computation
        base_url = dom('base').attr('href')
        if base_url is None:
            base_url = url
        return base_url

    def _extract_links(self, dom, init_seed_el, seed_el, visited, mode):
        """
        Given a dom, extracts internal links to crawl
        """
        # ---
        # Link extraction and checking
        # ---
        links = {}
        selected_links = []
        added = set()

        # DOM is sometimes too deep to extract links properly
        try:
            links = self.htmltools.extract_doc_links(dom)
        except Exception as e:
            links = {}
            self.logger.warning("Impossible to extract links from %s : %s" % (seed_el.url, str(e)))

        base_url = self._get_base_url(dom, seed_el.url)

        for key in links:
            # We do not want anchors to be crawled
            key = key.split("#")[0]
            if len(key) < 1:
                continue

            url = None
            try:
                url = urllib.parse.urljoin(base_url, key)
            except Exception as e:
                # Invalid url, ignoring
                self.logger.warning("Invalid urljoin (%s,%s): %s" % (base_url, key, str(e)))
                continue

            # Trying to get eventual file extension, and to check its validity
            path = urllib.parse.urlparse(url).path
            if path == '':
                url += '/'
            else:
                ext = path.split('.')[-1].strip().lower()
                if ext in self.badextensions:
                    self.logger.debug("Bad extension [%s] in %s" % (ext, url))
                    continue

            # Let's check if it's an internal link, and not an outgoing one
            if urllib.parse.urlparse(url).netloc.lower() in self.authorized_domains and \
               url not in visited and url not in added:
                if mode == CrawlMode.subpath and not self._is_authorized_subpath(init_seed_el.url, url):
                    continue
                se = SeedElement(url, seed_el.groupid)
                se.depth = seed_el.depth + 1
                selected_links.append(se)
                added.add(url)

        return selected_links
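
# Hypothetical usage sketch (not part of the original source): the seed URL is a
# placeholder and the seed list is assumed to accept plain URLs; only Crawler.crawl()
# and CrawlMode used above are assumed.
def _example_crawl():
    crawler = Crawler(seedlist=["http://www.example.com/"], mode=CrawlMode.entire)
    pages = []
    # Single-process crawl (proc=None), 1 level deep inside the domain, 50 pages max,
    # with a 1 second courtesy delay between fetches
    for seed_element in crawler.crawl(proc=None, domain_depth=1, page_limit=50, wait_courtesy=1):
        pages.append(seed_element)
    return pages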