def __init__(self, url, config=None, extractor=None, **kwargs):
    if (url is None) or ('://' not in url) or (url[:4] != 'http'):
        raise ValueError('Input url is bad!')

    # Fall back to a default Configuration so extend_config never
    # receives None
    self.config = config or Configuration()
    self.config = utils.extend_config(self.config, kwargs)

    self.extractor = extractor

    self.url = url
    self.url = urls.prepare_url(url)

    self.domain = urls.get_domain(self.url)
    self.scheme = urls.get_scheme(self.url)

    self.categories = []
    self.feeds = []
    self.articles = []

    self.html = ''
    self.doc = None

    self.logo_url = ''
    self.favicon = ''
    self.brand = 'datahub'
    self.description = ''

    self.is_parsed = False
    self.is_downloaded = False
def __init__(self, url, config=None, **kwargs):
    """The config object for this source will be passed into all of this
    source's children articles unless specified otherwise or re-set.
    """
    if (url is None) or ('://' not in url) or (url[:4] != 'http'):
        raise ValueError('Input url is bad!')

    self.config = config or Configuration()
    self.config = utils.extend_config(self.config, kwargs)

    self.extractor = IntiExtractor(self.config)

    self.url = url
    self.url = urls.prepare_url(url)

    self.domain = urls.get_domain(self.url)
    self.scheme = urls.get_scheme(self.url)

    self.categories = []
    self.feeds = []
    self.articles = []

    self.html = ''
    self.doc = None

    self.logo_url = ''
    self.favicon = ''
    self.brand = tldextract.extract(self.url).domain
    self.description = ''

    self.is_parsed = False
    self.is_downloaded = False
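# A minimal usage sketch for the constructor above, assuming the
# enclosing class is named Source (the class name is not shown in this
# snippet). Only behavior visible in __init__ is exercised:
#
#     source = Source('http://www.cnn.com')
#     print(source.domain)  # 'www.cnn.com'
#     print(source.brand)   # 'cnn'
#
#     # Urls without an http(s) scheme are rejected up front:
#     try:
#         Source('ftp://example.com')
#     except ValueError:
#         print('rejected non-http url')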
def get_base_domain(url):
    """For example, the base url of uk.reuters.com => reuters.com
    """
    domain = get_domain(url)

    tld = '.'.join(domain.split('.')[-2:])
    if tld in ['co.uk', 'com.au', 'au.com']:  # edge cases
        end_chunks = domain.split('.')[-3:]
    else:
        end_chunks = domain.split('.')[-2:]
    base_domain = '.'.join(end_chunks)
    return base_domain
def get_base_domain(url):
    """Return the base (registered) domain of a url, e.g.
    uk.reuters.com => reuters.com
    """
    domain = get_domain(url)

    tld = '.'.join(domain.split('.')[-2:])
    if tld in ['co.uk', 'com.au', 'au.com']:  # two-part TLD edge cases
        end_chunks = domain.split('.')[-3:]
    else:
        end_chunks = domain.split('.')[-2:]
    base_domain = '.'.join(end_chunks)
    return base_domain
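# A short usage sketch for get_base_domain; the expected values follow
# directly from the split()/join() logic above (get_domain is assumed to
# return the url's netloc):
#
#     assert get_base_domain('http://uk.reuters.com/article') == 'reuters.com'
#     assert get_base_domain('http://www.bbc.co.uk/news') == 'bbc.co.uk'
#
# Only the three hard-coded two-part TLDs get the three-chunk treatment,
# so other compound TLDs fall back to the last two chunks:
#
#     assert get_base_domain('http://www.example.com.br/') == 'com.br'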
def get_category_urls(self, source_url, doc):
    """Inputs source lxml root and source url, extracts domain and
    finds all of the top level urls, we are assuming that these are
    the category urls.
    cnn.com --> [cnn.com/latest, world.cnn.com, cnn.com/asia]
    """
    page_urls = self.get_urls(doc)
    valid_categories = []
    for p_url in page_urls:
        scheme = urls.get_scheme(p_url, allow_fragments=False)
        domain = urls.get_domain(p_url, allow_fragments=False)
        path = urls.get_path(p_url, allow_fragments=False)

        if not domain and not path:
            if self.config.verbose:
                print('elim category url %s for no domain and path' % p_url)
            continue
        if path and path.startswith('#'):
            if self.config.verbose:
                print('elim category url %s path starts with #' % p_url)
            continue
        if scheme and (scheme != 'http' and scheme != 'https'):
            if self.config.verbose:
                print(('elim category url %s for bad scheme, '
                       'not http nor https' % p_url))
            continue

        if domain:
            child_tld = tldextract.extract(p_url)
            domain_tld = tldextract.extract(source_url)
            child_subdomain_parts = child_tld.subdomain.split('.')
            subdomain_contains = False
            for part in child_subdomain_parts:
                if part == domain_tld.domain:
                    if self.config.verbose:
                        print(('subdomain contains at %s and %s' %
                               (str(part), str(domain_tld.domain))))
                    subdomain_contains = True
                    break

            # Ex. microsoft.com is definitely not related to
            # espn.com, but espn.go.com is probably related to espn.com
            if not subdomain_contains and \
                    (child_tld.domain != domain_tld.domain):
                if self.config.verbose:
                    print(('elim category url %s for domain '
                           'mismatch' % p_url))
                continue
            elif child_tld.subdomain in ['m', 'i']:
                if self.config.verbose:
                    print(('elim category url %s for mobile '
                           'subdomain' % p_url))
                continue
            else:
                valid_categories.append(scheme + '://' + domain)
                # TODO account for case where category is in form
                # http://subdomain.domain.tld/category/ <-- still legal!

    stopwords = [
        'about', 'help', 'privacy', 'legal', 'feedback', 'sitemap',
        'profile', 'account', 'mobile', 'facebook', 'myspace', 'twitter',
        'linkedin', 'bebo', 'friendster', 'stumbleupon', 'youtube',
        'vimeo', 'store', 'mail', 'preferences', 'maps', 'password',
        'imgur', 'flickr', 'search', 'subscription', 'itunes',
        'siteindex', 'events', 'stop', 'jobs', 'careers', 'newsletter',
        'subscribe', 'academy', 'shopping', 'purchase', 'site-map',
        'shop', 'donate', 'product', 'advert', 'info', 'tickets',
        'coupons', 'forum', 'board', 'archive', 'browse', 'howto',
        'how to', 'faq', 'terms', 'charts', 'services', 'contact',
        'plus', 'admin', 'login', 'signup', 'register', 'developer',
        'proxy']

    _valid_categories = []

    # TODO Stop spamming urlparse and tldextract calls...

    for p_url in valid_categories:
        path = urls.get_path(p_url)
        subdomain = tldextract.extract(p_url).subdomain
        conjunction = path + ' ' + subdomain
        bad = False
        for badword in stopwords:
            if badword.lower() in conjunction.lower():
                if self.config.verbose:
                    print(('elim category url %s for subdomain '
                           'contains stopword!' % p_url))
                bad = True
                break
        if not bad:
            _valid_categories.append(p_url)

    _valid_categories.append('/')  # add the root

    for i, p_url in enumerate(_valid_categories):
        if p_url.startswith('://'):
            p_url = 'http' + p_url
            _valid_categories[i] = p_url
        elif p_url.startswith('//'):
            p_url = 'http:' + p_url
            _valid_categories[i] = p_url

        if p_url.endswith('/'):
            p_url = p_url[:-1]
            _valid_categories[i] = p_url

    _valid_categories = list(set(_valid_categories))

    category_urls = [urls.prepare_url(p_url, source_url)
                     for p_url in _valid_categories]
    category_urls = [c for c in category_urls if c is not None]
    return category_urls
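# A minimal driving sketch for get_category_urls above, assuming lxml.html
# for parsing and that self.get_urls(doc) collects the <a href> values from
# the document (get_urls itself is not shown in this snippet). IntiExtractor
# and Configuration are the names used by the constructor above:
#
#     import lxml.html
#
#     html = ('<a href="http://world.cnn.com/">World</a>'
#             '<a href="http://careers.cnn.com/">Jobs</a>'
#             '<a href="http://m.cnn.com/">Mobile</a>'
#             '<a href="http://microsoft.com/">Ad</a>')
#     doc = lxml.html.fromstring(html)
#
#     extractor = IntiExtractor(Configuration())
#     cats = extractor.get_category_urls('http://cnn.com', doc)
#
# Expected survivors (set order not guaranteed): 'http://world.cnn.com'
# plus the root 'http://cnn.com' contributed by the appended '/'.
# careers.cnn.com dies on the 'careers' stopword, m.cnn.com on the
# mobile-subdomain check, and microsoft.com on the domain-mismatch check.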