def sitemap_tree_for_homepage(homepage_url: str) -> AbstractSitemap:
    """Using a homepage URL, fetch the tree of sitemaps and its stories."""

    if not is_http_url(homepage_url):
        raise McSitemapsException("URL {} is not a HTTP(s) URL.".format(homepage_url))

    try:
        url = normalize_url(homepage_url)
    except Exception as ex:
        raise McSitemapsException("Unable to normalize URL {}: {}".format(homepage_url, ex))

    try:
        uri = furl(url)
    except Exception as ex:
        raise McSitemapsException("Unable to parse URL {}: {}".format(url, ex))

    if not is_homepage_url(homepage_url):
        try:
            uri = uri.remove(path=True, query=True, query_params=True, fragment=True)
            log.warning("Assuming that the homepage of {} is {}".format(homepage_url, uri.url))
        except Exception as ex:
            raise McSitemapsException("Unable to determine homepage URL for URL {}: {}".format(homepage_url, ex))

    uri.path = '/robots.txt'
    robots_txt_url = str(uri.url)

    robots_txt_fetcher = SitemapFetcher(url=robots_txt_url, recursion_level=0)
    sitemap_tree = robots_txt_fetcher.sitemap()

    return sitemap_tree
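# Usage sketch for sitemap_tree_for_homepage(). Illustration only: it assumes the
# returned AbstractSitemap exposes an all_pages() iterator yielding SitemapPage
# objects (consistent with the page() builder below, but not shown in this section).
def print_sitemap_pages(homepage_url: str) -> None:
    tree = sitemap_tree_for_homepage(homepage_url)
    for sitemap_page in tree.all_pages():  # all_pages() is an assumption
        print(sitemap_page.url)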
def __init__(self, url: str, recursion_level: int, ua: Optional[UserAgent] = None):

    if recursion_level > self.__MAX_RECURSION_LEVEL:
        raise McSitemapsException("Recursion level exceeded {} for URL {}.".format(self.__MAX_RECURSION_LEVEL, url))

    url = fix_common_url_mistakes(url)

    if not is_http_url(url):
        raise McSitemapsException("URL {} is not a HTTP(s) URL.".format(url))

    try:
        url = normalize_url(url)
    except Exception as ex:
        raise McSitemapsException("Unable to normalize URL {}: {}".format(url, ex))

    if not ua:
        ua = sitemap_useragent()

    self._url = url
    self._ua = ua
    self._recursion_level = recursion_level
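# Minimal construction sketch, mirroring the call in sitemap_tree_for_homepage()
# above: recursion_level starts at 0 for the top-level robots.txt fetch and is
# expected to increase as nested sitemap indexes are followed, which is what the
# __MAX_RECURSION_LEVEL guard protects against. The URL is a placeholder.
def _example_fetch_robots_txt_sitemap() -> AbstractSitemap:
    fetcher = SitemapFetcher(url='https://www.example.com/robots.txt', recursion_level=0)
    return fetcher.sitemap()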
def test_normalize_url():
    # Bad URLs
    with pytest.raises(mc_url.McNormalizeURLException):
        # noinspection PyTypeChecker
        mc_url.normalize_url(None)
    with pytest.raises(mc_url.McNormalizeURLException):
        mc_url.normalize_url('gopher://gopher.floodgap.com/0/v2/vstat')

    # Basic
    # (No urls_are_equal() because we want to compare them as strings here)
    assert mc_url.normalize_url('HTTP://CYBER.LAW.HARVARD.EDU:80/node/9244') == 'http://cyber.law.harvard.edu/node/9244'
    assert mc_url.normalize_url(
        'HTTP://WWW.GOCRICKET.COM/news/sourav-ganguly/Sourav-Ganguly-exclusive-MS-Dhoni-must-reinvent-himself'
        '-to-survive/articleshow_sg/40421328.cms?utm_source=facebook.com&utm_medium=referral'
    ) == 'http://www.gocricket.com/news/sourav-ganguly/Sourav-Ganguly-exclusive-MS-Dhoni-must-reinvent-himself-to-' \
         'survive/articleshow_sg/40421328.cms'

    # Multiple fragments
    assert mc_url.normalize_url(
        'HTTP://CYBER.LAW.HARVARD.EDU/node/9244#foo#bar'
    ) == 'http://cyber.law.harvard.edu/node/9244'

    # URL in query
    assert mc_url.normalize_url('http://bash.org/?244321') == 'http://bash.org/?244321'

    # Broken URL
    assert mc_url.normalize_url('http://http://www.al-monitor.com/pulse') == 'http://www.al-monitor.com/pulse'

    # Empty parameter
    assert mc_url.normalize_url(
        'http://www-nc.nytimes.com/2011/06/29/us/politics/29marriage.html?=_r%3D6'
    ) == 'http://www-nc.nytimes.com/2011/06/29/us/politics/29marriage.html'

    # Remove whitespace
    assert mc_url.normalize_url(
        ' http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html '
    ) == 'http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html'
    assert mc_url.normalize_url(
        "\t\thttp://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html\t\t"
    ) == 'http://blogs.perl.org/users/domm/2010/11/posting-utf8-data-using-lwpuseragent.html'

    # NYTimes
    assert mc_url.normalize_url(
        'http://boss.blogs.nytimes.com/2014/08/19/why-i-do-all-of-my-recruiting-through-linkedin/'
        '?smid=fb-nytimes&WT.z_sma=BU_WID_20140819&bicmp=AD&bicmlukp=WT.mc_id&bicmst=1388552400000'
        '&bicmet=1420088400000&_'
    ) == 'http://boss.blogs.nytimes.com/2014/08/19/why-i-do-all-of-my-recruiting-through-linkedin/'
    assert mc_url.normalize_url(
        'http://www.nytimes.com/2014/08/19/upshot/inequality-and-web-search-trends.html?smid=fb-nytimes&'
        'WT.z_sma=UP_IOA_20140819&bicmp=AD&bicmlukp=WT.mc_id&bicmst=1388552400000&bicmet=1420088400000&_r=1&'
        'abt=0002&abg=1'
    ) == 'http://www.nytimes.com/2014/08/19/upshot/inequality-and-web-search-trends.html'
    assert mc_url.normalize_url(
        'http://www.nytimes.com/2014/08/20/upshot/data-on-transfer-of-military-gear-to-police-departments.html'
        '?smid=fb-nytimes&WT.z_sma=UP_DOT_20140819&bicmp=AD&bicmlukp=WT.mc_id&bicmst=1388552400000&'
        'bicmet=1420088400000&_r=1&abt=0002&abg=1'
    ) == 'http://www.nytimes.com/2014/08/20/upshot/data-on-transfer-of-military-gear-to-police-departments.html'

    # Facebook
    assert mc_url.normalize_url(
        'https://www.facebook.com/BerkmanCenter?ref=br_tf'
    ) == 'https://www.facebook.com/BerkmanCenter'

    # LiveJournal
    assert mc_url.normalize_url(
        'http://zyalt.livejournal.com/1178735.html?thread=396696687#t396696687'
    ) == 'http://zyalt.livejournal.com/1178735.html'

    # "nk" parameter
    assert mc_url.normalize_url(
        'http://www.adelaidenow.com.au/news/south-australia/sa-court-told-prominent-adelaide-businessman-yasser'
        '-shahin-was-assaulted-by-police-officer-norman-hoy-in-september-2010-traffic-stop/story-fni6uo1m-'
        '1227184460050?nk=440cd48fd95a4e1f1c23bcd15df36da7'
    ) == (
        'http://www.adelaidenow.com.au/news/south-australia/sa-court-told-prominent-adelaide-businessman-yasser-'
        'shahin-was-assaulted-by-police-officer-norman-hoy-in-september-2010-traffic-stop/story-fni6uo1m-'
        '1227184460050'
    )
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links"""
    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Get URL after HTTP / HTML redirects
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    urls = {
        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(html=data_after_redirects, base_url=url_after_redirects)
        if url_link_rel_canonical is not None and len(url_link_rel_canonical) > 0:
            log.debug(
                (
                    'Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                    '(original URL: %(url)s): %(url_link_rel_canonical)s'
                ) % {
                    "url_after_redirects": url_after_redirects,
                    "url": url,
                    "url_link_rel_canonical": url_link_rel_canonical,
                }
            )
            urls['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {key: urls[key] for key in urls.keys() if not is_homepage_url(urls[key])}

    distinct_urls = list(set(urls.values()))

    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)

    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [x for x in distinct_urls if not re.search(pattern=invalid_url_variant_regex, string=x)]

    return distinct_urls
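# Illustrative call to all_url_variants(); a sketch, not part of the original code.
# It assumes a connected DatabaseHandler is available; the URL reuses the example
# from the homepage-redirect comment above.
def _example_all_url_variants(db: DatabaseHandler) -> None:
    variants = all_url_variants(
        db=db,
        url='http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/',
    )
    for variant in variants:
        log.info("URL variant: %s" % variant)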
def page(self) -> Optional[SitemapPage]:
    """Return constructed sitemap page if one has been completed, otherwise None."""

    # Required
    url = html_unescape_strip(self.url)
    if not url:
        log.error("URL is unset")
        return None

    try:
        url = normalize_url(url)
    except Exception as ex:
        log.error("Unable to normalize URL {}: {}".format(url, ex))
        return None

    last_modified = html_unescape_strip(self.last_modified)
    if last_modified:
        last_modified = parse_sitemap_publication_date(last_modified)

    change_frequency = html_unescape_strip(self.change_frequency)
    if change_frequency:
        change_frequency = SitemapPageChangeFrequency(change_frequency.lower())
        assert isinstance(change_frequency, SitemapPageChangeFrequency)

    priority = html_unescape_strip(self.priority)
    if priority:
        priority = Decimal(priority)

        # Decimal.compare() returns Decimal('-1'), Decimal('0') or Decimal('1')
        comp_zero = priority.compare(Decimal('0.0'))
        comp_one = priority.compare(Decimal('1.0'))
        if comp_zero in (Decimal('0'), Decimal('1')) and comp_one in (Decimal('0'), Decimal('-1')):
            # 0 <= priority <= 1
            pass
        else:
            log.warning("Priority is not within 0 and 1: {}".format(priority))
            priority = SITEMAP_PAGE_DEFAULT_PRIORITY
    else:
        priority = SITEMAP_PAGE_DEFAULT_PRIORITY

    news_title = html_unescape_strip(self.news_title)

    news_publish_date = html_unescape_strip(self.news_publish_date)
    if news_publish_date:
        news_publish_date = parse_sitemap_publication_date(date_string=news_publish_date)

    news_publication_name = html_unescape_strip(self.news_publication_name)
    news_publication_language = html_unescape_strip(self.news_publication_language)
    news_access = html_unescape_strip(self.news_access)

    news_genres = html_unescape_strip(self.news_genres)
    if news_genres:
        news_genres = [x.strip() for x in news_genres.split(',')]
    else:
        news_genres = []

    news_keywords = html_unescape_strip(self.news_keywords)
    if news_keywords:
        news_keywords = [x.strip() for x in news_keywords.split(',')]
    else:
        news_keywords = []

    news_stock_tickers = html_unescape_strip(self.news_stock_tickers)
    if news_stock_tickers:
        news_stock_tickers = [x.strip() for x in news_stock_tickers.split(',')]
    else:
        news_stock_tickers = []

    sitemap_news_story = None
    if news_title and news_publish_date:
        sitemap_news_story = SitemapNewsStory(
            title=news_title,
            publish_date=news_publish_date,
            publication_name=news_publication_name,
            publication_language=news_publication_language,
            access=news_access,
            genres=news_genres,
            keywords=news_keywords,
            stock_tickers=news_stock_tickers,
        )

    return SitemapPage(
        url=url,
        last_modified=last_modified,
        change_frequency=change_frequency,
        priority=priority,
        news_story=sitemap_news_story,
    )
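# A standalone sketch of the "0 <= priority <= 1" test above, for clarity. Since
# Decimal supports rich comparison directly, this simpler form is equivalent to
# the Decimal.compare() version used in page(); it is an illustration, not the
# original code.
from decimal import Decimal

def priority_is_valid(priority: Decimal) -> bool:
    return Decimal('0.0') <= priority <= Decimal('1.0')

assert priority_is_valid(Decimal('0.5'))
assert not priority_is_valid(Decimal('1.1'))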
def __normalize_media_url(output_dir: str, media_id: str, url: str, queue: multiprocessing.Queue) -> None:
    if is_http_url(url):
        normalized_url = normalize_url(url)
        output_file = os.path.join(output_dir, media_id)
        queue.put((output_file, normalized_url,))
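# Minimal sketch of a direct call to __normalize_media_url(); the output directory,
# media ID and URL are placeholder values, not part of the original code.
def _example_normalize_one_media_url() -> None:
    queue = multiprocessing.Queue()
    __normalize_media_url(
        output_dir='/tmp/normalized-urls',  # placeholder path
        media_id='12345',                   # placeholder media ID
        url='http://www.example.com/?utm_source=feed',
        queue=queue,
    )
    output_file, normalized_url = queue.get()
    log.info("Would write %s to %s" % (normalized_url, output_file))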