Example #1
    def get_follow_http_html_redirects(self, url: str) -> Response:
        """GET an URL while resolving HTTP / HTML redirects."""

        url = decode_object_from_bytes_if_needed(url)

        if url is None:
            raise McGetFollowHTTPHTMLRedirectsException("URL is None.")

        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McGetFollowHTTPHTMLRedirectsException("URL is not HTTP(s): %s" % url)

        if self.max_redirect() == 0:
            raise McGetFollowHTTPHTMLRedirectsException(
                "User agent's max_redirect is 0, subroutine might loop indefinitely."
            )

        response = self.get(url)

        response_after_redirects = self.__get_follow_http_html_redirects(
            response_=response,
            meta_redirects_left=self.max_redirect()
        )
        if response_after_redirects is None:
            # One of the redirects failed -- return original response
            return response

        else:
            return response_after_redirects
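
A minimal usage sketch for the method above, assuming a mediawords UserAgent whose max_redirect() is non-zero; it uses only the Response accessors (request(), is_success()) that appear elsewhere on this page.

ua = UserAgent()
response = ua.get_follow_http_html_redirects('http://example.com/old-path')

# Final URL after both HTTP (301/302) and HTML (<meta http-equiv="refresh">) redirects
print(response.request().url())
print(response.is_success())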
Example #2
    def __init__(self,
                 url: str,
                 recursion_level: int,
                 ua: Optional[UserAgent] = None):

        if recursion_level > self.__MAX_RECURSION_LEVEL:
            raise McSitemapsException(
                "Recursion level exceeded {} for URL {}.".format(
                    self.__MAX_RECURSION_LEVEL, url))

        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McSitemapsException(
                "URL {} is not a HTTP(s) URL.".format(url))

        try:
            url = normalize_url(url)
        except Exception as ex:
            raise McSitemapsException("Unable to normalize URL {}: {}".format(
                url, ex))

        if not ua:
            ua = sitemap_useragent()

        self._url = url
        self._ua = ua
        self._recursion_level = recursion_level
Example #3
def sitemap_tree_for_homepage(homepage_url: str) -> AbstractSitemap:
    """Using a homepage URL, fetch the tree of sitemaps and its stories."""

    if not is_http_url(homepage_url):
        raise McSitemapsException("URL {} is not a HTTP(s) URL.".format(homepage_url))

    try:
        url = normalize_url(homepage_url)
    except Exception as ex:
        raise McSitemapsException("Unable to normalize URL {}: {}".format(homepage_url, ex))

    try:
        uri = furl(url)
    except Exception as ex:
        raise McSitemapsException("Unable to parse URL {}: {}".format(url, ex))

    if not is_homepage_url(homepage_url):
        try:
            uri = uri.remove(path=True, query=True, query_params=True, fragment=True)
            log.warning("Assuming that the homepage of {} is {}".format(homepage_url, uri.url))
        except Exception as ex:
            raise McSitemapsException("Unable to determine homepage URL for URL {}: {}".format(homepage_url, ex))

    uri.path = '/robots.txt'
    robots_txt_url = str(uri.url)

    robots_txt_fetcher = SitemapFetcher(url=robots_txt_url, recursion_level=0)
    sitemap_tree = robots_txt_fetcher.sitemap()
    return sitemap_tree
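
A minimal call sketch for sitemap_tree_for_homepage(); the homepage URL below is made up. Since it is not a homepage URL, its path is stripped first and robots.txt is fetched from the site root.

tree = sitemap_tree_for_homepage('https://www.example.com/news/latest')
# robots.txt is requested from https://www.example.com/robots.txt and every
# "Sitemap:" entry found there is fetched recursively into the returned tree
print(tree)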
Example #4
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Optional[Response]:
        download = decode_object_from_bytes_if_needed(download)

        url = self._download_url(download=download)
        if not is_http_url(url):
            raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        try:
            db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)
        except McTupleAlreadyMovedError as ex:
            # Some attempts to set the download's row to "fetching" fail with:
            #
            #   "tuple to be locked was already moved to another partition due to concurrent update"
            #
            # If that happens, we assume that some other fetcher instance somehow got to the download first and do
            # nothing
            log.warning(f"Some other fetcher got to download {download['downloads_id']} first: {ex}")
            return None
        except Exception as ex:
            # Raise further on misc. errors
            raise ex

        ua = UserAgent()
        response = ua.get_follow_http_html_redirects(url)

        return response
Example #5
    def url_path_has_mp3_extension(self) -> bool:
        """Return True if URL's path has .mp3 extension."""
        if is_http_url(self.url):
            uri = furl(self.url)
            if '.mp3' in str(uri.path).lower():
                return True
        return False
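
A standalone sketch of the path check above, assuming only the furl package; the real method additionally requires is_http_url() to pass first.

from furl import furl

for candidate in ('https://example.com/episode.mp3?session=1',
                  'https://example.com/page.html'):
    # Only the URL path is inspected, so query parameters don't affect the result
    print(candidate, '.mp3' in str(furl(candidate).path).lower())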
Example #6
def __get_meta_refresh_url_from_tag(inner_tag: str, inner_base_url: Optional[str] = None) -> Optional[str]:
    """Given a <meta ...> tag, return the url from the content="url=XXX" attribute.

    Return None if no such URL is found.
    """
    if not re.search(r'http-equiv\s*?=\s*?["\']\s*?refresh\s*?["\']', inner_tag, re.I):
        return None

    # content="url='http://foo.bar'"
    inner_url = None

    match = re.search(r'content\s*?=\s*?"\d*?\s*?;?\s*?URL\s*?=\s*?\'(.+?)\'', inner_tag, re.I)
    if match:
        inner_url = match.group(1)
    else:
        # content='url="http://foo.bar"'
        match = re.search(r'content\s*?=\s*?\'\d*?\s*?;?\s*?URL\s*?=\s*?"(.+?)"', inner_tag, re.I)
        if match:
            inner_url = match.group(1)
        else:
            # Fallback
            match = re.search(r'content\s*?=\s*?["\']\d*?\s*?;?\s*?URL\s*?=\s*?(.+?)["\']', inner_tag, re.I)
            if match:
                inner_url = match.group(1)

    if inner_url is None:
        return None

    if is_http_url(inner_url):
        return inner_url

    if inner_base_url is not None:
        return urljoin(base=str(inner_base_url), url=inner_url)

    return None
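
A minimal call sketch, assuming the helper above is in scope together with the module-level imports it relies on (re, urljoin, is_http_url); the tag and base URL are made up.

tag = '''<meta http-equiv="refresh" content="0; URL='/new-page'">'''
print(__get_meta_refresh_url_from_tag(tag, inner_base_url='http://example.com/old-page'))
# The target is relative, so it is resolved against the base URL: http://example.com/new-page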
Example #7
def get_links_from_html(html: str) -> typing.List[str]:
    """Return a list of all links that appear in the html.

    Only return absolute urls, because we would rather get fewer internal media source links.  Also include embedded
    youtube video urls.

    Arguments:
    html - html to parse

    Returns:
    list of string urls

    """
    soup = BeautifulSoup(html, 'lxml')

    links = []

    # get everything with an href= element rather than just <a /> links
    for tag in soup.find_all(href=True):
        url = tag['href']

        if re.search(IGNORE_LINK_PATTERN, url, flags=re.I) is not None:
            continue

        if not is_http_url(url):
            continue

        url = re.sub(r'(https)?://www[a-z0-9]+.nytimes', r'\1://www.nytimes', url, flags=re.I)

        links.append(url)

    return links
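
A minimal call sketch, assuming the module-level IGNORE_LINK_PATTERN and is_http_url() that the function relies on are importable and that example.com is not covered by the ignore pattern.

html = '<a href="https://example.com/story">story</a> <a href="/local/page">relative</a>'
print(get_links_from_html(html))
# Only the absolute HTTP(S) link survives: ['https://example.com/story']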
Example #8
    def get_follow_http_html_redirects(self, url: str) -> Response:
        """GET an URL while resolving HTTP / HTML redirects."""

        url = decode_object_from_bytes_if_needed(url)

        if url is None:
            raise McGetFollowHTTPHTMLRedirectsException("URL is None.")

        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McGetFollowHTTPHTMLRedirectsException(
                "URL is not HTTP(s): %s" % url)

        if self.max_redirect() == 0:
            raise McGetFollowHTTPHTMLRedirectsException(
                "User agent's max_redirect is 0, subroutine might loop indefinitely."
            )

        response = self.get(url)

        response_after_redirects = self.__get_follow_http_html_redirects(
            response_=response, meta_redirects_left=self.max_redirect())
        if response_after_redirects is None:
            # One of the redirects failed -- return original response
            return response

        else:
            return response_after_redirects
Example #9
def link_canonical_url_from_html(html: str,
                                 base_url: str = None) -> Optional[str]:
    """From the provided HTML, determine the <link rel="canonical" /> URL (if any)."""
    html = decode_object_from_bytes_if_needed(html)
    base_url = decode_object_from_bytes_if_needed(base_url)

    link_elements = re.findall(r'(<\s*?link.+?>)', html, re.I)
    for link_element in link_elements:
        if re.search(r'rel\s*?=\s*?["\']\s*?canonical\s*?["\']', link_element,
                     re.I):
            url = re.search(r'href\s*?=\s*?["\'](.+?)["\']', link_element,
                            re.I)
            if url:
                url = url.group(1)
                if not is_http_url(url):
                    # Maybe it's absolute path?
                    if base_url is not None:
                        return urljoin(base=base_url, url=url)
                    else:
                        log.debug(
                            "HTML <link rel='canonical'/> found, but the new URL '%s' doesn't seem to be valid."
                            % url)
                else:
                    # Looks like URL, so return it
                    return url
    return None
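
A minimal call sketch for the function above; the page and base URL are made up.

html = '<head><link rel="canonical" href="/canonical-article" /></head>'
print(link_canonical_url_from_html(html, base_url='http://example.com/article?id=42'))
# The href is relative, so it is joined with the base URL: http://example.com/canonical-article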
Example #10
    def sitemap(self) -> AbstractSitemap:

        # Serves as an ordered set because we want to deduplicate URLs but keep their order
        sitemap_urls = OrderedDict()

        for robots_txt_line in self._content.splitlines():
            robots_txt_line = robots_txt_line.strip()
            # robots.txt is supposed to be case sensitive but who cares in these Node.js times?
            robots_txt_line = robots_txt_line.lower()
            sitemap_match = re.search(r'^sitemap: (.+?)$', robots_txt_line, flags=re.IGNORECASE)
            if sitemap_match:
                sitemap_url = sitemap_match.group(1)
                if is_http_url(sitemap_url):
                    sitemap_urls[sitemap_url] = True
                else:
                    log.debug("Sitemap URL {} doesn't look like an URL, skipping".format(sitemap_url))

        sub_sitemaps = []

        for sitemap_url in sitemap_urls.keys():
            fetcher = SitemapFetcher(url=sitemap_url, recursion_level=self._recursion_level, ua=self._ua)
            fetched_sitemap = fetcher.sitemap()
            sub_sitemaps.append(fetched_sitemap)

        # noinspection PyArgumentList
        index_sitemap = IndexRobotsTxtSitemap(url=self._url, sub_sitemaps=sub_sitemaps)

        return index_sitemap
Example #11
def meta_refresh_url_from_html(html: str,
                               base_url: Optional[str] = None
                               ) -> Optional[str]:
    """From the provided HTML, determine the <meta http-equiv="refresh" /> URL (if any)."""
    def __get_meta_refresh_url_from_tag(
            inner_tag: str,
            inner_base_url: Optional[str] = None) -> Optional[str]:
        """Given a <meta ...> tag, return the url from the content="url=XXX" attribute.

        Return None if no such URL is found.
        """
        if not re.search(r'http-equiv\s*?=\s*?["\']\s*?refresh\s*?["\']',
                         inner_tag, re.I):
            return None

        # content="url='http://foo.bar'"
        inner_url = None

        match = re.search(
            r'content\s*?=\s*?"\d*?\s*?;?\s*?URL\s*?=\s*?\'(.+?)\'', inner_tag,
            re.I)
        if match:
            inner_url = str(match.group(1))
        else:
            # content='url="http://foo.bar"'
            match = re.search(
                r'content\s*?=\s*?\'\d*?\s*?;?\s*?URL\s*?=\s*?"(.+?)"',
                inner_tag, re.I)
            if match:
                inner_url = str(match.group(1))
            else:
                # Fallback
                match = re.search(
                    r'content\s*?=\s*?["\']\d*?\s*?;?\s*?URL\s*?=\s*?(.+?)["\']',
                    inner_tag, re.I)
                if match:
                    inner_url = str(match.group(1))

        # Guard against no matching content= attribute (otherwise urljoin() below
        # would be called with the literal string 'None')
        if inner_url is None:
            return None

        if is_http_url(str(inner_url)):
            return inner_url

        if inner_base_url is not None:
            return urljoin(base=str(inner_base_url), url=str(inner_url))

        return None

    html = str(decode_object_from_bytes_if_needed(html))
    base_url_decode = decode_object_from_bytes_if_needed(base_url)
    base_url = None if base_url_decode is None else str(base_url_decode)

    if not is_http_url(str(base_url)):
        log.info("Base URL is not HTTP(s): %s" % base_url)

    tags = re.findall(r'(<\s*meta[^>]+>)', html, re.I)
    for tag in tags:
        url = __get_meta_refresh_url_from_tag(tag, base_url)
        if url is not None:
            return url

    return None
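
A minimal call sketch for the function above; the snippet uses the unquoted content= form that the fallback regex handles.

html = '<head><meta http-equiv="refresh" content="5; url=http://example.com/landing"></head>'
print(meta_refresh_url_from_html(html, base_url='http://example.com/'))
# Prints the absolute refresh target: http://example.com/landing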
Example #12
def link_canonical_url_from_html(html: str, base_url: Optional[str] = None) -> Optional[str]:
    """From the provided HTML, determine the <link rel="canonical" /> URL (if any)."""
    html = str(decode_object_from_bytes_if_needed(html))

    base_url_decode = decode_object_from_bytes_if_needed(base_url)
    base_url = None if base_url_decode is None else str(base_url_decode)

    link_elements = re.findall(r'(<\s*?link.+?>)', html, re.I)
    for link_element in link_elements:
        if re.search(r'rel\s*?=\s*?["\']\s*?canonical\s*?["\']', link_element, re.I):
            match = re.search(r'href\s*?=\s*?["\'](.+?)["\']', link_element, re.I)
            if match:
                url = match.group(1)
                if not is_http_url(url):
                    # Maybe it's absolute path?
                    if base_url is not None:
                        return urljoin(base=base_url, url=url)
                    else:
                        log.debug(
                            "HTML <link rel='canonical'/> found, but the new URL '%s' doesn't seem to be valid." % url
                        )
                else:
                    # Looks like URL, so return it
                    return url
    return None
Example #13
def _get_links_from_html(html: str) -> List[str]:
    """Return a list of all links that appear in the html.

    Only return absolute urls, because we would rather get fewer internal media source links.  Also include embedded
    youtube video urls.

    Arguments:
    html - html to parse

    Returns:
    list of string urls

    """
    soup = BeautifulSoup(html, 'lxml')

    links = []

    # get everything with an href= element rather than just <a /> links
    for tag in soup.find_all(href=True):
        url = tag['href']

        if re.search(IGNORE_LINK_PATTERN, url, flags=re.I) is not None:
            continue

        if not is_http_url(url):
            continue

        url = re.sub(r'(https)?://www[a-z0-9]+.nytimes',
                     r'\1://www.nytimes',
                     url,
                     flags=re.I)

        links.append(url)

    return links
Example #14
    def sitemap(self) -> AbstractSitemap:

        # Serves as an ordered set because we want to deduplicate URLs but keep their order
        sitemap_urls = OrderedDict()

        for robots_txt_line in self._content.splitlines():
            robots_txt_line = robots_txt_line.strip()
            # robots.txt is supposed to be case sensitive but who cares in these Node.js times?
            robots_txt_line = robots_txt_line.lower()
            sitemap_match = re.search(r'^sitemap: (.+?)$',
                                      robots_txt_line,
                                      flags=re.IGNORECASE)
            if sitemap_match:
                sitemap_url = sitemap_match.group(1)
                if is_http_url(sitemap_url):
                    sitemap_urls[sitemap_url] = True
                else:
                    log.debug(
                        "Sitemap URL {} doesn't look like an URL, skipping".
                        format(sitemap_url))

        sub_sitemaps = []

        for sitemap_url in sitemap_urls.keys():
            fetcher = SitemapFetcher(url=sitemap_url,
                                     recursion_level=self._recursion_level,
                                     ua=self._ua)
            fetched_sitemap = fetcher.sitemap()
            sub_sitemaps.append(fetched_sitemap)

        # noinspection PyArgumentList
        index_sitemap = IndexRobotsTxtSitemap(url=self._url,
                                              sub_sitemaps=sub_sitemaps)

        return index_sitemap
Example #15
def test_get_links_from_html():
    def test_links(html_: str, links_: list) -> None:
        assert _get_links_from_html(html_) == links_

    test_links('<a href="http://foo.com">', ['http://foo.com'])
    test_links('<link href="http://bar.com">', ['http://bar.com'])
    test_links('<img src="http://img.tag">', [])

    test_links('<a href="http://foo.com"/> <a href="http://bar.com"/>',
               ['http://foo.com', 'http://bar.com'])

    # transform nyt urls
    test_links('<a href="http://www3.nytimes.com/foo/bar">',
               ['http://www.nytimes.com/foo/bar'])

    # ignore relative urls
    test_links('<a href="/foo/bar">', [])

    # ignore invalid urls
    test_links(r'<a href="http:\\foo.bar">', [])

    # ignore urls from ignore pattern
    test_links('<a href="http://www.addtoany.com/http://foo.bar">', [])
    test_links(
        '<a href="https://en.unionpedia.org/c/SOE_F_Section_timeline/vs/Special_Operations_Executive">',
        [])
    test_links('<a href="http://digg.com/submit/this">', [])
    test_links('<a href="http://politicalgraveyard.com/>', [])
    test_links(
        '<a href="http://api.bleacherreport.com/api/v1/tags/cm-punk.json">',
        [])
    test_links('<a href="http://apidomain.com">', ['http://apidomain.com'])
    test_links(
        '<a href="http://www.rumormillnews.com/cgi-bin/forum.cgi?noframes;read=54990">',
        [])
    test_links(
        '<a href="http://tvtropes.org/pmwiki/pmwiki.php/Main/ClockTower">', [])
    test_links('<a href="https://twitter.com/account/suspended">', [])
    test_links('<a href="https://misuse.ncbi.nlm.nih.gov/error/abuse.shtml">',
               [])
    test_links(
        '<a href="https://assets.feedblitzstatic.com/images/blank.gif">', [])
    test_links('<a href="https://accounts.google.com/ServiceLogin">', [])
    test_links(
        '<a href="http://network.wwe.com/video/v92665683/milestone/526767283">',
        [])

    # sanity test to make sure that we are able to get all of the links from a real html page
    filename = '/opt/mediacloud/tests/data/html-strip/strip.html'
    with open(filename, 'r', encoding='utf8') as fh:
        html = fh.read()

    links = _get_links_from_html(html)
    assert len(links) == 310
    for link in links:
        assert is_http_url(link)
Example #16
def _fetch_url(db: DatabaseHandler,
               url: str,
               network_down_host: str = DEFAULT_NETWORK_DOWN_HOST,
               network_down_port: int = DEFAULT_NETWORK_DOWN_PORT,
               network_down_timeout: int = DEFAULT_NETWORK_DOWN_TIMEOUT,
               domain_timeout: Optional[int] = None) -> FetchLinkResponse:
    """Fetch a url and return the content.

    If fetching the url results in a 400 error, check whether the network_down_host is accessible.  If so,
    return the errored response.  Otherwise, wait network_down_timeout seconds and try again.

    This function catches McGetException and returns a dummy 400 Response object.

    Arguments:
    db - db handle
    url - url to fetch
    network_down_host - host to check if network is down on error
    network_down_port - port to check if network is down on error
    network_down_timeout - seconds to wait if the network is down
    domain_timeout - value to pass to ThrottledUserAgent()

    Returns:
    Response object
    """
    if url_has_binary_extension(url):
        return _make_dummy_bypassed_response(url)

    while True:
        ua = ThrottledUserAgent(db, domain_timeout=domain_timeout)

        if is_http_url(url):
            ua_response = ua.get_follow_http_html_redirects(url)
            response = FetchLinkResponse.from_useragent_response(
                url, ua_response)
        else:
            log.warning(f"URL is not HTTP(s), returning dummy response: {url}")
            response = FetchLinkResponse(
                url=url,
                is_success=False,
                code=HTTPStatus.BAD_REQUEST.value,
                message=HTTPStatus.BAD_REQUEST.phrase,
                content='bad url',
                last_requested_url=None,
            )

        if response.is_success:
            return response

        if response.code == HTTPStatus.BAD_REQUEST.value and not tcp_port_is_open(
                port=network_down_port, hostname=network_down_host):
            log.warning(
                "Response failed with %s and network is down.  Waiting to retry ..."
                % (url, ))
            time.sleep(network_down_timeout)
        else:
            return response
Example #17
    def xml_element_end(self, name: str) -> None:

        if name == 'sitemap:loc':
            sub_sitemap_url = html_unescape_strip(self._last_char_data)
            if not is_http_url(sub_sitemap_url):
                log.warning("Sub-sitemap URL does not look like one: {}".format(sub_sitemap_url))

            else:
                if sub_sitemap_url not in self._sub_sitemap_urls:
                    self._sub_sitemap_urls.append(sub_sitemap_url)

        super().xml_element_end(name=name)
Example #18
def target_request_from_alarabiya_url(
        content: str, archive_site_url: str) -> Union[Request, None]:
    """alarabiya uses an interstitial that requires JavaScript. If the download URL matches alarabiya and returns the
    'requires JavaScript' page, manually parse out the necessary cookie and add it to the $ua so that the request will
    work."""

    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if not is_http_url(archive_site_url):
        log.error("Archive site URL is not HTTP(s): %s" % archive_site_url)
        return None

    if content is None:
        return None

    if not re.search(
            pattern='alarabiya', string=archive_site_url, flags=re.IGNORECASE):
        return None

    if not re.search(
            pattern='This site requires JavaScript and Cookies to be enabled',
            string=content,
            flags=re.IGNORECASE):
        return None

    matches = re.search(
        pattern=
        r"setCookie\('(?P<cookie_name>[^']+)', '(?P<cookie_value>[^']+)'",
        string=content,
        flags=re.IGNORECASE)
    if matches:
        cookie_name = matches.group('cookie_name')
        cookie_value = matches.group('cookie_value')

        request = Request(method='GET', url=archive_site_url)
        request.set_header(name='Cookie',
                           value="%s=%s" % (
                               cookie_name,
                               cookie_value,
                           ))
        return request

    else:
        log.warning("Unable to parse cookie from alarabiya URL %s: %s" % (
            archive_site_url,
            content,
        ))

    return None
Example #19
    def xml_element_end(self, name: str) -> None:

        if name == 'sitemap:loc':
            sub_sitemap_url = html_unescape_strip(self._last_char_data)
            if not is_http_url(sub_sitemap_url):
                log.warning(
                    "Sub-sitemap URL does not look like one: {}".format(
                        sub_sitemap_url))

            else:
                if sub_sitemap_url not in self._sub_sitemap_urls:
                    self._sub_sitemap_urls.append(sub_sitemap_url)

        super().xml_element_end(name=name)
Example #20
    def set_url(self, url: str) -> None:
        """Set URL, e.g. https://www.mediacloud.org/page.html"""
        url = decode_object_from_bytes_if_needed(url)
        if url is None:
            raise McUserAgentRequestException("URL is None.")
        if len(url) == 0:
            raise McUserAgentRequestException("URL is empty.")

        # Might be coming from "requests" which managed to fetch a bogus URL but we deem it to be invalid
        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McUserAgentRequestException("URL is not HTTP(s): %s" % str(url))

        self.__url = url
Example #21
def meta_refresh_url_from_html(html: str, base_url: Optional[str] = None) -> Optional[str]:
    """From the provided HTML, determine the <meta http-equiv="refresh" /> URL (if any)."""

    html = str(decode_object_from_bytes_if_needed(html))
    base_url_decode = decode_object_from_bytes_if_needed(base_url)
    base_url = None if base_url_decode is None else str(base_url_decode)

    if not is_http_url(str(base_url)):
        log.info("Base URL is not HTTP(s): %s" % base_url)

    tags = re.findall(r'(<\s*meta[^>]+>)', html, re.I)
    for tag in tags:
        url = __get_meta_refresh_url_from_tag(tag, base_url)
        if url is not None:
            return url

    return None
Example #22
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
        download = decode_object_from_bytes_if_needed(download)

        url = self._download_url(download=download)
        if not is_http_url(url):
            raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        db.update_by_id(table='downloads',
                        object_id=download['downloads_id'],
                        update_hash=download)

        ua = UserAgent()
        response = ua.get_follow_http_html_redirects(url)

        return response
Example #23
    def get(self, url: str) -> Response:
        """GET an URL."""
        url = decode_object_from_bytes_if_needed(url)

        if url is None:
            raise McGetException("URL is None.")

        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McGetException("URL is not HTTP(s): %s" % url)

        # Add HTTP authentication
        url = self.__url_with_http_auth(url=url)

        request = Request(method='GET', url=url)

        return self.request(request)
Example #24
    def __get_meta_refresh_url_from_tag(
            inner_tag: str,
            inner_base_url: Optional[str] = None) -> Optional[str]:
        """Given a <meta ...> tag, return the url from the content="url=XXX" attribute.

        Return None if no such URL is found.
        """
        if not re.search(r'http-equiv\s*?=\s*?["\']\s*?refresh\s*?["\']',
                         inner_tag, re.I):
            return None

        # content="url='http://foo.bar'"
        inner_url = None

        match = re.search(
            r'content\s*?=\s*?"\d*?\s*?;?\s*?URL\s*?=\s*?\'(.+?)\'', inner_tag,
            re.I)
        if match:
            inner_url = match.group(1)
        else:
            # content='url="http://foo.bar"'
            match = re.search(
                r'content\s*?=\s*?\'\d*?\s*?;?\s*?URL\s*?=\s*?"(.+?)"',
                inner_tag, re.I)
            if match:
                inner_url = match.group(1)
            else:
                # Fallback
                match = re.search(
                    r'content\s*?=\s*?["\']\d*?\s*?;?\s*?URL\s*?=\s*?(.+?)["\']',
                    inner_tag, re.I)
                if match:
                    inner_url = match.group(1)

        if inner_url is None:
            return None

        if is_http_url(inner_url):
            return inner_url

        if inner_base_url is not None:
            return urljoin(base=str(inner_base_url), url=inner_url)

        return None
Example #25
    def get(self, url: str) -> Response:
        """GET an URL."""
        log.debug("mediawords.util.web.user_agent.get: %s" % url)
        url = decode_object_from_bytes_if_needed(url)

        if url is None:
            raise McGetException("URL is None.")

        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McGetException("URL is not HTTP(s): %s" % url)

        # Add HTTP authentication
        url = self.__url_with_http_auth(url=url)

        request = Request(method='GET', url=url)

        return self.request(request)
Example #26
def meta_refresh_url_from_html(html: str,
                               base_url: Optional[str] = None
                               ) -> Optional[str]:
    """From the provided HTML, determine the <meta http-equiv="refresh" /> URL (if any)."""

    html = str(decode_object_from_bytes_if_needed(html))
    base_url_decode = decode_object_from_bytes_if_needed(base_url)
    base_url = None if base_url_decode is None else str(base_url_decode)

    if not is_http_url(str(base_url)):
        log.info("Base URL is not HTTP(s): %s" % base_url)

    tags = re.findall(r'(<\s*meta[^>]+>)', html, re.I)
    for tag in tags:
        url = __get_meta_refresh_url_from_tag(tag, base_url)
        if url is not None:
            return url

    return None
Example #27
def target_request_from_meta_refresh_url(
        content: str, archive_site_url: str) -> Union[Request, None]:
    """Given a URL and content from website with META refresh, return a request for the original URL."""

    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if content is None:
        return None

    target_url = meta_refresh_url_from_html(html=content,
                                            base_url=archive_site_url)
    if target_url is None:
        return None

    if not is_http_url(target_url):
        log.error("URL matched, but is not HTTP(s): %s" % target_url)
        return None

    return Request(method='GET', url=target_url)
Example #28
def target_request_from_archive_is_url(
        content: str, archive_site_url: str) -> Union[Request, None]:
    """Given a URL and content from archive.is, return a request for the original URL."""
    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if content is None:
        return None

    if re.match(pattern=r'^https?://archive\.is/(.+?)$',
                string=archive_site_url,
                flags=re.IGNORECASE):
        canonical_link = link_canonical_url_from_html(html=content)
        if canonical_link is not None:
            matches = re.match(
                pattern=
                r'^https?://archive\.is/\d+?/(?P<target_url>https?://.+?)$',
                string=canonical_link,
                flags=re.IGNORECASE)
            if matches:
                target_url = matches.group('target_url')

                if is_http_url(target_url):
                    return Request(method='GET', url=target_url)
                else:
                    log.error("URL matched, but is not HTTP(s): %s" %
                              target_url)

            else:
                log.error(
                    "Unable to parse original URL from archive.is response '%s': %s"
                    % (
                        archive_site_url,
                        canonical_link,
                    ))
        else:
            log.error(
                "Unable to parse original URL from archive.is response '%s'" %
                archive_site_url)

    return None
Example #29
    def __init__(self, url: str, recursion_level: int, ua: Optional[UserAgent] = None):

        if recursion_level > self.__MAX_RECURSION_LEVEL:
            raise McSitemapsException("Recursion level exceeded {} for URL {}.".format(self.__MAX_RECURSION_LEVEL, url))

        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McSitemapsException("URL {} is not a HTTP(s) URL.".format(url))

        try:
            url = normalize_url(url)
        except Exception as ex:
            raise McSitemapsException("Unable to normalize URL {}: {}".format(url, ex))

        if not ua:
            ua = sitemap_useragent()

        self._url = url
        self._ua = ua
        self._recursion_level = recursion_level
Example #30
def target_request_from_archive_org_url(
        content: Union[str,
                       None], archive_site_url: str) -> Union[Request, None]:
    """Given a URL and content from archive.org, return a request for the original URL."""

    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    matches = re.match(
        pattern=
        r'^https?://web\.archive\.org/web/(?P<date>\d+?/)?(?P<target_url>https?://.+?)$',
        string=archive_site_url,
        flags=re.IGNORECASE)
    if matches:
        target_url = matches.group('target_url')

        if is_http_url(target_url):
            return Request(method='GET', url=target_url)
        else:
            log.error("URL matched, but is not HTTP(s): %s" % target_url)

    return None
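
A minimal call sketch; the Wayback Machine URL is made up, and Request.url() is assumed to behave as in the other examples on this page.

request = target_request_from_archive_org_url(
    content=None,
    archive_site_url='https://web.archive.org/web/20190101000000/http://example.com/story.html',
)
print(request.url() if request else None)
# The original URL is extracted from the archive URL: http://example.com/story.html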
Example #31
    def __get_url_domain(url_: str) -> str:

        if not is_http_url(url_):
            return url_

        host = get_url_host(url_)

        name_parts = host.split('.')

        n = len(name_parts) - 1

        # for country domains, use last three parts of name
        if re.search(pattern=r"\...$", string=host):
            domain = '.'.join([name_parts[n - 2], name_parts[n - 1], name_parts[0]])

        elif re.search(pattern=r"(localhost|blogspot\.com|wordpress\.com)", string=host):
            domain = url_

        else:
            domain = '.'.join([name_parts[n - 1], name_parts[n]])

        return domain.lower()
Example #32
    def sitemap(self) -> AbstractSitemap:

        story_urls = OrderedDict()

        for story_url in self._content.splitlines():
            story_url = story_url.strip()
            if not story_url:
                continue
            if is_http_url(story_url):
                story_urls[story_url] = True
            else:
                log.warning("Story URL {} doesn't look like an URL, skipping".format(story_url))

        pages = []
        for page_url in story_urls.keys():
            page = SitemapPage(url=page_url)
            pages.append(page)

        # noinspection PyArgumentList
        text_sitemap = PagesTextSitemap(url=self._url, pages=pages)

        return text_sitemap
Example #33
    def __get_url_domain(url_: str) -> str:

        if not is_http_url(url_):
            return url_

        host = get_url_host(url_)

        name_parts = host.split('.')

        n = len(name_parts) - 1

        # for country domains, use last three parts of name
        if re.search(pattern=r"\...$", string=host):
            domain = '.'.join([name_parts[n - 2], name_parts[n - 1], name_parts[0]])

        elif re.search(pattern=r"(localhost|blogspot\.com|wordpress\.com)", string=host):
            domain = url_

        else:
            domain = '.'.join([name_parts[n - 1], name_parts[n]])

        return domain.lower()
Example #34
    def sitemap(self) -> AbstractSitemap:

        story_urls = OrderedDict()

        for story_url in self._content.splitlines():
            story_url = story_url.strip()
            if not story_url:
                continue
            if is_http_url(story_url):
                story_urls[story_url] = True
            else:
                log.warning(
                    "Story URL {} doesn't look like an URL, skipping".format(
                        story_url))

        pages = []
        for page_url in story_urls.keys():
            page = SitemapPage(url=page_url)
            pages.append(page)

        # noinspection PyArgumentList
        text_sitemap = PagesTextSitemap(url=self._url, pages=pages)

        return text_sitemap
Example #35
    def parallel_get(urls: List[str]) -> List[Response]:
        """GET multiple URLs in parallel."""

        # FIXME doesn't respect timing() and other object properties

        urls = decode_object_from_bytes_if_needed(urls)

        # Original implementation didn't raise on undefined / empty list of URLs
        if urls is None:
            return []
        if len(urls) == 0:
            return []

        # Remove duplicates from list while maintaining order because:
        # 1) We don't want to fetch the same URL twice
        # 2) URLs are being used as unique dictionary IDs later on
        urls_before_removing_duplicates = urls.copy()
        urls = list(OrderedDict.fromkeys(urls))
        if len(urls) != len(urls_before_removing_duplicates):
            log.warning("Some of the URLs are duplicate; URLs: %s" % str(urls_before_removing_duplicates))

        # Raise on one or more invalid URLs because we consider it a caller's problem; if URL at least looks valid,
        # get() in a fork should be able to come up with a reasonable Response object for it
        for url in urls:
            if not is_http_url(url):
                raise McParallelGetException("URL %s is not a valid URL; URLs: %s" % (url, str(urls),))

        config = py_get_config()

        if 'web_store_num_parallel' not in config['mediawords']:
            raise McParallelGetException('"web_store_num_parallel" is not set.')
        num_parallel = config['mediawords']['web_store_num_parallel']

        if 'web_store_timeout' not in config['mediawords']:
            raise McParallelGetException('"web_store_timeout" is not set.')
        timeout = config['mediawords']['web_store_timeout']

        if 'web_store_per_domain_timeout' not in config['mediawords']:
            raise McParallelGetException('"web_store_per_domain_timeout" is not set.')
        per_domain_timeout = config['mediawords']['web_store_per_domain_timeout']

        url_stack = UserAgent.__get_scheduled_urls(urls_=urls, per_domain_timeout_=per_domain_timeout)

        start_time = time.time()

        url_blocks = {}
        while len(url_stack) > 0:
            block_i = len(url_stack) % num_parallel

            if block_i not in url_blocks:
                url_blocks[block_i] = []

            url_blocks[block_i].append(url_stack.pop())

        pool = multiprocessing.Pool(processes=num_parallel)

        all_results = []
        for i, url_block in url_blocks.items():
            result = pool.apply_async(_parallel_get_web_store, args=(url_block, start_time, timeout,))
            all_results.append(result)

        all_responses = []
        for result in all_results:
            responses = result.get()
            all_responses = all_responses + responses

        # No timeouts here because we trust the workers to timeout by themselves (by UserAgent)
        pool.close()
        pool.join()
        pool.terminate()

        # Sort URLs in parameter order
        # (if URLs weren't split into blocks, we could probably use map_async)
        response_url_map = {}
        for response in all_responses:
            url = response.scheduled_url.url
            response_url_map[url] = response.response

        sorted_responses = []
        for url in urls:
            if url not in response_url_map:
                raise McParallelGetException("URL %s is not in the response URL map %s." % (url, response_url_map,))

            sorted_responses.append(response_url_map[url])

        if len(urls) != len(sorted_responses):
            raise McParallelGetException(
                "Response count doesn't match URL count; responses: %s; URLs: %s" % (sorted_responses, urls,)
            )

        return sorted_responses
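
A standalone sketch of the block-partitioning step above, showing how a stack of scheduled URLs is split across num_parallel workers; the URLs and worker count are made up.

num_parallel = 3
url_stack = ['http://example.com/page-%d' % i for i in range(7)]

url_blocks = {}
while len(url_stack) > 0:
    block_i = len(url_stack) % num_parallel
    url_blocks.setdefault(block_i, []).append(url_stack.pop())

# Seven URLs end up spread over three blocks: {1: 3, 0: 2, 2: 2}
print({i: len(block) for i, block in url_blocks.items()})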
Example #36
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links"""

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Get URL after HTTP / HTML redirects
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    urls = {

        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(
            html=data_after_redirects, base_url=url_after_redirects)
        if url_link_rel_canonical is not None and len(
                url_link_rel_canonical) > 0:
            log.debug(
                ('Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                 '(original URL: %(url)s): %(url_link_rel_canonical)s') % {
                     "url_after_redirects": url_after_redirects,
                     "url": url,
                     "url_link_rel_canonical": url_link_rel_canonical,
                 })

            urls['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {
            key: urls[key]
            for key in urls.keys() if not is_homepage_url(urls[key])
        }

    distinct_urls = list(set(urls.values()))

    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)

    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [
            x for x in distinct_urls
            if not re.search(pattern=invalid_url_variant_regex, string=x)
        ]

    return distinct_urls
Example #37
def test_is_http_url():
    # noinspection PyTypeChecker
    assert not mc_url.is_http_url(None)
    assert not mc_url.is_http_url('')

    assert not mc_url.is_http_url('abc')
    assert not mc_url.is_http_url('/abc')
    assert not mc_url.is_http_url('//abc')
    assert not mc_url.is_http_url('///abc')

    assert not mc_url.is_http_url('gopher://gopher.floodgap.com/0/v2/vstat')
    assert not mc_url.is_http_url('ftp://ftp.freebsd.org/pub/FreeBSD/')

    assert mc_url.is_http_url('http://cyber.law.harvard.edu/about')
    assert mc_url.is_http_url('https://github.com/berkmancenter/mediacloud')

    funky_url = ('http://Las%20Vegas%20mass%20shooting%20raises'
                 '%20new%20doubts%20about%20safety%20of%20live%20entertainment')
    assert mc_url.is_http_url(funky_url) is False

    # URLs with port, HTTP auth, localhost
    assert mc_url.is_http_url('https://*****:*****@domain.com:12345/path?query=string#fragment')
    assert mc_url.is_http_url('http://*****:*****@') is False
Example #38
def podcast_viable_enclosure_for_story(db: DatabaseHandler, stories_id: int) -> Optional[StoryEnclosure]:
    """Fetch all enclosures, find and return the one that looks like a podcast episode the most (or None)."""
    story_enclosures_dicts = db.query("""
        SELECT *
        FROM story_enclosures
        WHERE stories_id = %(stories_id)s

        -- Returning by insertion order so the enclosures listed earlier will have a better chance of being considered
        -- episodes  
        ORDER BY story_enclosures_id
    """, {
        'stories_id': stories_id,
    }).hashes()

    if not story_enclosures_dicts:
        log.warning(f"Story {stories_id} has no enclosures to choose from.")
        return None

    story_enclosures = []

    for enclosure_dict in story_enclosures_dicts:
        if is_http_url(enclosure_dict['url']):
            story_enclosures.append(StoryEnclosure.from_db_row(db_row=enclosure_dict))

    chosen_enclosure = None

    # Look for MP3 files in MIME type
    for enclosure in story_enclosures:
        if enclosure.mime_type_is_mp3():
            log.info(f"Choosing enclosure '{enclosure}' by its MP3 MIME type '{enclosure.mime_type}'")
            chosen_enclosure = enclosure
            break

    # If that didn't work, look into URL's path
    if not chosen_enclosure:
        for enclosure in story_enclosures:
            if enclosure.url_path_has_mp3_extension():
                log.info(f"Choosing enclosure '{enclosure}' by its URL '{enclosure.url}'")
                chosen_enclosure = enclosure
                break

    # If there are no MP3s in sight, try to find any kind of audio enclosure because it's a smaller download than video
    # and faster to transcode
    if not chosen_enclosure:
        for enclosure in story_enclosures:
            if enclosure.mime_type_is_audio():
                log.info(f"Choosing enclosure '{enclosure}' by its audio MIME type '{enclosure.mime_type}'")
                chosen_enclosure = enclosure
                break

    # In case there are no audio enclosures, look for videos then
    if not chosen_enclosure:
        for enclosure in story_enclosures:
            if enclosure.mime_type_is_video():
                log.info(f"Choosing enclosure '{enclosure}' by its video MIME type '{enclosure.mime_type}'")
                chosen_enclosure = enclosure
                break

    # Return either the best option that we've found so far, or None if there were no (explicitly declared)
    # audio / video enclosures
    return chosen_enclosure
Example #39
    def _api_request_url_with_signature(cls,
                                        api_url: str,
                                        client_id: str,
                                        client_secret: str,
                                        http_method: str = 'GET') -> str:
        """Return API URL with request signature appended."""

        api_url = decode_object_from_bytes_if_needed(api_url)
        client_id = decode_object_from_bytes_if_needed(client_id)
        client_secret = decode_object_from_bytes_if_needed(client_secret)
        http_method = decode_object_from_bytes_if_needed(http_method)

        if not (api_url and client_id and client_secret):
            raise McCrawlerFetcherHardError("One or more required parameters are unset.")

        if not is_http_url(api_url):
            raise McCrawlerFetcherHardError(f"API URL '{api_url}' is not a HTTP(S) URL")

        if not http_method:
            http_method = 'GET'

        http_method = http_method.upper()

        uri = furl(api_url)
        if uri.args.get('client_id', None):
            raise McCrawlerFetcherHardError("Query already contains 'client_id'.")

        uri.args.add('client_id', client_id)

        if not str(uri.path):
            # Set to slash if it's unset
            uri.path.segments = ['']

        # Sort query params as per API spec
        sorted_args = []
        for key in sorted(uri.args.keys()):
            values = uri.args.getlist(key)
            for value in sorted(values):
                sorted_args.append({key: value})

        uri.args.clear()
        for sorted_arg in sorted_args:
            key, value = sorted_arg.popitem()
            uri.args.add(key, value)

        log.debug(f"Sorted query params: {uri.args}")

        log.debug(f"URI: {str(uri)}")

        api_url_path = str(uri.path)
        api_url_query = str(uri.query)

        unhashed_secret_key = f"{http_method}{client_id}{api_url_path}?{api_url_query}{client_secret}"
        log.debug(f"Unhashed secret key: {unhashed_secret_key}")

        signature = hashlib.sha1(unhashed_secret_key.encode('utf-8')).hexdigest()
        log.debug(f"Signature (hashed secret key): {signature}")

        uri.args.add('signature', signature)
        log.debug(f"API request URL: {str(uri)}")

        return str(uri)
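
A standalone sketch of the signing scheme used above: SHA-1 over method + client_id + path + '?' + sorted query + secret, appended as a 'signature' query parameter. The endpoint and credentials below are made up.

import hashlib

http_method = 'GET'
client_id = 'my-client-id'          # hypothetical
client_secret = 'my-client-secret'  # hypothetical
api_url_path = '/v2/stories'
api_url_query = 'client_id=my-client-id&topic=news'  # client_id added, params already sorted

unhashed_secret_key = http_method + client_id + api_url_path + '?' + api_url_query + client_secret
signature = hashlib.sha1(unhashed_secret_key.encode('utf-8')).hexdigest()

print('https://api.example.com' + api_url_path + '?' + api_url_query + '&signature=' + signature)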
Example #40
    def get_follow_http_html_redirects(self, url: str) -> Response:
        """GET an URL while resolving HTTP / HTML redirects."""
        def __inner_follow_redirects(
                response_: Response,
                meta_redirects_left: int) -> Union[Response, None]:

            from mediawords.util.web.user_agent.html_redirects import (
                target_request_from_meta_refresh_url,
                target_request_from_archive_org_url,
                target_request_from_archive_is_url,
                target_request_from_linkis_com_url,
                target_request_from_alarabiya_url,
            )

            if response_ is None:
                raise McGetFollowHTTPHTMLRedirectsException(
                    "Response is None.")

            if response_.is_success():

                base_url = get_base_url(response_.request().url())

                html_redirect_functions = [
                    target_request_from_meta_refresh_url,
                    target_request_from_archive_org_url,
                    target_request_from_archive_is_url,
                    target_request_from_linkis_com_url,
                    target_request_from_alarabiya_url,
                ]
                for html_redirect_function in html_redirect_functions:
                    request_after_meta_redirect = html_redirect_function(
                        content=response_.decoded_content(),
                        archive_site_url=base_url,
                    )
                    if request_after_meta_redirect is not None:
                        if not urls_are_equal(
                                url1=response_.request().url(),
                                url2=request_after_meta_redirect.url()):

                            log.debug("URL after HTML redirects: %s" %
                                      request_after_meta_redirect.url())

                            orig_redirect_response = self.request(
                                request=request_after_meta_redirect)
                            redirect_response = orig_redirect_response

                            # Response might have its previous() already set due to HTTP redirects,
                            # so we have to find the initial response first
                            previous = None
                            for x in range(self.max_redirect() + 1):
                                previous = redirect_response.previous()
                                if previous is None:
                                    break
                                redirect_response = previous

                            if previous is not None:
                                raise McGetFollowHTTPHTMLRedirectsException(
                                    "Can't find the initial redirected response; URL: %s"
                                    % request_after_meta_redirect.url())

                            log.debug(
                                "Setting previous of URL %(url)s to %(previous_url)s"
                                % {
                                    'url': redirect_response.request().url(),
                                    'previous_url': response_.request().url(),
                                })
                            redirect_response.set_previous(response_)

                            meta_redirects_left = meta_redirects_left - 1

                            return __inner(
                                response_=orig_redirect_response,
                                meta_redirects_left=meta_redirects_left,
                            )

                # No <meta /> refresh, the current URL is the final one
                return response_

            else:
                log.debug("Request to %s was unsuccessful: %s" % (
                    response_.request().url(),
                    response_.status_line(),
                ))

                # Return the original URL and give up
                return None

        def __inner_redirects_exhausted(
                response_: Response) -> Union[Response, None]:

            if response_ is None:
                raise McGetFollowHTTPHTMLRedirectsException(
                    "Response is None.")

            # If one of the URLs that we've been redirected to contains another encoded URL, assume
            # that we're hitting a paywall and the URL-encoded URL is the right one
            urls_redirected_to = []

            for x in range(self.max_redirect() + 1):
                previous = response_.previous()
                if previous is None:
                    break

                url_redirected_to = previous.request().url()
                encoded_url_redirected_to = quote(url_redirected_to)

                for redir_url in urls_redirected_to:
                    if re.search(pattern=re.escape(encoded_url_redirected_to),
                                 string=redir_url,
                                 flags=re.IGNORECASE | re.UNICODE):
                        log.debug(
                            """
                            Encoded URL %(encoded_url_redirected_to)s is a substring of another URL %(matched_url)s, so
                            I'll assume that %(url_redirected_to)s is the correct one.
                        """ % {
                                'encoded_url_redirected_to':
                                encoded_url_redirected_to,
                                'matched_url': redir_url,
                                'url_redirected_to': url_redirected_to,
                            })
                        return previous

                urls_redirected_to.append(url_redirected_to)

            # Return the original URL (unless a URL that is a substring of another URL was found above)
            return None

        def __inner(response_: Response,
                    meta_redirects_left: int) -> Union[Response, None]:

            if response_ is None:
                raise McGetFollowHTTPHTMLRedirectsException(
                    "Response is None.")

            if meta_redirects_left > 0:
                return __inner_follow_redirects(
                    response_=response_,
                    meta_redirects_left=meta_redirects_left,
                )

            else:
                return __inner_redirects_exhausted(response_=response_)

        # ---

        url = decode_object_from_bytes_if_needed(url)

        if url is None:
            raise McGetFollowHTTPHTMLRedirectsException("URL is None.")

        url = fix_common_url_mistakes(url)

        if not is_http_url(url):
            raise McGetFollowHTTPHTMLRedirectsException(
                "URL is not HTTP(s): %s" % url)

        if self.max_redirect() == 0:
            raise McGetFollowHTTPHTMLRedirectsException(
                "User agent's max_redirect is 0, subroutine might loop indefinitely."
            )

        response = self.get(url)

        response_after_redirects = __inner(
            response_=response, meta_redirects_left=self.max_redirect())
        if response_after_redirects is None:
            # One of the redirects failed -- return original response
            return response

        else:
            return response_after_redirects
Example #41
    def parallel_get(urls: List[str]) -> List[Response]:
        """GET multiple URLs in parallel."""

        # FIXME doesn't respect timing() and other object properties

        def __get_url_domain(url_: str) -> str:

            if not is_http_url(url_):
                return url_

            host = get_url_host(url_)

            name_parts = host.split('.')

            n = len(name_parts) - 1

            # for country domains, use last three parts of name
            if re.search(pattern=r"\...$", string=host):
                domain = '.'.join(
                    [name_parts[n - 2], name_parts[n - 1], name_parts[0]])

            elif re.search(pattern=r"(localhost|blogspot\.com|wordpress\.com)",
                           string=host):
                domain = url_

            else:
                domain = '.'.join([name_parts[n - 1], name_parts[n]])

            return domain.lower()

        def __get_scheduled_urls(
                urls_: List[str],
                per_domain_timeout_: int) -> List[_ParallelGetScheduledURL]:
            """Schedule the URLs by adding a { time => $time } field to each URL to make sure we obey the
            'per_domain_timeout'. Sort requests by ascending time."""
            domain_urls = {}

            for url_ in urls_:
                domain = __get_url_domain(url_=url_)
                if domain not in domain_urls:
                    domain_urls[domain] = []
                domain_urls[domain].append(url_)

            scheduled_urls = []

            for domain, urls_in_domain in domain_urls.items():
                time_ = 0
                for domain_url in urls_in_domain:
                    domain_url = _ParallelGetScheduledURL(url=domain_url,
                                                          time_=time_)
                    scheduled_urls.append(domain_url)

                    if time_ % 5 == 0:  # FIXME why 5?
                        time_ = time_ + per_domain_timeout_

            scheduled_urls = sorted(scheduled_urls, key=lambda x: x.time)

            return scheduled_urls

        # ---

        urls = decode_object_from_bytes_if_needed(urls)

        # Original implementation didn't raise on undefined / empty list of URLs
        if urls is None:
            return []
        if len(urls) == 0:
            return []

        # Remove duplicates from list while maintaining order because:
        # 1) We don't want to fetch the same URL twice
        # 2) URLs are being used as unique dictionary IDs later on
        urls_before_removing_duplicates = urls.copy()
        urls = list(OrderedDict.fromkeys(urls))
        if len(urls) != len(urls_before_removing_duplicates):
            log.warning("Some of the URLs are duplicate; URLs: %s" %
                        str(urls_before_removing_duplicates))

        # Raise on one or more invalid URLs because we consider it a caller's problem; if URL at least looks valid,
        # get() in a fork should be able to come up with a reasonable Response object for it
        for url in urls:
            if not is_http_url(url):
                raise McParallelGetException(
                    "URL %s is not a valid URL; URLs: %s" % (
                        url,
                        str(urls),
                    ))

        config = py_get_config()

        if 'web_store_num_parallel' not in config['mediawords']:
            raise McParallelGetException(
                '"web_store_num_parallel" is not set.')
        num_parallel = config['mediawords']['web_store_num_parallel']

        if 'web_store_timeout' not in config['mediawords']:
            raise McParallelGetException('"web_store_timeout" is not set.')
        timeout = config['mediawords']['web_store_timeout']

        if 'web_store_per_domain_timeout' not in config['mediawords']:
            raise McParallelGetException(
                '"web_store_per_domain_timeout" is not set.')
        per_domain_timeout = config['mediawords'][
            'web_store_per_domain_timeout']

        url_stack = __get_scheduled_urls(
            urls_=urls, per_domain_timeout_=per_domain_timeout)

        start_time = time.time()

        url_blocks = {}
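        # Distribute the URLs round-robin into (at most) num_parallel blocks, which are then
        # farmed out to the worker pool below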
        while len(url_stack) > 0:
            block_i = len(url_stack) % num_parallel

            if block_i not in url_blocks:
                url_blocks[block_i] = []

            url_blocks[block_i].append(url_stack.pop())

        pool = multiprocessing.Pool(processes=num_parallel)

        all_results = []
        for i, url_block in url_blocks.items():
            result = pool.apply_async(_parallel_get_web_store,
                                      args=(
                                          url_block,
                                          start_time,
                                          timeout,
                                      ))
            all_results.append(result)

        all_responses = []
        for result in all_results:
            responses = result.get()
            all_responses = all_responses + responses

        # No timeouts here because we trust the workers to time out by themselves (via UserAgent)
        pool.close()
        pool.join()
        pool.terminate()

        # Sort URLs in parameter order
        # (if URLs weren't split into blocks, we could probably use map_async)
        response_url_map = {}
        for response in all_responses:
            url = response.scheduled_url.url
            response_url_map[url] = response.response

        sorted_responses = []
        for url in urls:
            if url not in response_url_map:
                raise McParallelGetException(
                    "URL %s is not in the response URL map %s." % (
                        url,
                        response_url_map,
                    ))

            sorted_responses.append(response_url_map[url])

        if len(urls) != len(sorted_responses):
            raise McParallelGetException(
                "Response count doesn't match URL count; responses: %s; URLs: %s"
                % (
                    sorted_responses,
                    urls,
                ))

        return sorted_responses
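
A rough sketch of how parallel_get() might be invoked, assuming it is exposed on a UserAgent instance as in the other examples (the URLs below are hypothetical placeholders):

    ua = UserAgent()

    responses = ua.parallel_get([
        'http://example.com/page-1',
        'http://example.org/page-2',
    ])

    # Responses are returned in the same order as the input URLs
    for response in responses:
        print(response.decoded_content()[:100])
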
Example #42
def test_is_http_url():
    # noinspection PyTypeChecker
    assert not mc_url.is_http_url(None)
    assert not mc_url.is_http_url('')

    assert not mc_url.is_http_url('abc')
    assert not mc_url.is_http_url('/abc')
    assert not mc_url.is_http_url('//abc')
    assert not mc_url.is_http_url('///abc')

    assert not mc_url.is_http_url('gopher://gopher.floodgap.com/0/v2/vstat')
    assert not mc_url.is_http_url('ftp://ftp.freebsd.org/pub/FreeBSD/')

    assert mc_url.is_http_url('http://cyber.law.harvard.edu/about')
    assert mc_url.is_http_url('https://github.com/berkmancenter/mediacloud')

    funky_url = (
        'http://Las%20Vegas%20mass%20shooting%20raises'
        '%20new%20doubts%20about%20safety%20of%20live%20entertainment')
    assert mc_url.is_http_url(funky_url) is False

    # URLs with port, HTTP auth, localhost
    assert mc_url.is_http_url(
        'https://*****:*****@domain.com:12345/path?query=string#fragment'
    )
    assert mc_url.is_http_url('http://*****:*****@') is False
Example #43
def target_request_from_linkis_com_url(
        content: str, archive_site_url: str) -> Union[Request, None]:
    """Given the content of a linkis.com web page, find the original URL in the content, which may be in one of sereral
    places in the DOM, and return a request for said URL."""

    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if content is None:
        return None

    if not re.match(pattern='^https?://[^/]*linkis.com/',
                    string=archive_site_url,
                    flags=re.IGNORECASE):
        return None

    # list of dom search patterns to find nodes with a url and the
    # attributes to use from those nodes as the url.
    #
    # for instance the first item matches:
    #
    #     <meta property="og:url" content="http://foo.bar">
    #
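    # the second and third items match, respectively:
    #
    #     <a class="js-youtube-ln-event" href="http://foo.bar">...</a>
    #
    #     <iframe id="source_site" src="http://foo.bar"></iframe>
    #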
    try:
        html_parser = etree.HTMLParser()
        html_tree = etree.parse(StringIO(content), html_parser)

        dom_maps = [
            ('//meta[@property="og:url"]', 'content'),
            ('//a[@class="js-youtube-ln-event"]', 'href'),
            ('//iframe[@id="source_site"]', 'src'),
        ]

        for xpath, url_attribute in dom_maps:
            nodes = html_tree.xpath(xpath)

            if len(nodes) > 0:
                first_node = nodes[0]
                matched_url = first_node.get(url_attribute)
                if matched_url is not None:
                    if not re.match(pattern='^https?://linkis.com',
                                    string=matched_url,
                                    flags=re.IGNORECASE):

                        if is_http_url(matched_url):
                            return Request(method='GET', url=matched_url)
                        else:
                            log.error("URL matched, but is not HTTP(s): %s" %
                                      matched_url)

    except Exception as ex:
        log.warning("Unable to parse HTML for URL %s: %s" % (
            archive_site_url,
            str(ex),
        ))

    # As a last resort, look for the longUrl key in a JavaScript array
    matches = re.search(pattern=r'"longUrl":\s*"(?P<target_url>[^"]+)"',
                        string=content,
                        flags=re.IGNORECASE)
    if matches:
        target_url = matches.group('target_url')

        # kludge to de-escape \'d characters in javascript -- 99% of urls
        # are captured by the dom stuff above, we shouldn't get to this
        # point often
        target_url = target_url.replace('\\', '')

        if not re.match(pattern='^https?://linkis.com',
                        string=target_url,
                        flags=re.IGNORECASE):
            if is_http_url(target_url):
                return Request(method='GET', url=target_url)
            else:
                log.error("URL matched, but is not HTTP(s): %s" % target_url)

    log.warning("No URL found for linkis URL: %s" % archive_site_url)

    return None
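
A rough usage sketch, assuming the linkis.com page has already been fetched with the project's UserAgent (the archive URL below is a hypothetical placeholder):

    ua = UserAgent()

    archive_url = 'http://linkis.com/example.com/abc123'
    response = ua.get(archive_url)

    request = target_request_from_linkis_com_url(
        content=response.decoded_content(),
        archive_site_url=archive_url,
    )
    if request is not None:
        # request.url() points at the original (non-linkis.com) article
        original_url = request.url()
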
Example #44
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links"""

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Get URL after HTTP / HTML redirects
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    urls = {

        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(html=data_after_redirects, base_url=url_after_redirects)
        if url_link_rel_canonical is not None and len(url_link_rel_canonical) > 0:
            log.debug(
                (
                    'Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                    '(original URL: %(url)s): %(url_link_rel_canonical)s'
                ) % {
                    "url_after_redirects": url_after_redirects,
                    "url": url,
                    "url_link_rel_canonical": url_link_rel_canonical,
                }
            )

            urls['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {key: urls[key] for key in urls.keys() if not is_homepage_url(urls[key])}

    distinct_urls = list(set(urls.values()))

    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)

    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [x for x in distinct_urls if not re.search(pattern=invalid_url_variant_regex, string=x)]

    return distinct_urls
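
A minimal sketch of calling all_url_variants(), assuming a connected DatabaseHandler instance named db is already available (the URL is a hypothetical placeholder):

    variants = all_url_variants(
        db=db,
        url='http://m.example.com/2017/01/some-story/?utm_source=twitter',
    )

    # The list includes the original URL, its post-redirect form, normalized forms,
    # any <link rel="canonical" /> URL, and topic-derived variants
    for variant in variants:
        print(variant)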