Пример #1
0
 def test_is_s3_url(self):
     """is_s3_url accepts amazonaws.com S3 hosts and rejects a look-alike."""
     plain_host_url = 'https://s3.amazonaws.com/adlkfj'
     region_host_url = 'https://s3-us-west-2.amazonaws.com/adlkfj'
     # "amazonas" (no "w") must not be mistaken for an S3 host.
     misspelled_host_url = 'https://s3.amazonas.com/adlkfj'

     self.assertTrue(md5s3stash.is_s3_url(plain_host_url))
     self.assertTrue(md5s3stash.is_s3_url(region_host_url))
     self.assertFalse(md5s3stash.is_s3_url(misspelled_host_url))
Пример #2
0
def link_is_to_image(url, auth=None):
    """Check if the link points to an image content type.

    A HEAD request is tried first. If it succeeds but does not report an
    image type, a slower GET is issued and its content type is used
    instead.

    Args:
        url: the link to probe.
        auth: forwarded as ``auth`` to ``requests`` for non-S3 URLs. It is
            never sent when ``md5s3stash.is_s3_url(url)`` is true.

    Returns:
        True when the last response examined has status 200 and a
        content-type whose main type is "image". False when a response is
        not 200, has no content-type, or is not an image type.
    """
    # The HEAD request has always skipped auth for S3 URLs; build the
    # keyword arguments once so the GET fallback follows the same rule
    # instead of sending the credentials to S3.
    request_kwargs = {"allow_redirects": True}
    if not md5s3stash.is_s3_url(url):
        request_kwargs["auth"] = auth

    response = requests.head(url, **request_kwargs)
    if response.status_code != 200:
        return False
    content_type = response.headers.get("content-type", None)
    if not content_type:
        return False
    reg_type = content_type.split("/", 1)[0].lower()
    # situation where a server returned 'text/html' to HEAD requests
    # but returned 'image/jpeg' for GET.
    # try a slower GET if not image type
    if reg_type != "image":
        response = requests.get(url, **request_kwargs)
        if response.status_code != 200:
            return False
        content_type = response.headers.get("content-type", None)
        if not content_type:
            return False
        reg_type = content_type.split("/", 1)[0].lower()
    return reg_type == "image"
Пример #3
0
def link_is_to_image(doc_id, url, auth=None):
    '''Check if the link points to an image content type.

    A HEAD request is tried first. If it does not return 200, a GET is
    tried. If a successful HEAD reports a non-image type, a GET is issued
    and its content type is used instead.

    Args:
        doc_id: identifier attached to any ImageHTTPError raised.
        url: the link to probe.
        auth: forwarded as ``auth`` to ``requests`` for non-S3 URLs. It is
            never sent when ``md5s3stash.is_s3_url(url)`` is true.

    Returns:
        True when the content-type main type is "image". False when the
        content-type header is missing or is not an image type.

    Raises:
        ImageHTTPError: when a GET request returns a status other than 200.
    '''
    # The HEAD request has always skipped auth for S3 URLs; build the
    # keyword arguments once so every request follows the same rule and
    # the credentials are not sent to S3 by a later fallback.
    request_kwargs = {'allow_redirects': True}
    if not md5s3stash.is_s3_url(url):
        request_kwargs['auth'] = auth

    response = requests.head(url, **request_kwargs)
    already_used_get = False
    if response.status_code != 200:
        # many servers do not support HEAD requests, try get
        response = requests.get(url, **request_kwargs)
        already_used_get = True
        if response.status_code != 200:
            raise ImageHTTPError(
                'HTTP ERROR: {}'.format(response.status_code), doc_id=doc_id)
    content_type = response.headers.get('content-type', None)
    if not content_type:
        return False
    reg_type = content_type.split('/', 1)[0].lower()
    # situation where a server returned 'text/html' to HEAD requests
    # but returned 'image/jpeg' for GET.
    # try a slower GET if not image type -- unless the answer in hand
    # already came from a GET, in which case repeating it is wasted work.
    if reg_type != 'image' and not already_used_get:
        response = requests.get(url, **request_kwargs)
        if response.status_code != 200:
            raise ImageHTTPError(
                'HTTP ERROR: {}'.format(response.status_code), doc_id=doc_id)
        content_type = response.headers.get('content-type', None)
        if not content_type:
            return False
        reg_type = content_type.split('/', 1)[0].lower()
    return reg_type == 'image'
Пример #4
0
def link_is_to_image(doc_id, url, auth=None):
    '''Check if the link points to an image content type.

    The URL is probed with HEAD, falling back to GET when HEAD does not
    return 200. When a successful HEAD reports a non-image type, a GET is
    made and its content type decides the result.

    Args:
        doc_id: identifier attached to any ImageHTTPError raised.
        url: the link to probe.
        auth: forwarded as ``auth`` to ``requests`` for non-S3 URLs. It is
            never sent when ``md5s3stash.is_s3_url(url)`` is true.

    Returns:
        True when the content-type main type is "image". False when the
        content-type header is missing or is not an image type.

    Raises:
        ImageHTTPError: when a GET request returns a status other than 200.
    '''
    # HEAD has always been sent without auth for S3 URLs. Decide once and
    # reuse the same keyword arguments so no fallback GET sends the
    # credentials to S3.
    is_s3 = md5s3stash.is_s3_url(url)
    request_kwargs = {'allow_redirects': True}
    if not is_s3:
        request_kwargs['auth'] = auth

    response = requests.head(url, **request_kwargs)
    head_succeeded = response.status_code == 200
    if not head_succeeded:
        # many servers do not support HEAD requests, try get
        response = requests.get(url, **request_kwargs)
        if response.status_code != 200:
            raise ImageHTTPError('HTTP ERROR: {}'.format(response.status_code),
                                 doc_id=doc_id)
    content_type = response.headers.get('content-type', None)
    if not content_type:
        return False
    reg_type = content_type.split('/', 1)[0].lower()
    # situation where a server returned 'text/html' to HEAD requests
    # but returned 'image/jpeg' for GET.
    # try a slower GET if not image type. Only do so when the type came
    # from HEAD: if a GET was already made above, a second identical GET
    # would just repeat the download.
    if reg_type != 'image' and head_succeeded:
        response = requests.get(url, **request_kwargs)
        if response.status_code != 200:
            raise ImageHTTPError('HTTP ERROR: {}'.format(response.status_code),
                                 doc_id=doc_id)
        content_type = response.headers.get('content-type', None)
        if not content_type:
            return False
        reg_type = content_type.split('/', 1)[0].lower()
    return reg_type == 'image'
Пример #5
0
 def test_is_s3_url(self):
     """Two amazonaws.com URLs count as S3; the 'amazonas' typo does not."""
     for s3_url in ('https://s3.amazonaws.com/adlkfj',
                    'https://s3-us-west-2.amazonaws.com/adlkfj'):
         self.assertTrue(md5s3stash.is_s3_url(s3_url))
     # Host is missing the "w" of "amazonaws", so it must be rejected.
     self.assertFalse(md5s3stash.is_s3_url('https://s3.amazonas.com/adlkfj'))