Пример #1
0
def test_relaxed():
    """Relaxed parsing accepts a trailing ';' and keeps the disposition.

    The last assertion pins that the doubled space inside the quoted
    filename comes back as a single space.
    """
    for header in ('attachment;', 'attachment; key=val;'):
        parsed = parse_headers(header, relaxed=True)
        assert parsed.disposition == 'attachment'

    spaced = parse_headers('attachment; filename="spa  ced";', relaxed=True)
    assert spaced.filename_unsafe == u'spa ced'
Пример #2
0
def test_relaxed():
    """Check relaxed-mode parsing of headers that end with a stray ';'."""

    def relaxed(value):
        # Every case in this test parses with relaxed=True.
        return parse_headers(value, relaxed=True)

    assert relaxed('attachment;').disposition == 'attachment'
    assert relaxed('attachment; key=val;').disposition == 'attachment'

    cd = relaxed('attachment; filename="spa  ced";')
    assert cd.filename_unsafe == u'spa ced'
Пример #3
0
def test_location_fallback():
    """With no header, the filename is taken from the ``location`` URL."""
    cases = (
        ('https://foo/bar%c3%a9.py', u'baré.py'),
        ('https://foo/', u''),
        ('https://foo/%C3%A9toil%C3%A9/', u'étoilé'),
    )
    for location, expected in cases:
        parsed = parse_headers(None, location=location)
        assert parsed.filename_unsafe == expected
Пример #4
0
def test_strict():
    """Pin the default (non-relaxed) handling of headers with a trailing ';'."""
    # Trailing ; means the header is rejected
    for rejected in ('attachment;', 'attachment; key=val;'):
        assert parse_headers(rejected).disposition == 'inline'

    raised = False
    try:
        cd = parse_headers('attachment; filename="spa  ced";')
    except ValueError:
        raised = True
    # The message (cd) is only evaluated when no ValueError was raised.
    assert raised, cd
Пример #5
0
def test_strict():
    """Default parsing: a trailing ';' yields 'inline' or a ValueError."""
    # Trailing ; means the header is rejected
    bare = parse_headers('attachment;')
    assert bare.disposition == 'inline'
    with_param = parse_headers('attachment; key=val;')
    assert with_param.disposition == 'inline'

    header = 'attachment; filename="spa  ced";'
    try:
        cd = parse_headers(header)
    except ValueError:
        # Expected outcome: nothing more to check.
        return
    assert False, cd
Пример #6
0
def test_location_fallback():
    """The location URL supplies the filename when the header is None."""

    def name_from(location):
        return parse_headers(None, location=location).filename_unsafe

    assert name_from('https://foo/bar%c3%a9.py') == u'baré.py'
    assert name_from('https://foo/') == u''
    assert name_from('https://foo/%C3%A9toil%C3%A9/') == u'étoilé'
Пример #7
0
def test_relaxed():
    """Relaxed parsing: trailing ';', doubled spaces, and a quoted filename*."""
    disposition_cases = ('attachment;', 'attachment; key=val;')
    for header in disposition_cases:
        assert parse_headers(header, relaxed=True).disposition == 'attachment'

    spaced = parse_headers('attachment; filename="spa  ced";', relaxed=True)
    assert spaced.filename_unsafe == u'spa ced'

    # filename* carries a quoted value here; the test pins that no
    # filename is reported for this header.
    quoted_ext_header = (
        'attachment; filename="medium_SIEMEAE06658_1_PE_TAP2.png";'
        'filename*=UTF-8\'\'"medium_SIEMEAE06658_1_PE_TAP2.png"'
    )
    quoted_ext = parse_headers(quoted_ext_header, relaxed=True)
    assert quoted_ext.filename_unsafe is None
Пример #8
0
def _filename(url, headers):
    """Pick a file name for a fetched resource.

    The name announced by the last Content-Disposition header wins;
    otherwise the final segment of the URL path is used. Path separators
    and anything matched by FILENAME_REPLACE are replaced with
    underscores. Returns None when no usable name remains.
    """
    candidate = None

    # Prefer the explicit name from the last Content-Disposition header.
    dispositions = re.findall(r'^Content-Disposition:\s*(.*?)\r\n',
                              headers, re.I | re.M)
    if dispositions:
        candidate = rfc6266.parse_headers(dispositions[-1]).filename_unsafe

    # Otherwise take the last component of the URL path.
    if not candidate:
        segments = urlparse.urlparse(url).path.split('/')
        if segments:
            candidate = segments[-1]

    if not candidate:
        return None

    # Neutralise path separators and other unsafe characters.
    candidate = candidate.strip()
    for separator in filter(None, (os.sep, os.altsep)):
        candidate = candidate.replace(separator, '_')
    for pattern in FILENAME_REPLACE:
        candidate = pattern.sub('_', candidate)
    return candidate or None
Пример #9
0
def test_location_fallback():
    """Filename derived from ``location`` when no header is given."""
    long_url = (
        'http://vtv.vn/Content/Uploads/image/Trung%20khanh/Olympic%202012/'
        'SV%C4%90%20Olympic%202012%208.jpg'
    )
    expectations = [
        ('https://foo/bar%c3%a9.py', u'baré.py'),
        ('https://foo/', u''),
        ('https://foo/%C3%A9toil%C3%A9/', u'étoilé'),
        (long_url, u"SVĐ Olympic 2012 8.jpg"),
    ]
    for location, expected in expectations:
        assert parse_headers(
            None, location=location
        ).filename_unsafe == expected
Пример #10
0
def download_to_local(url, dir_name):
    """Downloads remote resource given its URL.

    Args:
        url: the URL to the resource.
        dir_name: the directory on the local filesystem to save the resource.

    Returns:
        filename: the filename (relative to the dir_name) of the downloaded
            resource. It may be different from the name of the remote resource
            because of sanitization.

    Raises:
        Whatever ``r.raise_for_status()`` raises for an error response.
    """

    # TODO: be able to verify SSL certificates from some publishers
    with requests.get(url, stream=True) as r:
        r.raise_for_status()

        # Guess the proper filename to use
        filename = None
        # 1. Try to use the content-disposition header if available
        if "content-disposition" in r.headers:
            filename = rfc6266.parse_headers(r.headers["content-disposition"],
                                             relaxed=True).filename_unsafe
        # 2. Try to get it from the URL. A header without a filename yields
        #    None rather than "", so test for falsiness, not equality.
        if not filename:
            filename = url.rsplit("/", 1)[-1]
        # 3. Sanitize the filename, this handles empty filename right now
        filename = get_safe_filename(filename)

        # Download the file
        with open(os.path.join(dir_name, filename), "wb") as o:
            shutil.copyfileobj(r.raw, o)
    return filename
Пример #11
0
    def _find_attachments(self):
        """Retrieve attachments from the parsed body structure.

        We try to find and decode a file name for each attachment. If
        we failed, a generic name will be used (ie. part_1, part_2, ...).
        """
        for att in self.bs.attachments:
            attname = "part_%s" % att["pnum"]
            if "params" in att and att["params"] != "NIL":
                attname = u2u_decode.u2u_decode(att["params"][1]) \
                    .strip("\r\t\n")
            elif "disposition" in att and len(att["disposition"]) > 1:
                for pos, value in enumerate(att["disposition"][1]):
                    if not value.startswith("filename"):
                        continue
                    header = "%s; %s=%s" \
                        % (att['disposition'][0],
                           value,
                           att["disposition"][1][pos + 1].strip("\r\t\n"))
                    attname = parse_headers(header).filename_unsafe
                    if attname is None:
                        attname = u2u_decode.u2u_decode(
                            att["disposition"][1][pos + 1]
                        ).strip("\r\t\n")
                    break
            self.attachments[att["pnum"]] = attname
Пример #12
0
def _filename(url, headers):
    """Derive a local file name from the response headers or the URL.

    Uses the filename of the last Content-Disposition header (parsed in
    relaxed mode) when there is one, else the last segment of the URL
    path. The result is stripped and has path separators and
    FILENAME_REPLACE matches replaced by '_'. Returns None if no name is
    left.
    """
    # Try to get filename from Content-Disposition header.
    matches = re.findall(r'^Content-Disposition:\s*(.*?)\r\n', headers,
                         re.I | re.M)
    name = None
    if matches:
        name = rfc6266.parse_headers(matches[-1], relaxed=True).filename_unsafe

    # Get filename from URL.
    if not name:
        name = urlparse.urlparse(url).path.split('/')[-1]

    # Strip unsafe characters from path.
    if name:
        name = name.strip()
        for sep in (os.sep, os.altsep):
            if not sep:
                continue
            name = name.replace(sep, '_')
        for pat in FILENAME_REPLACE:
            name = pat.sub('_', name)
    return name if name else None
Пример #13
0
def download(url):
    """Download ``url`` into the current directory and return the file name.

    The name comes from the Content-Disposition header when it provides
    one, otherwise from the last segment of the URL.

    Raises:
        ValueError: if no file name can be derived.
    """
    import os  # function-scope import keeps this block self-contained

    print("Downloading %s" % url)
    response = requests.get(url)

    fname = None
    header = response.headers.get('content-disposition')
    if header:
        fname = rfc6266.parse_headers(header).filename_unsafe
    if not fname:
        # Header missing or without a filename: use the URL instead.
        fname = url.split("/")[-1]

    # The name is server-controlled; keep only its last path component so
    # the file cannot be written outside the current directory.
    fname = os.path.basename(fname)
    if not fname:
        raise ValueError("cannot determine a file name for %s" % url)

    with open(fname, "wb") as f:
        f.write(response.content)
    return fname
Пример #14
0
    def _find_attachments(self):
        """Retrieve attachments from the parsed body structure.

        We try to find and decode a file name for each attachment. If
        we failed, a generic name will be used (ie. part_1, part_2, ...).
        """
        for att in self.bs.attachments:
            attname = "part_%s" % att["pnum"]
            if "params" in att and att["params"] != "NIL":
                attname = u2u_decode.u2u_decode(att["params"][1]) \
                    .strip("\r\t\n")
            elif "disposition" in att and len(att["disposition"]) > 1:
                for pos, value in enumerate(att["disposition"][1]):
                    if not value.startswith("filename"):
                        continue
                    header = "%s; %s=%s" \
                        % (att['disposition'][0],
                           value,
                           att["disposition"][1][pos + 1].strip("\r\t\n"))
                    attname = parse_headers(header).filename_unsafe
                    if attname is None:
                        attname = u2u_decode.u2u_decode(
                            att["disposition"][1][pos + 1]).strip("\r\t\n")
                    break
            self.attachments[att["pnum"]] = attname
Пример #15
0
    def download_file(self, file_id):
        """
        Download the file.

        Args:
            file_id (:obj:`int`): File id

        Returns:
            class:`models.responses.DownloadResponse` object on HTTP 200
            (its filename is None when the response names no file),
            otherwise a ``BaseResponse`` carrying an ``error_code``.

        Raises:
            TypeError: if ``file_id`` is not an int.
        """
        if not isinstance(file_id, int):
            raise TypeError('file_id must be an instance of int')
        url = self._download_file_base_url + str(file_id)
        response = self._perform_get_file_request(url)

        if response.status_code == 200:
            header = response.headers.get('Content-Disposition')
            filename = None
            if header:
                try:
                    filename = rfc6266.parse_headers(header).filename_unsafe
                except Exception:
                    # Parser rejected the header: fall back to a plain
                    # regex, returning a string like the primary path
                    # (not the list that re.findall would give).
                    match = re.search(r'filename=(.+)', header)
                    filename = match.group(1) if match else None
            return resp.DownloadResponse(filename, response.content)

        if response.status_code == 401:
            return resp.BaseResponse(
                **{'error_code': 'authorization_error'})
        if response.status_code in (403, 404):
            return resp.BaseResponse(
                **{'error_code': 'access_denied_file'})
        return resp.BaseResponse(**{'error_code': 'ServerError'})
Пример #16
0
def get_filename(url, header):
    """Return the file name for a download.

    Uses the name from the ``content-disposition`` entry of ``header``
    when the parser reports one, otherwise the name derived from ``url``
    by ``get_filename_from_url``.
    """
    parsed = parse_headers(header.get("content-disposition"))
    filename = parsed.filename_unsafe
    # Explicit check instead of assert/except AssertionError: asserts are
    # stripped under ``python -O``, which would let None through.
    if filename is None:
        filename = get_filename_from_url(url)
    return filename
Пример #17
0
def get_filename(entry):
    """Find the 'content-disposition' filename of a WARC entry.

    Only entries whose mime type or URL path ends with one of DOC_EXT are
    considered; returns None when nothing matches.
    """
    for ext in DOC_EXT:
        mime_matches = entry["mime"].lower().endswith(ext)
        if not (mime_matches
                or urlparse(entry["url"]).path.lower().endswith(".%s" % (ext))):
            continue
        for header_name, header_value in entry.record.status_headers.headers:
            if header_name.lower() != "content-disposition":
                continue
            return rfc6266.parse_headers(header_value).filename_unsafe
    return None
def get_filename(entry):
    """Return the Content-Disposition filename of a document WARC entry.

    An entry counts as a document when its mime type, or the path of its
    URL, ends with an extension from DOC_EXT. Returns None otherwise or
    when the record has no content-disposition header.
    """
    for ext in DOC_EXT:
        if entry["mime"].lower().endswith(ext):
            is_document = True
        else:
            url_path = urlparse(entry["url"]).path
            is_document = url_path.lower().endswith(".%s" % (ext))

        if is_document:
            for header, value in entry.record.status_headers.headers:
                if header.lower() == "content-disposition":
                    parsed = rfc6266.parse_headers(value)
                    return parsed.filename_unsafe
    return None
Пример #19
0
	def filename(self):
		"""Return a file name for the response.

		The Content-Disposition filename is used when the header is
		present, rfc6266 can be imported and the header actually names a
		file. Otherwise the name is the basename of the URL path, or
		"index.html" when that is empty.
		"""
		import urllib.parse
		if "content-disposition" in self.response.headers:
			try:
				import rfc6266
			except ImportError as e:
				print(e)
				print("Couldn't import rfc6266; not using content-disposition header")
			else:
				name = rfc6266.parse_headers(self.response.headers["content-disposition"]).filename_unsafe
				# A header without a filename yields None; fall through
				# to the URL instead of returning it.
				if name:
					return name
		return os.path.basename(urllib.parse.urlparse(self.url).path) or "index.html"
Пример #20
0
def download(url):
    """Fetch ``url``, save it in the current directory, return the name.

    The Content-Disposition filename is preferred; the last URL segment
    is used when the header is absent or names no file.

    Raises:
        ValueError: if no file name can be derived.
    """
    import os  # function-scope import keeps this block self-contained

    print("Downloading %s" % url)
    response = requests.get(url)

    fname = None
    # Test membership on the headers mapping itself: requests documents
    # it as case-insensitive, whereas ``.keys()`` is a plain list on
    # Python 2 and only matches the exact spelling.
    if "Content-Disposition" in response.headers:
        fname = rfc6266.parse_headers(
            response.headers['Content-Disposition']).filename_unsafe
    if not fname:
        fname = url.split("/")[-1]

    # Server-controlled value: drop any directory part before opening.
    fname = os.path.basename(fname)
    if not fname:
        raise ValueError("cannot determine a file name for %s" % url)

    with open(fname, "wb") as f:
        f.write(response.content)
    return fname
Пример #21
0
    def _digest_binary(self, data, headers):
        header = [
            i[1] for i in headers if i[0].lower() == 'content-disposition'
        ]
        if not len(header):
            return data

        cd = rfc6266.parse_headers(header[0], relaxed=True)
        return {
            'filename': cd.filename_unsafe,
            'disposition': cd.disposition,
            'content': data
        }
Пример #22
0
def download_url(url, counter):
    """Download ``url`` into ``download_files/``.

    The file is named after the Content-Disposition filename when one is
    available, else ``<counter>.zip``. Errors are printed, not raised
    (best-effort download).
    """
    import os  # function-scope import keeps this block self-contained

    try:
        res = requests.get(url)

        # A missing header must lead to the counter-based fallback below,
        # not to a KeyError swallowed by the except clause.
        filename = None
        header = res.headers.get('Content-Disposition')
        if header:
            filename = rfc6266.parse_headers(header).filename_unsafe
        if filename:
            # Server-controlled value: keep it inside download_files/.
            filename = os.path.basename(filename)
        if not filename:
            filename = f'{counter}.zip'

        print(f'[INFO] Get filename {filename}')

        with open(f'download_files/{filename}', 'wb+') as f:
            f.write(res.content)

    except Exception as e:
        print(e)
Пример #23
0
def test_parsing():
    """Basic parsing: disposition type, parameters and filename variants."""
    assert parse_headers(None).disposition == 'inline'
    assert parse_headers('attachment').disposition == 'attachment'
    assert parse_headers('attachment; key=val').assocs['key'] == 'val'

    simple = parse_headers('attachment; filename=simple')
    assert simple.filename_unsafe == 'simple'

    # test ISO-8859-1
    fname = parse_headers(u'attachment; filename="oyé"').filename_unsafe
    assert fname == u'oyé', repr(fname)

    # Both filename and filename* are given; the test expects the
    # filename* value.
    euro_header = ('attachment; filename="EURO rates";'
                   ' filename*=utf-8\'\'%e2%82%ac%20rates')
    assert parse_headers(euro_header).filename_unsafe == u'€ rates'

    empty = parse_headers('attachment; filename=""')
    assert empty.filename_unsafe is None
Пример #24
0
def test_parsing():
    """Parse a handful of well-formed headers and check the result."""
    no_header = parse_headers(None)
    assert no_header.disposition == 'inline'

    bare = parse_headers('attachment')
    assert bare.disposition == 'attachment'

    with_param = parse_headers('attachment; key=val')
    assert with_param.assocs['key'] == 'val'

    token_name = parse_headers('attachment; filename=simple')
    assert token_name.filename_unsafe == 'simple'

    # test ISO-8859-1
    latin1_name = parse_headers(u'attachment; filename="oyé"').filename_unsafe
    assert latin1_name == u'oyé', repr(latin1_name)

    extended = parse_headers(
        'attachment; filename="EURO rates";'
        ' filename*=utf-8\'\'%e2%82%ac%20rates'
    )
    assert extended.filename_unsafe == u'€ rates'
def parse_attachment(message_part, attachments=None):
    """Turn an e-mail part into a file-like attachment object.

    Returns a StringIO holding the decoded payload, annotated with
    ``content_type``, ``size``, ``name`` and the optional ``create_date``,
    ``mod_date`` and ``read_date`` taken from the Content-Disposition
    parameters. Returns None when the part is not an attachment with a
    filename, or when it has no decodable payload. If the payload is a
    list of sub-messages, each one is handed to ``parse2`` together with
    ``attachments``.
    """
    content_disposition = message_part.get("Content-Disposition", None)
    if content_disposition:
        try:
            cd = parse_headers(content_disposition, relaxed=True)
            if cd.disposition.lower() == "attachment":
                # ``in`` replaces the deprecated dict.has_key().
                if "filename" not in cd.assocs:
                    # print error or warning?
                    return None

                file_data = message_part.get_payload(decode=True)
                if not file_data:
                    payload = message_part.get_payload()
                    if isinstance(payload, list):
                        for msgobj in payload:
                            parse2(msgobj, attachments)
                        return None
                    print >>sys.stderr, message_part.get_payload()
                    print >>sys.stderr, message_part.get_content_charset()
                    if file_data is None:
                        # Nothing could be decoded; len(None) below would
                        # raise TypeError. Empty payloads still pass.
                        return None

                attachment = StringIO(file_data)
                attachment.content_type = message_part.get_content_type()
                attachment.size = len(file_data)
                attachment.name = cd.assocs['filename']
                attachment.create_date = None
                attachment.mod_date = None
                attachment.read_date = None

                # Header parameter name -> attribute on the attachment.
                date_attributes = {
                    "create-date": "create_date",
                    "modification-date": "mod_date",
                    "read-date": "read_date",
                }
                for name, value in cd.assocs.iteritems():
                    attribute = date_attributes.get(name)
                    if attribute is not None:
                        setattr(attachment, attribute, value)  # TODO: datetime

                return attachment

        except Exception:
            # Log the offending header, then let the error propagate.
            print >>sys.stderr, "content_disposition:", content_disposition
            raise
    return None
Пример #26
0
def parse_attachment(message_part, attachments=None):
    """Build an attachment object from an e-mail part, or return None.

    A part qualifies when its Content-Disposition is "attachment" and
    carries a filename. The decoded payload is wrapped in a StringIO that
    gets ``content_type``, ``size``, ``name`` and the three optional date
    attributes. Parts without decoded data return None; if their payload
    is a list, every element is first passed to ``_parse2``.
    """
    content_disposition = message_part.get("Content-Disposition", None)
    if not content_disposition:
        return None

    try:
        cd = parse_headers(content_disposition, relaxed=True)
        if cd.disposition.lower() != "attachment":
            return None
        if "filename" not in cd.assocs:
            # print error or warning?
            return None

        file_data = message_part.get_payload(decode=True)
        if not file_data:
            payload = message_part.get_payload()
            if isinstance(payload, list):
                for msgobj in payload:
                    _parse2(msgobj, attachments)
            return None  # PSIPHON: fixed conditional return

        attachment = StringIO(file_data)
        attachment.content_type = message_part.get_content_type()
        attachment.size = len(file_data)
        attachment.name = cd.assocs['filename']
        attachment.create_date = None
        attachment.mod_date = None
        attachment.read_date = None

        for name, value in cd.assocs.iteritems():
            if name == "create-date":
                attachment.create_date = value  # TODO: datetime
            elif name == "modification-date":
                attachment.mod_date = value  # TODO: datetime
            elif name == "read-date":
                attachment.read_date = value  # TODO: datetime

        return attachment

    except:
        # Report which header was being processed, then re-raise.
        print >> sys.stderr, "content_disposition:", content_disposition
        raise
Пример #27
0
def download(target_dir, url):
    """Download ``url`` into ``target_dir`` with a progress bar.

    The file name is the Content-Disposition filename when available,
    otherwise the basename of the URL.

    Returns:
        The path of the written file.

    Raises:
        DownloadError: if the response status is not OK.
    """
    response = requests.get(url, stream=True)

    if not response.ok:
        raise DownloadError('Can\'t download %s: response status: %i' %
                            (url, response.status_code))

    fname = None
    cd = response.headers.get('Content-Disposition')
    if cd:
        fname = rfc6266.parse_headers(cd).filename_unsafe
    if not fname:
        fname = os.path.basename(url)
    # The name may come from the server; keep only its last component so
    # the file stays inside target_dir.
    fname = os.path.basename(fname)

    log.info('Downloading %s' % fname)

    # The content-length header is optional: calling .strip() on a
    # missing header would raise AttributeError.
    total = (response.headers.get('content-length') or '').strip()
    max_value = int(total) if total else progressbar.UnknownLength
    path = os.path.join(target_dir, fname)

    with open(path, 'wb') as f:
        widgets = [
            progressbar.Percentage(), ' ',
            progressbar.Bar(), ' ',
            progressbar.ETA(), ' ',
            progressbar.FileTransferSpeed()
        ]
        pbar = progressbar.ProgressBar(widgets=widgets,
                                       max_value=max_value).start()
        size = 0
        for block in response.iter_content(1024):
            size += len(block)
            f.write(block)
            pbar.update(size)
        pbar.finish()
    return path
Пример #28
0
    def _find_attachments(self):
        for att in self.bs.attachments:
            attname = "part_%s" % att["pnum"]
            params = None
            key = None
            if "params" in att and att["params"] != "NIL":
                params = att["params"]
                key = "name"

            if key is None and "disposition" in att and len(att["disposition"]) > 1:
                params = att["disposition"][1]
                key = "filename"

            if key and params:
                for pos, value in enumerate(params):
                    if not value.startswith(key):
                        continue
                    header = "%s; %s=%s" % (att['disposition'][0], value,
                                            u2u_decode.u2u_decode(params[pos + 1]).strip("\r\t\n"))
                    attname = parse_headers(header).filename_unsafe
                    if attname is None:
                        attname = u2u_decode.u2u_decode(params[pos + 1]).strip("\r\t\n")
                    break
            self.attachments[att["pnum"]] = attname
Пример #29
0
    def get_info(self):
        """Issue a HEAD request for ``self.url`` and describe the resource.

        Returns:
            A ``(size, filename, content_type)`` tuple: the integer
            content length, the Content-Disposition filename or else the
            last URL path segment (None if neither exists), and the raw
            content-type header.

        Raises:
            Exception: if the content-length header is missing or is not
                an integer.
        """
        logger.info('Getting piece config from url %r' % (self.url, ))

        # NOTE(review): verify=False disables TLS certificate checks.
        r = requests.head(self.url.geturl(), verify=False)
        size = r.headers.get('content-length')
        try:
            size = int(size)
        except (TypeError, ValueError):
            # TypeError covers a missing header (int(None)).
            raise Exception(
                'Size is invalid (%r), unable to segmented download.' %
                (size, ))

        filename = None
        if r.headers.get('content-disposition'):
            filename = rfc6266.parse_headers(
                r.headers['content-disposition']).filename_unsafe

        if not filename:
            url_filename = self.url.path.split('?')[0].split('/')[-1]
            if url_filename:
                filename = url_filename

        return size, filename, r.headers.get('content-type')
    def __init__(self, fp=None, headers=None, outerboundary="",
                 environ=os.environ, keep_blank_values=0, strict_parsing=0):
        """Constructor.  Read multipart/* until last part.

        Arguments, all optional:

        fp              : file pointer; default: sys.stdin
            (not used when the request method is GET)

        headers         : header dictionary-like object; default:
            taken from environ as per CGI spec

        outerboundary   : terminating multipart boundary
            (for internal use only)

        environ         : environment dictionary; default: os.environ

        keep_blank_values: flag indicating whether blank values in
            percent-encoded forms should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        """
        method = 'GET'
        self.keep_blank_values = keep_blank_values
        self.strict_parsing = strict_parsing
        if 'REQUEST_METHOD' in environ:
            method = environ['REQUEST_METHOD'].upper()
        self.qs_on_post = None
        if method == 'GET' or method == 'HEAD':
            # For GET/HEAD the data is the query string (from the
            # environment, else the first command-line argument); it
            # replaces any fp that was passed in.
            if 'QUERY_STRING' in environ:
                qs = environ['QUERY_STRING']
            elif sys.argv[1:]:
                qs = sys.argv[1]
            else:
                qs = ""
            fp = StringIO(qs)
            if headers is None:
                headers = {'content-type':
                           "application/x-www-form-urlencoded"}
        if headers is None:
            # No headers supplied: build them from the CGI environment.
            headers = {}
            if method == 'POST':
                # Set default content-type for POST to what's traditional
                headers['content-type'] = "application/x-www-form-urlencoded"
            if 'CONTENT_TYPE' in environ:
                headers['content-type'] = environ['CONTENT_TYPE']
            if 'QUERY_STRING' in environ:
                self.qs_on_post = environ['QUERY_STRING']
            if 'CONTENT_LENGTH' in environ:
                headers['content-length'] = environ['CONTENT_LENGTH']
        self.fp = fp or sys.stdin
        self.headers = headers
        self.outerboundary = outerboundary

        # Process content-disposition header
        cdisp, pdict = "", {}
        # Use rfc6266 (relaxed mode) when that name is truthy, otherwise
        # fall back to parse_header.
        # NOTE(review): presumably rfc6266 is None/false when its import
        # failed -- confirm at the import site.
        if 'content-disposition' in self.headers and rfc6266:
            cd = rfc6266.parse_headers(self.headers['content-disposition'], relaxed=True)
            cdisp, pdict = cd.disposition, cd.assocs
        elif 'content-disposition' in self.headers:
            cdisp, pdict = parse_header(self.headers['content-disposition'])
        self.disposition = cdisp
        self.disposition_options = pdict
        self.name = None
        if 'name' in pdict:
            self.name = pdict['name']
        self.filename = None
        if 'filename' in pdict:
            self.filename = pdict['filename']
        # filename* is assigned last, so it wins over a plain filename.
        if 'filename*' in pdict:
            self.filename = pdict['filename*'].string
        # Names containing '&' are passed through HTMLParser().unescape.
        if self.filename and '&' in self.filename:
            from HTMLParser import HTMLParser
            self.filename = HTMLParser().unescape(self.filename)
        # unicode names are stored UTF-8 encoded.
        if isinstance(self.filename, unicode):
            self.filename = self.filename.encode('utf8')

        # Process content-type header
        #
        # Honor any existing content-type header.  But if there is no
        # content-type header, use some sensible defaults.  Assume
        # outerboundary is "" at the outer level, but something non-false
        # inside a multi-part.  The default for an inner part is text/plain,
        # but for an outer part it should be urlencoded.  This should catch
        # bogus clients which erroneously forget to include a content-type
        # header.
        #
        # See below for what we do if there does exist a content-type header,
        # but it happens to be something we don't understand.
        if 'content-type' in self.headers:
            ctype, pdict = parse_header(self.headers['content-type'])
        elif self.outerboundary or method != 'POST':
            ctype, pdict = "text/plain", {}
        else:
            ctype, pdict = 'application/x-www-form-urlencoded', {}
        self.type = ctype
        self.type_options = pdict
        self.innerboundary = ""
        if 'boundary' in pdict:
            self.innerboundary = pdict['boundary']
        # -1 means the length is unknown (header absent or not an integer).
        clen = -1
        if 'content-length' in self.headers:
            try:
                clen = int(self.headers['content-length'])
            except ValueError:
                pass
            # NOTE(review): maxlen is not defined in this method;
            # presumably a module-level limit -- confirm.
            if maxlen and clen > maxlen:
                raise ValueError, 'Maximum content length exceeded'
        self.length = clen

        self.list = self.file = None
        self.done = 0
        # Dispatch on the content type to the matching reader.
        if ctype == 'application/x-www-form-urlencoded':
            self.read_urlencoded()
        elif ctype[:10] == 'multipart/':
            self.read_multi(environ, keep_blank_values, strict_parsing)
        else:
            self.read_single()
Пример #31
0
def shapefiles(base='.'):
    """
    Update any out-of-date shapefiles.

    For each registry entry under ``base`` that declares a ``data_url`` (one
    pass per distinct ``config['file']``), compare the entry's
    ``last_updated`` date with the source's modification date -- FTP ``MDTM``
    or the HTTP ``Last-Modified`` header, falling back to today's date when
    the header is absent -- and, if the source is newer, download it next to
    the definition file and run ``process`` on it.
    """
    headers = {
        'User-Agent':
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
    }

    def process(slug, config, url, data_file_path):
        """
        Unpack a downloaded KML, KMZ or ZIP file and normalize the result.

        ``config`` is the registry entry; the keys read here are ``file``,
        ``basename``, ``skip_crc32``, ``prj`` and ``notes``. ``url`` is only
        used in messages and ``slug`` is not used. ``last_updated`` is read
        from the enclosing scope to stamp ``definition.py``.
        """
        # We can only process KML, KMZ and ZIP files.
        extension = os.path.splitext(data_file_path)[1]
        if extension not in ('.kml', '.kmz', '.zip'):
            print('Unrecognized extension %s\n' % url)
            return

        directory = dirname(config['file'])

        # Remove old files.
        for basename in os.listdir(directory):
            if basename not in ('.DS_Store', '__pycache__',
                                'definition.py', 'LICENSE.txt', 'data.kml',
                                'data.kmz', 'data.zip'):
                os.unlink(os.path.join(directory, basename))

        # NOTE(review): this list is kept up to date below, but nothing in
        # this function reads it afterwards.
        files_to_add = []

        # Unzip any zip file.
        error_thrown = False
        if extension == '.zip':
            try:
                # The context manager closes the archive before it is
                # unlinked in the `finally` clause.
                with ZipFile(data_file_path) as zip_file:
                    for name in zip_file.namelist():
                        # Don't extract directories.
                        if name.endswith('/'):
                            continue
                        # Flatten the zip file hierarchy.
                        member_extension = os.path.splitext(name)[1]
                        if member_extension in ('.kml', '.kmz'):
                            # assumes one KML or KMZ file per archive
                            basename = 'data%s' % member_extension
                        else:
                            # assumes no collisions across hierarchy
                            basename = os.path.basename(name)
                        # Extract only matching shapefiles.
                        if ('basename' in config and
                                basename.split(os.extsep, 1)[0] != config['basename']):
                            continue
                        target_path = os.path.join(directory, basename)
                        with open(target_path, 'wb') as f:
                            with zip_file.open(name, 'r') as fp:
                                if 'skip_crc32' in config:
                                    # Clears the reader's expected checksum so
                                    # the read below is not CRC-checked.
                                    fp._expected_crc = None
                                f.write(fp.read())
                        if member_extension not in ('.kml', '.kmz'):
                            files_to_add.append(target_path)
            except BadZipfile as e:
                error_thrown = True
                print('Bad ZIP file %s %s\n' % (e, url))
            finally:
                os.unlink(data_file_path)

        # Unzip any KMZ file.
        kmz_file_path = os.path.join(directory, 'data.kmz')
        if not error_thrown and os.path.exists(kmz_file_path):
            try:
                with ZipFile(kmz_file_path) as zip_file:
                    for name in zip_file.namelist():
                        # A KMZ file contains a single KML file and other supporting files.
                        # @see https://developers.google.com/kml/documentation/kmzarchives
                        if os.path.splitext(name)[1] == '.kml':
                            with open(os.path.join(directory, 'data.kml'), 'wb') as f:
                                f.write(zip_file.read(name))
            except BadZipfile:
                error_thrown = True
                print('Bad KMZ file %s\n' % url)
            finally:
                os.unlink(kmz_file_path)

        if error_thrown:
            return

        shp_file_path = glob(os.path.join(directory, '*.shp'))

        # Convert any KML to shapefile.
        if not shp_file_path:
            kml_file_path = os.path.join(directory, 'data.kml')
            if os.path.exists(kml_file_path):
                result = run('ogrinfo -q %s | grep -v "3D Point"' % kml_file_path,
                             hide='out').stdout
                if result.count('\n') > 1:
                    print('Too many layers %s' % url)
                else:
                    layer = re.search(r'\A\d+: (.+?) \(', result).group(1)
                    run('ogr2ogr -f "ESRI Shapefile" %s %s -nlt POLYGON "%s"'
                        % (directory, kml_file_path, layer),
                        echo=True)
                    files_to_add.extend(
                        glob(os.path.join(directory, '*.[dps][bhr][fjpx]')))
                    os.unlink(kml_file_path)

        # Merge multiple shapefiles into one.
        if len(shp_file_path) > 1:
            for name in shp_file_path:
                run('ogr2ogr -f "ESRI Shapefile" %s %s -update -append -nln Boundaries'
                    % (directory, name),
                    echo=True)
                basename = os.path.splitext(os.path.basename(name))[0]
                # Drop the source shapefile's parts once they have been appended.
                for part in glob(os.path.join(directory,
                                              '%s.[dps][bhr][fjnpx]' % basename)):
                    files_to_add.remove(part)
                    os.unlink(part)

        shp_file_path = glob(os.path.join(directory, '*.shp'))
        if shp_file_path:
            shp_file_path = shp_file_path[0]
        if shp_file_path and os.path.exists(shp_file_path):
            # Convert any 3D shapefile into 2D.
            result = run('ogrinfo -q %s' % shp_file_path, hide='out').stdout
            if result.count('\n') > 1:
                print('Too many layers %s' % url)
            elif re.search('3D Polygon', result):
                run('ogr2ogr -f "ESRI Shapefile" -overwrite %s %s -nlt POLYGON'
                    % (directory, shp_file_path),
                    echo=True)
                for name in list(files_to_add):
                    if not os.path.exists(name):
                        files_to_add.remove(name)

            # Replace "Double_Stereographic" with "Oblique_Stereographic".
            prj_file_path = os.path.splitext(shp_file_path)[0] + '.prj'
            if prj_file_path and os.path.exists(prj_file_path):
                with open(prj_file_path) as f:
                    prj = f.read()
                if 'Double_Stereographic' in prj:
                    with open(prj_file_path, 'w') as f:
                        f.write(prj.replace('Double_Stereographic',
                                            'Oblique_Stereographic'))
            elif 'prj' in config:
                with open(prj_file_path, 'w') as f:
                    f.write(requests.get(config['prj']).text)
                files_to_add.append(prj_file_path)
            else:
                print('No PRJ file %s' % url)

        # Update last updated timestamp.
        definition_path = os.path.join(directory, 'definition.py')
        with open(definition_path) as f:
            definition = f.read()
        # Plain integer formatting gives unpadded month/day without relying
        # on the platform-specific `%-m` / `%-d` strftime flags.
        stamp = '%d, %d, %d' % (last_updated.year, last_updated.month,
                                last_updated.day)
        with open(definition_path, 'w') as f:
            f.write(re.sub(r'(?<=last_updated=date\()[\d, ]+', stamp, definition))

        # Print notes.
        if 'notes' in config:
            print('%s\n%s\n' % (config['file'], config['notes']))

    # Retrieve shapefiles.
    processed = set()
    for slug, config in registry(base).items():
        if config['file'] in processed or 'data_url' not in config:
            continue
        processed.add(config['file'])
        url = config['data_url']
        result = urlparse(url)

        if result.scheme == 'ftp':
            ftp = FTP(result.hostname)
            try:
                ftp.login(result.username, result.password)

                # Get the last modified timestamp.
                last_modified = ftp.sendcmd('MDTM %s' % result.path)

                # Parse the timestamp as a date.
                last_updated = datetime.strptime(last_modified[4:],
                                                 '%Y%m%d%H%M%S').date()

                outdated = config['last_updated'] < last_updated
                if outdated:
                    # Determine the file extension.
                    extension = os.path.splitext(url)[1]

                    # Set the new file's name.
                    data_file_path = os.path.join(dirname(config['file']),
                                                  'data%s' % extension)

                    # Download new file. The handle is closed (and so
                    # flushed) before the file is processed.
                    with open(data_file_path, 'wb') as f:
                        ftp.retrbinary('RETR %s' % result.path, f.write)
                ftp.quit()
            finally:
                # Does nothing after a successful quit(); otherwise releases
                # the socket.
                ftp.close()

            if outdated:
                process(slug, config, url, data_file_path)
        else:
            # Get the last modified timestamp.
            arguments = {}
            if result.username:
                url = '%s://%s%s' % (result.scheme, result.hostname,
                                     result.path)
                arguments['auth'] = (result.username, result.password)
            try:
                response = requests.head(url, headers=headers, **arguments)
            except requests.exceptions.SSLError:
                # NOTE(review): retries with certificate verification off.
                response = requests.head(url,
                                         headers=headers,
                                         verify=False,
                                         **arguments)
            # If HEAD requests are not properly supported.
            if response.status_code in (204, 403, 405, 500) or (
                    response.status_code == 302
                    and '404' in response.headers.get('Location', '')):
                response = requests.get(url,
                                        headers=headers,
                                        stream=True,
                                        **arguments)

            last_modified = response.headers.get('last-modified')

            # Parse the timestamp as a date.
            if last_modified:
                last_updated = datetime.strptime(
                    last_modified, '%a, %d %b %Y %H:%M:%S GMT')
            else:
                last_updated = datetime.now()
            last_updated = last_updated.date()

            if config['last_updated'] > last_updated:
                print('%s are more recent than the source (%s > %s)\n' %
                      (slug, config['last_updated'], last_updated))
            elif config['last_updated'] < last_updated:
                # Determine the file extension.
                if 'content-disposition' in response.headers:
                    # filename_unsafe may be None (e.g. a rejected header);
                    # fall back to the URL in that case.
                    filename = parse_headers(
                        response.headers['content-disposition']
                    ).filename_unsafe or url
                else:
                    filename = url

                extension = os.path.splitext(filename)[1].lower()
                if not extension:
                    if response.headers.get(
                            'content-type') == 'application/vnd.google-earth.kml+xml; charset=utf-8':
                        extension = '.kml'

                # Set the new file's name.
                data_file_path = os.path.join(dirname(config['file']),
                                              'data%s' % extension)

                # Download new file.
                arguments['stream'] = True
                try:
                    response = requests.get(url,
                                            headers=headers,
                                            **arguments)
                except requests.exceptions.SSLError:
                    # NOTE(review): retries with certificate verification off.
                    response = requests.get(url,
                                            headers=headers,
                                            verify=False,
                                            **arguments)
                with open(data_file_path, 'wb') as f:
                    # An explicit chunk size avoids iter_content's
                    # one-byte-at-a-time default.
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

                process(slug, config, url, data_file_path)
Пример #32
0
 def roundtrip(filename):
     """Pass ``build_header(filename)`` through ``parse_headers`` and return
     the parsed result's ``filename_unsafe``."""
     header = build_header(filename)
     parsed = parse_headers(header)
     return parsed.filename_unsafe
def download(src, dst):
	"""Fetch ``src`` into ``dst`` unless ``dst`` already exists.

	The payload is retrieved to ``dst + ".tmp"`` and renamed to ``dst`` once
	complete. The remote file name -- taken from the ``Content-Disposition``
	header when it yields one, otherwise the basename of ``src`` -- is written
	to ``dst + ".filename"`` as UTF-8.

	Any failure is logged and swallowed; nothing is returned.
	"""
	if os.path.exists(dst):
		# NOTE: an existing file is trusted as-is; its content is not checked.
		logger.info("- %s: local file exists (%s)", src, dst)
		return

	logger.debug("Downloading %s", src)
	tmp_path = dst + ".tmp"
	try:
		_, headers = urllib.request.urlretrieve(src, tmp_path)

		filename = None
		if "Content-Disposition" in headers:
			disp = headers["Content-Disposition"]
			filename = rfc6266.parse_headers(disp, relaxed=True).filename_unsafe
			if filename:
				filename = os.path.basename(filename)

		# Fall back to the URL when the header is missing or gives no name
		# (None or an empty string).
		if not filename:
			filename = os.path.basename(src)

		with io.open(dst + ".filename", "w", encoding="utf-8") as fo:
			fo.write(filename)

		os.rename(tmp_path, dst)

		logger.debug("Downloaded %s", filename)
	except Exception:
		logger.exception("NG download: %s", src)
		# Best-effort cleanup so a partial download is not left behind; the
		# temp file may not exist (never created, or already renamed).
		try:
			os.remove(tmp_path)
		except OSError:
			pass
Пример #34
0
def shapefiles(base='.'):
  """
  Update any out-of-date shapefiles.

  For each registry entry under ``base`` with a ``data_url``, compare the
  entry's ``last_updated`` date with the source's modification date -- FTP
  ``MDTM`` or the HTTP ``Last-Modified`` header, falling back to today's date
  when the header is absent -- and, if the source is newer, download it next
  to the definition file and run ``process`` on it.
  """
  headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'}

  def process(slug, config, url, data_file_path):
    """
    Unpack a downloaded KML, KMZ or ZIP file, normalize it and stage it in git.

    ``config`` is the registry entry; the keys read here are ``file``,
    ``prj``, ``ogr2ogr`` and ``notes``. ``url`` and ``slug`` are only used in
    messages. ``last_updated`` is read from the enclosing scope to stamp
    ``definition.py``.
    """
    # We can only process KML, KMZ and ZIP files.
    extension = os.path.splitext(data_file_path)[1]
    if extension not in ('.kml', '.kmz', '.zip'):
      print('Unrecognized extension %s\n' % url)
      return

    # Walk up from the data file to the directory containing `.git`. Stopping
    # when the parent equals the current directory guarantees termination at
    # the filesystem root.
    repo_path = os.path.realpath(os.path.dirname(data_file_path))
    while not os.path.exists(os.path.join(repo_path, '.git')):
      parent = os.path.dirname(repo_path)
      if parent == repo_path:
        break
      repo_path = parent

    repo = Repo(repo_path)
    index = repo.index
    directory = dirname(config['file'])

    # Remove old files.
    for basename in os.listdir(directory):
      if basename not in ('.DS_Store', 'definition.py', 'LICENSE.txt', 'data.kml', 'data.kmz', 'data.zip'):
        os.unlink(os.path.join(directory, basename))
        index.remove([os.path.relpath(os.path.join(directory, basename), repo_path)])

    files_to_add = []

    # Unzip any zip file.
    error_thrown = False
    if extension == '.zip':
      try:
        # The context manager closes the archive before it is unlinked in the
        # `finally` clause.
        with ZipFile(data_file_path) as zip_file:
          for name in zip_file.namelist():
            # Don't extract directories: they have no basename to write to.
            if name.endswith('/'):
              continue
            # Flatten the zip file hierarchy.
            member_extension = os.path.splitext(name)[1]
            if member_extension in ('.kml', '.kmz'):
              basename = 'data%s' % member_extension  # assumes one KML or KMZ file per archive
            else:
              basename = os.path.basename(name)  # assumes no collisions across hierarchy
            target_path = os.path.join(directory, basename)
            with open(target_path, 'wb') as f:
              f.write(zip_file.read(name))
            if member_extension not in ('.kml', '.kmz'):
              files_to_add.append(target_path)
      except BadZipfile:
        error_thrown = True
        print('Bad ZIP file %s\n' % url)
      finally:
        os.unlink(data_file_path)

    # Unzip any KMZ file.
    kmz_file_path = os.path.join(directory, 'data.kmz')
    if not error_thrown and os.path.exists(kmz_file_path):
      try:
        with ZipFile(kmz_file_path) as zip_file:
          for name in zip_file.namelist():
            # A KMZ file contains a single KML file and other supporting files.
            # @see https://developers.google.com/kml/documentation/kmzarchives
            if os.path.splitext(name)[1] == '.kml':
              with open(os.path.join(directory, 'data.kml'), 'wb') as f:
                f.write(zip_file.read(name))
      except BadZipfile:
        error_thrown = True
        print('Bad KMZ file %s\n' % url)
      finally:
        os.unlink(kmz_file_path)

    if error_thrown:
      return

    # Convert any KML to shapefile.
    kml_file_path = os.path.join(directory, 'data.kml')
    if os.path.exists(kml_file_path):
      result = run('ogrinfo -q %s | grep -v "3D Point"' % kml_file_path, hide='out').stdout
      if result.count('\n') > 1:
        print('Too many layers %s' % url)
      else:
        layer = re.search(r'\A\d+: (\S+)', result).group(1)
        run('ogr2ogr -f "ESRI Shapefile" %s %s -nlt POLYGON %s' % (directory, kml_file_path, layer), echo=True)
        files_to_add.extend(glob(os.path.join(directory, '*.[dps][bhr][fjpx]')))
        os.unlink(kml_file_path)

    # Merge multiple shapefiles into one.
    names = glob(os.path.join(directory, '*.shp'))
    if len(names) > 1:
      for name in names:
        run('ogr2ogr -f "ESRI Shapefile" %s %s -update -append -nln Boundaries' % (directory, name), echo=True)
        basename = os.path.splitext(os.path.basename(name))[0]
        # Drop the source shapefile's parts once they have been appended.
        for part in glob(os.path.join(directory, '%s.[dps][bhr][fjnpx]' % basename)):
          files_to_add.remove(part)
          os.unlink(part)

    shp_file_path = glob(os.path.join(directory, '*.shp'))
    if shp_file_path:
      shp_file_path = shp_file_path[0]
    if shp_file_path and os.path.exists(shp_file_path):
      # Convert any 3D shapefile into 2D.
      result = run('ogrinfo -q %s' % shp_file_path, hide='out').stdout
      if result.count('\n') > 1:
        print('Too many layers %s' % url)
      elif re.search('3D Polygon', result):
        run('ogr2ogr -f "ESRI Shapefile" -overwrite %s %s -nlt POLYGON' % (directory, shp_file_path), echo=True)
        for name in list(files_to_add):
          if not os.path.exists(name):
            files_to_add.remove(name)

      # Replace "Double_Stereographic" with "Oblique_Stereographic".
      prj_file_path = os.path.splitext(shp_file_path)[0] + '.prj'
      if prj_file_path and os.path.exists(prj_file_path):
        with open(prj_file_path) as f:
          prj = f.read()
        if 'Double_Stereographic' in prj:
          with open(prj_file_path, 'w') as f:
            f.write(prj.replace('Double_Stereographic', 'Oblique_Stereographic'))
      elif config.get('prj'):
        # `.content` is bytes, so the file is opened in binary mode.
        with open(prj_file_path, 'wb') as f:
          f.write(requests.get(config['prj']).content)
        files_to_add.append(prj_file_path)
      else:
        print('No PRJ file %s' % url)

      # Run any additional commands on the shapefile.
      if config.get('ogr2ogr'):
        run('ogr2ogr -f "ESRI Shapefile" -overwrite %s %s %s' % (directory, shp_file_path, config['ogr2ogr']), echo=True)
        for name in list(files_to_add):
          if not os.path.exists(name):
            files_to_add.remove(name)

    # Add files to git.
    index.add([os.path.relpath(name, repo_path) for name in files_to_add])

    # Update last updated timestamp.
    definition_path = os.path.join(directory, 'definition.py')
    with open(definition_path) as f:
      definition = f.read()
    # Plain integer formatting gives unpadded month/day without relying on the
    # platform-specific `%-m` / `%-d` strftime flags.
    stamp = '%d, %d, %d' % (last_updated.year, last_updated.month, last_updated.day)
    with open(definition_path, 'w') as f:
      f.write(re.sub(r'(?<=last_updated=date\()[\d, ]+', stamp, definition))

    # Print notes.
    if config.get('notes'):
      print('%s\n%s\n' % (slug, config['notes']))

  # Retrieve shapefiles.
  for slug, config in registry(base).items():
    if not config.get('data_url'):
      continue
    url = config['data_url']
    result = urlparse(url)

    if result.scheme == 'ftp':
      ftp = FTP(result.hostname)
      try:
        ftp.login(result.username, result.password)

        # Get the last modified timestamp.
        last_modified = ftp.sendcmd('MDTM %s' % result.path)

        # Parse the timestamp as a date.
        last_updated = datetime.strptime(last_modified[4:], '%Y%m%d%H%M%S').date()

        outdated = config['last_updated'] < last_updated
        if outdated:
          # Determine the file extension.
          extension = os.path.splitext(url)[1]

          # Set the new file's name.
          data_file_path = os.path.join(dirname(config['file']), 'data%s' % extension)

          # Download new file. The handle is closed (and so flushed) before
          # the file is processed.
          with open(data_file_path, 'wb') as f:
            ftp.retrbinary('RETR %s' % result.path, f.write)
        ftp.quit()
      finally:
        # Does nothing after a successful quit(); otherwise releases the socket.
        ftp.close()

      if outdated:
        process(slug, config, url, data_file_path)
    else:
      # Get the last modified timestamp.
      arguments = {'allow_redirects': True}
      if result.username:
        url = '%s://%s%s' % (result.scheme, result.hostname, result.path)
        arguments['auth'] = (result.username, result.password)
      response = requests.head(url, headers=headers, **arguments)
      if response.status_code == 405:  # if HEAD requests are not allowed
        response = requests.get(url, headers=headers, **arguments)
      last_modified = response.headers.get('last-modified')

      # Parse the timestamp as a date.
      if last_modified:
        last_updated = datetime.strptime(last_modified, '%a, %d %b %Y %H:%M:%S GMT')
      else:
        last_updated = datetime.now()
      last_updated = last_updated.date()

      if config['last_updated'] > last_updated:
        print('%s are more recent than the source (%s > %s)\n' % (slug, config['last_updated'], last_updated))
      elif config['last_updated'] < last_updated:
        # Determine the file extension.
        if response.headers.get('content-disposition'):
          # filename_unsafe may be None (e.g. a rejected header); fall back to
          # the URL in that case.
          filename = parse_headers(response.headers['content-disposition']).filename_unsafe or url
        else:
          filename = url
        extension = os.path.splitext(filename)[1].lower()

        # Set the new file's name.
        data_file_path = os.path.join(dirname(config['file']), 'data%s' % extension)

        # Download new file.
        arguments['stream'] = True
        response = requests.get(url, headers=headers, **arguments)
        with open(data_file_path, 'wb') as f:
          # An explicit chunk size avoids iter_content's one-byte-at-a-time default.
          for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

        process(slug, config, url, data_file_path)
Пример #35
0
 def roundtrip(filename):
     """Return ``filename_unsafe`` of the result of parsing the header that
     ``build_header`` produces for ``filename``."""
     built = build_header(filename)
     return parse_headers(built).filename_unsafe