Example #1
    def _get_block_data(self, block):
        if block in self._blocks_cache_queue:
            return self._blocks_cache[block]

        if not (block < len(self._index)):
            raise ZeexOutOfBoundExceptions("Requested block not in file",
                                           block)

        sys.stderr.write('Block: {}\n'.format(block))
        offset = self._index[block] + self.headersize

        if (block + 1) < len(self._index):
            csize = self._index[block + 1] + self.headersize - offset
        else:
            csize = self.header.cdata_length + self.headersize - offset

        self._infile.seek(offset)
        compressed = self._infile.read(csize)
        data = lzma.decompress(compressed)

        # Cache the freshly decompressed block, then evict the oldest cached
        # block once the cache grows past its limit.
        self._blocks_cache_queue.append(block)
        self._blocks_cache[block] = data

        if len(self._blocks_cache_queue) > self._max_cached_blocks:
            evicted = self._blocks_cache_queue.pop(0)
            del self._blocks_cache[evicted]

        return data
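The size of a compressed block is the distance between adjacent index offsets; the last block falls back to the total compressed payload length. A toy walk-through of that arithmetic with made-up numbers:

# Hypothetical values, only to illustrate the offset/size computation above.
index = [0, 100, 250]    # compressed offset of each block within the payload
headersize = 64          # bytes taken up by the file header
cdata_length = 400       # total length of the compressed payload

block = 1                # an interior block
offset = index[block] + headersize                # 164
csize = index[block + 1] + headersize - offset    # 150

block = 2                # the last block uses cdata_length instead
offset = index[block] + headersize                # 314
csize = cdata_length + headersize - offset        # 150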
Example #2
def xz_decompress(data):
    '''decompress xz `data` using backports.lzma, or if that's not
    available then the commandline `xz --decompress` tool

    '''
    if xz is not None:
        try:
            bigdata = xz.decompress(data)
            data = bigdata
        except Exception:
            logger.error('decompress of %s bytes failed', len(data))
            raise

    else:
        ## launch xz child
        xz_child = subprocess.Popen(['xz', '--decompress'],
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
        ## use communicate to pass the data incrementally to the child
        ## while reading the output, to avoid blocking
        data, errors = xz_child.communicate(data)
        assert not errors, errors

    return data
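The function checks `xz is not None`, but the import that defines `xz` lies outside this snippet; a plausible setup (an assumption, not shown in the source) is:

# Assumed import block for the `xz` module used above; not part of the snippet.
try:
    from backports import lzma as xz    # Python 2 backport package
except ImportError:
    try:
        import lzma as xz               # Python 3 standard library
    except ImportError:
        xz = None                       # makes xz_decompress fall back to the CLI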
Example #3
    def open_db(self):
        self.terms_ldb = leveldb.LevelDB(self.terms_fl)
        self.docs_ldb = leveldb.LevelDB(self.docs_fl)

        self.doc_buffer_size = 0
        self.term_buffer_size = 0

        #self.doc_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype="S%d" % self.max_doc_size)
        self.doc_flush_buffer = [None] * self.max_doc_flush_buffer_size
        self.term_flush_buffer = np.empty(self.max_term_flush_buffer_size, dtype="S%d" % self.max_term_size)
        self.doc_id_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype=np.int64)
        self.term_id_flush_buffer = np.empty(self.max_term_flush_buffer_size, dtype=np.int64)

        if self.compression == COMPRESSION.NONE:
            self.compress = lambda string: string
            self.decompress = lambda string: string
        elif self.compression == COMPRESSION.ZLIB:
            import zlib
            self.compress = lambda string: zlib.compress(string, self.compression_level)
            self.decompress = lambda string: zlib.decompress(string)
        elif self.compression == COMPRESSION.LZMA:
            import backports.lzma as lzma
            self.compress = lambda string: lzma.compress(bytearray(string), format=lzma.FORMAT_RAW)
            self.decompress = lambda data: lzma.decompress(data, format=lzma.FORMAT_RAW)
        elif self.compression == COMPRESSION.LZ4R:
            import lz4
            self.compress = lambda string: lz4.compress(string)
            self.decompress = lambda string: lz4.decompress(string)
        elif self.compression == COMPRESSION.LZ4H:
            import lz4
            self.compress = lambda string: lz4.compressHC(string)
            self.decompress = lambda string: lz4.decompress(string)
        else:
            raise Exception("Wrong compression type %r" % self.compression)
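A quick round-trip through the ZLIB branch shows the contract the compress/decompress pair must satisfy (a minimal sketch; the level is illustrative):

import zlib

compression_level = 6    # hypothetical level
compress = lambda s: zlib.compress(s, compression_level)
decompress = lambda s: zlib.decompress(s)

payload = b"some document text"
assert decompress(compress(payload)) == payload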
Example #4
def xz_decompress(data):
    '''decompress xz `data` using backports.lzma, or if that's not
    available then the commandline `xz --decompress` tool

    '''
    if xz is not None:
        try:
            bigdata = xz.decompress(data)
            data = bigdata
        except Exception:
            logger.error('decompress of %s bytes failed', len(data))
            raise

    else:
        ## launch xz child
        xz_child = subprocess.Popen(
            ['xz', '--decompress'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        ## use communicate to pass the data incrementally to the child
        ## while reading the output, to avoid blocking
        data, errors = xz_child.communicate(data)
        assert not errors, errors

    return data
Example #5
 def init_index(self):
     if self.is_multi_deb:
         self.index_path = '/'.join([self.components[0]])
         url = '/'.join([self.uri, self.components[0], 'Packages.xz'])
     else:
         self.index_path = '/'.join([
             'dists', self.distribution, self.components[0],
             'binary-' + self.arch
         ])
         url = '/'.join([
             self.uri, 'dists', self.distribution, self.components[0],
             'binary-' + self.arch, 'Packages.xz'
         ])
     content = httpClient.get(url)
     index_data = lzma.decompress(content)
     self.index_list = self.parse_package_file(index_data)
     self.index = {}
     for entry in self.index_list:
         if self.index.get(entry.get('Package')) is None:
             self.index[entry.get('Package')] = []
         self.index[entry.get('Package')].append(entry)
     # sort packages that share a name from newest to oldest version
     for key in self.index:
         self.index[key] = sorted(
             self.index[key],
             key=lambda entry: version.parse(entry.get('Version', '0.0')),
             reverse=True)
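Assuming `version` here is `packaging.version`, the newest-first ordering used above behaves like this on a toy list (hypothetical entries):

from packaging import version

entries = [{'Package': 'demo', 'Version': v} for v in ('1.2.0', '1.10.0', '1.9.1')]
entries.sort(key=lambda e: version.parse(e.get('Version', '0.0')), reverse=True)
print([e['Version'] for e in entries])    # ['1.10.0', '1.9.1', '1.2.0']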
Example #6
    def import_base_image(self, repository, tagprefix, url, digest):
        """Secure Docker base image importing.

        `docker pull` is not secure because it doesn't verify digests before
        processing data. Instead, it "tees" the image content to the image
        processing layer and the hasher and verifies the digest matches
        expected only after all image processing has occurred. While fast
        (images don't need to be buffered before being applied), it is insecure
        because a malicious image could exploit a bug in image processing
        and take control of the Docker daemon and your machine.

        This function takes a repository name, tag prefix, URL, and a SHA-256
        hex digest as arguments and returns the Docker image ID for the image.
        The contents of the image are, of course, verified to match the digest
        before being applied.

        The imported image is "tagged" in the repository specified. The tag of
        the created image is set to the specified prefix and the SHA-256 of a
        combination of the URL and digest. This serves as a deterministic cache
        key so subsequent requests for a (url, digest) can be returned nearly
        instantly. Of course, this assumes: a) the Docker daemon and its stored
        images can be trusted b) content of URLs is constant.
        """
        tag = '%s-%s' % (tagprefix,
                         hashlib.sha256('%s%s' % (url, digest)).hexdigest())
        for image in self._get_sorted_images():
            for repotag in image['RepoTags']:
                r, t = repotag.split(':')
                if r == repository and t == tag:
                    return image['Id']

        # We didn't get a cache hit. Download the URL.
        with tempfile.NamedTemporaryFile() as fh:
            digester = hashlib.sha256()
            res = requests.get(url, stream=True)
            for chunk in res.iter_content(8192):
                fh.write(chunk)
                digester.update(chunk)

            # Verify content before doing anything with it.
            # (This is the part Docker gets wrong.)
            if digester.hexdigest() != digest:
                raise Exception('downloaded Docker image does not match '
                                'digest:  %s; got %s expected %s'
                                % (url, digester.hexdigest(), digest))

            fh.flush()
            fh.seek(0)

            # Docker 1.10 no longer appears to allow import of .xz files
            # directly. Do the decompress locally.
            if url.endswith('.xz'):
                fh = lzma.decompress(fh.read())

            res = self.api_client.import_image_from_data(
                fh, repository=repository, tag=tag)
            # docker-py doesn't parse the JSON response in what is almost
            # certainly a bug. Do it ourselves.
            return json.loads(res.strip())['status']
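The deterministic cache key described in the docstring is just the tag prefix joined to a SHA-256 over the URL and digest; a small sketch with hypothetical values:

import hashlib

tagprefix = 'build'                              # hypothetical prefix
url = 'https://example.com/base.tar.xz'          # hypothetical URL
digest = '0' * 64                                # hypothetical image digest

key_material = ('%s%s' % (url, digest)).encode('utf-8')
tag = '%s-%s' % (tagprefix, hashlib.sha256(key_material).hexdigest())
# The same (url, digest) pair always yields the same tag, so a later call can
# find the previously imported image by scanning RepoTags.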
Example #7
def _read_v2(filename):
    try:
        key = _bucket_v2.get_key(filename)
        compressed = key.get_contents_as_string()
        raw = lzma.decompress(compressed).split("\n")[:-1]
        return map(lambda x: x.split("\t", 1)[1], raw)
    except ssl.SSLError:
        return []
Example #8
def _read_v2(filename):
    try:
        key = _bucket_v2.get_key(filename)
        compressed = key.get_contents_as_string()
        raw = lzma.decompress(compressed).split("\n")[:-1]
        return map(lambda x: x.split("\t", 1)[1], raw)
    except ssl.SSLError:
        return []
Example #9
 def loads(astring):
     """Decompress and deserialize string into a Python object via pickle."""
     try:
         return pickle.loads(lzma.decompress(astring))
     except lzma.LZMAError as e:
         raise SerializerError('Cannot decompress object ("{}")'.format(
             str(e)))
     except pickle.UnpicklingError as e:
         raise SerializerError('Cannot restore object ("{}")'.format(
             str(e)))
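The matching serializer is not part of this snippet; a plausible counterpart (an assumption) pickles the object and compresses it with lzma:

def dumps(obj):
    """Hypothetical counterpart to loads(): pickle, then compress with lzma."""
    try:
        return lzma.compress(pickle.dumps(obj))
    except pickle.PicklingError as e:
        raise SerializerError('Cannot serialize object ("{}")'.format(str(e)))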
Example #10
def main():
    import sys
    import os
    import tarfile
    if sys.version_info < (3, ):
        from backports import lzma
        import urllib2
    else:
        import lzma
        from urllib.request import FancyURLopener

        class MyURLOpener(FancyURLopener):
            version = 'Mozilla/5.0'

    try:
        nim_version_string = sys.argv[1]
    except IndexError:
        nim_version = (1, 2, 6)
        nim_version_string = '.'.join([str(x) for x in nim_version])
    nim_download = 'http://nim-lang.org/download/nim-{}.tar.xz'.format(
        nim_version_string)
    print('getting', nim_download)
    inst_dir = os.path.dirname(os.path.dirname(sys.executable))
    print('inst_dir', inst_dir)
    os.chdir(inst_dir)
    if True:
        from io import BytesIO
        if sys.version_info < (3, ):
            # request = urllib2.Request(nim_download)
            # request.add_header('User-Agent', "Mozilla/5.0")
            opener = urllib2.build_opener()
            opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        else:
            opener = MyURLOpener()
        response = opener.open(nim_download)
        data = BytesIO()
        data.write(lzma.decompress(response.read()))
        data.seek(0)
        with tarfile.open(fileobj=data, mode='r') as tar:
            for tarinfo in tar:
                if '/' not in tarinfo.name:
                    continue
                name = tarinfo.name.split('/', 1)[1]
                if tarinfo.isdir():
                    if not os.path.exists(name):
                        os.mkdir(name)
                    continue
                # print('tarinfo', tarinfo.name, name, tarinfo.isdir())
                with open(name, 'wb') as fp:
                    fp.write(tar.extractfile(tarinfo).read())

    # os.system('make -j8')
    os.system('sh build.sh')
    os.system('./bin/nim c koch')
    os.system('./koch tools')
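On Python 3 the explicit lzma.decompress step can be avoided, since tarfile understands xz streams natively; a minimal sketch of that variant (unlike the loop above, it keeps the archive's top-level directory):

import io
import tarfile

def extract_tar_xz(raw_bytes, dest='.'):
    # 'r:xz' makes tarfile do the xz decompression itself (Python 3 only).
    with tarfile.open(fileobj=io.BytesIO(raw_bytes), mode='r:xz') as tar:
        tar.extractall(dest)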
Example #11
 def loads(astring):
     """Decompress and deserialize string into a Python object via pickle."""
     try:
         return pickle.loads(lzma.decompress(astring))
     except lzma.LZMAError as e:
         raise SerializerError(
             'Cannot decompress object ("{}")'.format(str(e))
         )
     except pickle.UnpicklingError as e:
         raise SerializerError(
             'Cannot restore object ("{}")'.format(str(e))
         )
Example #12
  def run(self):
    while True:
      key = self.inqueue.get()

      k = Key(self.bucket)
      k.key=key
      keyContents = k.get_contents_as_string()
      lz = lzma.decompress(keyContents, format=lzma.FORMAT_ALONE)

      for line in lz.split('\n'):
        if line != '':
            self.outqueue.put(line)
      self.inqueue.task_done()
Example #13
  def run(self):
    while True:
      key = self.inqueue.get()

      k = Key(self.bucket)
      k.key=key
      keyContents = k.get_contents_as_string()
      lz = lzma.decompress(keyContents, format=lzma.FORMAT_ALONE)

      for line in lz.split('\n'):
        if line != '':
          j = json.loads(line[37:])
          if self.measure in j['histograms'].keys():
            histo = json.dumps(j['histograms'][self.measure])
            version = j['info']['appUpdateChannel']
            branch = j['info']['appVersion']
            self.outqueue.put(histo + ',' + version + ',' + branch)
      self.inqueue.task_done()
Example #14
    def open_db(self):
        self.terms_ldb = leveldb.LevelDB(self.terms_fl)
        self.docs_ldb = leveldb.LevelDB(self.docs_fl)

        self.doc_buffer_size = 0
        self.term_buffer_size = 0

        #self.doc_flush_buffer = np.empty(self.max_doc_flush_buffer_size, dtype="S%d" % self.max_doc_size)
        self.doc_flush_buffer = [None] * self.max_doc_flush_buffer_size
        self.term_flush_buffer = np.empty(self.max_term_flush_buffer_size,
                                          dtype="S%d" % self.max_term_size)
        self.doc_id_flush_buffer = np.empty(self.max_doc_flush_buffer_size,
                                            dtype=np.int64)
        self.term_id_flush_buffer = np.empty(self.max_term_flush_buffer_size,
                                             dtype=np.int64)

        if self.compression == COMPRESSION.NONE:
            self.compress = lambda string: string
            self.decompress = lambda string: string
        elif self.compression == COMPRESSION.ZLIB:
            import zlib
            self.compress = lambda string: zlib.compress(
                string, self.compression_level)
            self.decompress = lambda string: zlib.decompress(string)
        elif self.compression == COMPRESSION.LZMA:
            import backports.lzma as lzma
            self.compress = lambda string: lzma.compress(
                bytearray(string), format=lzma.FORMAT_RAW)
            self.decompress = lambda data: lzma.decompress(
                data, format=lzma.FORMAT_RAW)
        elif self.compression == COMPRESSION.LZ4R:
            import lz4
            self.compress = lambda string: lz4.compress(string)
            self.decompress = lambda string: lz4.decompress(string)
        elif self.compression == COMPRESSION.LZ4H:
            import lz4
            self.compress = lambda string: lz4.compressHC(string)
            self.decompress = lambda string: lz4.decompress(string)
        else:
            raise Exception("Wrong compression type %r" % self.compression)
Example #15
 def decompress(data, *args, **kwargs):
     return lzma.decompress(data, *args, **kwargs)
Example #16
 def get(self, n=1):
     res = self.call('post', '/q/%s/%d' % (self.queue, n))
     return cPickle.loads(lzma.decompress(res.content))
Example #17
parser = argparse.ArgumentParser(
    __doc__, usage='python compare-compression.py PATH-TO-.fc.xz')
parser.add_argument(
    'path', help='path to an XZ-compressed file to read for compression tests')
args = parser.parse_args()

## could check something about the FCs
#from dossier.fc.feature_collection import FeatureCollectionChunk as FCChunk
#fcc = FCChunk(path)
#for fc in fcc:
#    print fc['feature_name']

## assume the incoming data is some long-term archival stuff in XZ
## compressed format, and you want to see how slow XZ is:
xz_data = open(args.path).read()
start = time.time()
data = xz.decompress(xz_data)
decompression_time = time.time() - start

start = time.time()
xz_data2 = xz.compress(data)
assert xz_data2 == xz_data
compression_time = time.time() - start


def report(rec):
    rec['MB'] = rec['bytes'] / 2**20
    rec['ratio'] = rec['bytes'] / len(data)
    ctime = rec.get('compression_time')
    rec['compression_rate'] = ctime and rec['MB'] / ctime or float('inf')
    dtime = rec.get('decompression_time')
    rec['decompression_rate'] = dtime and rec['MB'] / dtime or 0
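A hypothetical record for the xz timings measured above could be fed to report like this:

# Hypothetical usage; the original script builds its own records.
report({
    'name': 'xz',
    'bytes': len(xz_data),
    'compression_time': compression_time,
    'decompression_time': decompression_time,
})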
Example #18
def decrypt_and_uncompress(data, gpg_private=None, tmp_dir='/tmp'):
    '''
    Given a data buffer of bytes, decrypt the data using gnupg if
    `gpg_private` is provided, then uncompress it using xz.

    :returns: a tuple of (logs, data), where `logs` is an array of
      strings and `data` is a binary string

    '''
    _errors = []
    tmp_path = os.path.join(tmp_dir, 'tmp-compress-and-encrypt-path-' + uuid.uuid4().hex)
    if not os.path.exists(tmp_path):
        os.makedirs(tmp_path)
    if gpg_private is not None:
        ### setup gpg for decryption
        gpg_dir = os.path.join(tmp_path, 'gpg_dir')
        os.makedirs(gpg_dir)

        gpg_child = subprocess.Popen(
            ['gpg', '--no-permission-warning', '--homedir', gpg_dir,
             '--import', gpg_private],
            stderr=subprocess.PIPE)
        s_out, errors = gpg_child.communicate()
        if errors:
            _errors.append('gpg logs to stderr, read carefully:\n\n%s' % errors)

        ## decrypt it, and free memory
        ## encrypt using the fingerprint for our trec-kba-rsa key pair
        gpg_child = subprocess.Popen(
            ## setup gpg to decrypt with trec-kba private key
            ## (i.e. make it the recipient), with zero compression,
            ## ascii armoring is off by default, and --output - must
            ## appear before --decrypt -
            ['gpg',   '--no-permission-warning', '--homedir', gpg_dir,
             '--trust-model', 'always', '--output', '-', '--decrypt', '-'],
            stdin =subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        ## communicate with child via its stdin 
        data, errors = gpg_child.communicate(data)
        if errors:
            _errors.append(errors)

        ## remove the gpg_dir
        shutil.rmtree(gpg_dir, ignore_errors=True)

    if lzma is not None:
        data = lzma.decompress(data)

    else:
        ## launch xz child
        xz_child = subprocess.Popen(
            ['xz', '--decompress'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        ## use communicate to pass the data incrementally to the child
        ## while reading the output, to avoid blocking 
        data, errors = xz_child.communicate(data)

        assert not errors, errors

    return _errors, data
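A hypothetical call, assuming an ASCII-armored private key file on disk:

# Hypothetical usage; file names and paths are illustrative only.
with open('archive.xz.gpg', 'rb') as f:
    blob = f.read()
logs, payload = decrypt_and_uncompress(blob, gpg_private='/path/to/private-key.asc')
for line in logs:
    print(line)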
Example #19
 def _decode_lzma(self, string_data, shape, dtype):
   arr = lzma.decompress(string_data)
   arr = np.frombuffer(arr, dtype=dtype)
   return arr.reshape(shape[::-1]).T
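A matching encoder is not shown in the source; assuming the stored buffer is in Fortran order (which is what reshape(shape[::-1]).T undoes), a counterpart sketch:

 def _encode_lzma(self, arr):
   # Hypothetical counterpart to _decode_lzma; not part of the original source.
   # tobytes(order='F') mirrors the reshape(shape[::-1]).T performed on decode.
   return lzma.compress(arr.tobytes(order='F'))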
Example #20
parser = argparse.ArgumentParser(__doc__, usage='python compare-compression.py PATH-TO-.fc.xz')
parser.add_argument('path', help='path to an XZ-compressed file to read for compression tests')
args = parser.parse_args()

## could check something about the FCs
#from dossier.fc.feature_collection import FeatureCollectionChunk as FCChunk
#fcc = FCChunk(path)
#for fc in fcc:
#    print fc['feature_name']


## assume the incoming data is some long-term archival stuff in XZ
## compressed format, and you want to see how slow XZ is:
xz_data = open(args.path).read()
start = time.time()
data = xz.decompress(xz_data)
decompression_time = time.time() - start

start = time.time()
xz_data2 = xz.compress(data)
assert xz_data2 == xz_data
compression_time = time.time() - start

def report(rec):
    rec['MB'] = rec['bytes'] / 2**20
    rec['ratio'] = rec['bytes'] / len(data)
    ctime = rec.get('compression_time')
    rec['compression_rate'] = ctime and rec['MB'] / ctime or float('inf')
    dtime = rec.get('decompression_time')
    rec['decompression_rate'] = dtime and rec['MB'] / dtime or 0
    print('%(name)s:\t%(MB)d MB of FC data, %(ratio).3f compression, %(compression_time).3f seconds --> %(compression_rate).3f MB/sec compression, '