Example #1
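Decompress an LZ4-framed file into a temporary file, then load it as a Feather dataframe; the decompressor is iterated chunk by chunk, so the whole compressed payload never sits in memory. (The nthreads argument matches older pyarrow; newer releases spell it use_threads.)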
import multiprocessing
import os
import tempfile
import time

import lz4framed
from pyarrow import feather  # assumed provider of read_feather

def read_featherlz4(pathstr):
    # Stream-decompress into a temp file, load as Feather, then clean up.
    outfilename = os.path.join(tempfile.gettempdir(), '{}'.format(time.time()))
    with open(pathstr, 'rb') as file_in, open(outfilename, 'wb') as file_out:
        for chunk in lz4framed.Decompressor(file_in):
            file_out.write(chunk)
    df = feather.read_feather(outfilename,
                              nthreads=multiprocessing.cpu_count())
    os.remove(outfilename)
    return df
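A round-trip writer is not part of the original; here is a minimal sketch under the same assumptions (write_featherlz4 and the 1 MiB read size are illustrative choices):

def write_featherlz4(df, pathstr):
    # Hypothetical inverse of read_featherlz4(): dump the dataframe to a
    # temporary Feather file, then LZ4-compress it to pathstr.
    tmpname = os.path.join(tempfile.gettempdir(), '{}'.format(time.time()))
    feather.write_feather(df, tmpname)
    with open(tmpname, 'rb') as file_in, open(pathstr, 'wb') as file_out:
        with lz4framed.Compressor(file_out) as compressor:
            for chunk in iter(lambda: file_in.read(1 << 20), b''):
                compressor.update(chunk)
    os.remove(tmpname)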
Example #2
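Stream a slice of a remote "autoclaved" file and decompress it on the fly: the Range request fetches whole LZ4 frames, intra_off trims into the first decompressed chunk, and sanity checks verify that the yielded JSON blob starts with { and ends with a newline.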
    def generator():
        try:
            url = urljoin(current_app.config["AUTOCLAVED_BASE_URL"],
                          autoclaved_filename)
            # byte positions specified are inclusive -- https://tools.ietf.org/html/rfc7233#section-2.1
            headers = {
                "Range":
                "bytes={}-{}".format(frame_off,
                                     frame_off + total_frame_size - 1),
                REQID_HDR:
                request_id(),
            }
            r = requests.get(url, headers=headers, stream=True)
            r.raise_for_status()
            beginning = True
            # Create a copy because we are in a closure
            to_read = report_size
            while to_read > 1:
                for d in lz4framed.Decompressor(r.raw):
                    if beginning and intra_off > 0:
                        d = d[intra_off:]
                    if len(d) > to_read:
                        d = d[:to_read]

                    # Sanity checks to ensure the streamed data start with
                    # `{` and ends with `\n`
                    if beginning and d[:1] != b"{":
                        raise HTTPException("Chunk starts with %r != {" %
                                            d[:1])
                    if to_read == len(d) and d[-1:] != b"\n":
                        raise HTTPException("Chunk ends with %r != \\n" %
                                            d[-1:])

                    yield d
                    to_read -= len(d)
                    if len(d):  # valid lz4 frame may have 0 bytes
                        beginning = False
            # `autoclaved` file format may have `\n` in separate LZ4 frame,
            # database stores offset for JSON blobs without trailing newline,
            # here is hack adding newline as next frame boundaries are unknown.
            if r.raw.read(1) != b"":  # stream must be already EOFed
                raise HTTPException("Unprocessed LZ4 data left")
            if to_read == 1:
                yield b"\n"
        except Exception as exc:
            raise HTTPException("Failed to fetch data: %s" % exc)
Example #3
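Receive a table over a file-like socket: first a pickled list of column names, then one LZ4 frame of JSON text per column, iterated until the decompressor signals end of data.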
import json
import pickle
from collections import OrderedDict

import lz4framed

def receive(sock):
    # Read the pickled column list, then one LZ4 frame of UTF-8 encoded
    # JSON per column.
    unpickler = pickle.Unpickler(sock)
    result = OrderedDict()
    keylist = unpickler.load()

    for col in keylist:
        data = b''
        try:
            for chunk in lz4framed.Decompressor(sock):
                data += chunk
        except (lz4framed.Lz4FramedNoDataError, EOFError):
            pass  # end of this column's compressed stream
        # Decode once at the end: a multi-byte UTF-8 sequence may straddle
        # chunk boundaries, so decoding chunk by chunk could raise.
        result[col] = json.loads(data.decode('utf-8'))

    return result
Example #4
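The same receiver with pickled (rather than JSON) column payloads.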
import pickle
from collections import OrderedDict

import lz4framed

def receive(sock):
    unpickler = pickle.Unpickler(sock)
    result = OrderedDict()
    keylist = unpickler.load()

    for col in keylist:
        colz = b''
        # The try/except must wrap the iteration itself: end-of-stream is
        # raised by the decompressor, not by the concatenation it guarded.
        try:
            for chunk in lz4framed.Decompressor(sock):
                colz += chunk
        except (lz4framed.Lz4FramedNoDataError, EOFError):
            pass
        result[col] = pickle.loads(colz)

    return result
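A matching sender is not shown in the source; a sketch under the same one-frame-per-column convention (send and frame are hypothetical names):

def send(sock, frame):
    # Pickle the column list first, then emit each column's pickled
    # payload as its own LZ4 frame, mirroring receive() above.
    pickle.Pickler(sock).dump(list(frame.keys()))
    for col in frame:
        with lz4framed.Compressor(sock) as compressor:
            compressor.update(pickle.dumps(frame[col]))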
Example #5
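Line-by-line iteration over an LZ4-compressed file: decompressed chunks are buffered and split on newlines, and the trailing partial line is carried into the next chunk.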
    def __iter__(self):
        if self._f is None:
            raise RuntimeError('lz4 file has not been opened')

        # Buffer decompressed bytes; yield complete newline-terminated
        # lines and carry the trailing fragment over to the next chunk.
        buffered = b''
        for chunk in lz4framed.Decompressor(self._f):
            buffered += chunk
            lines = buffered.split(b'\n')
            for data in lines[:-1]:
                if self.decode:
                    yield data.decode() + '\n'
                else:
                    yield data + b'\n'

            buffered = lines[-1]

        # Flush a final line that lacks a trailing newline, honouring the
        # decode flag.
        if buffered:
            yield buffered.decode() if self.decode else buffered
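A minimal host class for this iterator, as a sketch: everything except _f and decode (which the method requires) is hypothetical.

import lz4framed

class Lz4LineFile(object):
    # Hypothetical wrapper around an LZ4-framed text file; the __iter__
    # above would be a method of this class.
    def __init__(self, path, decode=True):
        self.path = path
        self.decode = decode
        self._f = None

    def open(self):
        # Binary mode: lz4framed.Decompressor reads raw frames from _f.
        self._f = open(self.path, 'rb')
        return self

    def close(self):
        if self._f is not None:
            self._f.close()
            self._f = None

Example #6
Process one input object in a certificate-classification pipeline: the raw handle is swapped for a streaming lz4framed.Decompressor when the input name ends in 'lz4', records are parsed as newline-delimited JSON, and progress is checkpointed roughly every gigabyte so a run can resume.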
    def process_iobj(self, iobj):
        """
        Processing
        :param iobj: 
        :return: 
        """
        input_name = self.iobj_name(iobj)
        logger.info('Processing: %s' % input_name)

        finish_file = self.get_finish_file(input_name)
        if os.path.exists(finish_file):
            logger.info('Finish indicator file exists, skipping: %s' %
                        finish_file)
            return

        self.cur_decompressor = None
        self.cur_state_file = self.get_state_file(input_name)
        file_leafs = self.get_classification_leafs(input_name)
        file_roots = self.get_classification_roots(input_name)
        self.last_record_resumed = None

        self.processor = newline_reader.NewlineReader(is_json=False)
        handle = iobj
        name = str(iobj)

        if name.endswith('lz4'):
            self.cur_decompressor = lz4framed.Decompressor(handle)
            handle = self.cur_decompressor

        if not self.is_dry() and (not self.args.continue1
                                  or not os.path.exists(file_leafs)
                                  or not os.path.exists(file_roots)):
            utils.safely_remove(file_leafs)
            utils.safely_remove(file_roots)
            self.file_leafs_fh = utils.safe_open(file_leafs,
                                                 mode='w',
                                                 chmod=0o644)
            self.file_roots_fh = utils.safe_open(file_roots,
                                                 mode='w',
                                                 chmod=0o644)

        elif self.args.continue1:
            logger.info('Continuing with the started files')
            self.file_leafs_fh = open(file_leafs,
                                      mode='r+' if not self.is_dry() else 'r')
            self.file_roots_fh = open(file_roots,
                                      mode='r+' if not self.is_dry() else 'r')
            self.restore_checkpoint(iobj)
            self.continue_leafs(file_leafs)

        with iobj:
            resume_token_found = False
            resume_token = None
            resume_idx = 0
            record_ctr = -1
            already_processed = 0
            read_start = self.read_data
            for idx, record in self.processor.process(handle):
                try:
                    record_ctr += 1
                    self.read_data += len(record)

                    # Check the checkpoint distance + boundary - process all newline chunks available
                    if self.read_data - self.last_report >= 1024 * 1024 * 1024 and self.processor.step_cur_last_element:
                        logger.info(
                            '...progress: %s GB, idx: %s, pos: %s GB, '
                            'found: %s, mem: %04.8f MB, readpos: %s (%4.6f GB)'
                            % (self.read_data / 1024.0 / 1024.0 / 1024.0, idx,
                               self.read_data, self.num_found,
                               utils.get_mem_usage() / 1024.0, iobj.tell(),
                               iobj.tell() / 1024.0 / 1024.0 / 1024.0))

                        self.last_report = self.read_data
                        self.try_store_checkpoint(iobj=iobj,
                                                  idx=idx,
                                                  resume_idx=resume_idx,
                                                  resume_token=resume_token)

                        # Flush the already-seen IP database once we are far
                        # (over 2 GB) past the resumed checkpoint; it is not
                        # needed anymore
                        if self.read_data - read_start > 1024 * 1024 * 1024 * 2:
                            self.state_loaded_ips = set()

                    js = json.loads(record)
                    self.process_record(idx, js)

                except Exception as e:
                    logger.error('Exception in processing %d: %s' %
                                 (self.ctr, e))
                    logger.debug(traceback.format_exc())
                    logger.debug(record)

                self.ctr += 1

            logger.info('Total: %d' % self.ctr)
            logger.info('Total_chain: %d' % self.chain_ctr)
            logger.info('Not tls: %d' % self.not_tls)
            logger.info('Not cert ok: %d' % self.not_cert_ok)
            logger.info('Not chain ok: %d' % self.not_chain_ok)
            logger.info('Not parsed: %d' % self.not_parsed)
            logger.info('Not rsa: %d' % self.not_rsa)

        logger.info('Processed: %s' % iobj)
        if not self.is_dry():
            self.file_leafs_fh.close()
            self.file_roots_fh.close()
            utils.try_touch(finish_file)
Example #7
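A leaner sibling of Example #6 from the same codebase: metadata extraction only, with the same decompressor swap and checkpointing, plus an optional tee (TeeInputObject) that mirrors the raw input into copy_dir while it is read.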
    def process(self, iobj):
        """
        Process input object - read LZ4, produce metadata
        :param iobj: 
        :return: 
        """
        input_name = self.iobj_name(iobj)
        logger.info('Processing: %s' % input_name)

        finish_file = self.get_finish_file(input_name)
        if os.path.exists(finish_file):
            logger.info('Finish indicator file exists, skipping: %s' %
                        finish_file)
            return

        self.cur_decompressor = None
        self.cur_state_file = self.get_state_file(input_name)
        self.processor = newline_reader.NewlineReader(is_json=False)
        if self.args.copy_dir is not None:
            copy_path = os.path.join(self.args.copy_dir, input_name)
            logger.info('Going to create a copy to %s' % copy_path)
            self.cur_copy_fh = open(copy_path, 'wb')  # binary: the tee writes raw LZ4 bytes

        handle = iobj
        name = str(iobj)

        if self.cur_copy_fh is not None:
            handle = input_obj.TeeInputObject(parent_fh=handle,
                                              copy_fh=self.cur_copy_fh)

        if name.endswith('lz4'):
            self.cur_decompressor = lz4framed.Decompressor(handle)
            handle = self.cur_decompressor

        if self.args.continue1:
            logger.info('Continuing with the started files')
            self.restore_checkpoint(iobj)

        with iobj:
            record_ctr = -1
            read_start = self.read_data
            for idx, record in self.processor.process(handle):
                try:
                    record_ctr += 1
                    self.read_data += len(record)

                    # Check the checkpoint distance + boundary - process all newline chunks available
                    if self.read_data - self.last_report >= 1024 * 1024 * 1024 and self.processor.step_cur_last_element:

                        logger.info(
                            '...progress: %s GB, idx: %s, pos: %s GB, mem: %04.8f MB, readpos: %s (%4.6f GB)'
                            % (self.read_data / 1024.0 / 1024.0 / 1024.0, idx,
                               self.read_data, utils.get_mem_usage() / 1024.0,
                               iobj.tell(),
                               iobj.tell() / 1024.0 / 1024.0 / 1024.0))

                        self.last_report = self.read_data
                        self.try_store_checkpoint(iobj=iobj, idx=idx)

                        # Flush the already-seen IP database once we are far
                        # (over 2 GB) past the resumed checkpoint; it is not
                        # needed anymore
                        if self.read_data - read_start > 1024 * 1024 * 1024 * 2:
                            self.state_loaded_ips = set()

                except Exception as e:
                    logger.error('Exception in processing %d: %s' %
                                 (self.ctr, e))
                    logger.debug(traceback.format_exc())

                self.ctr += 1

        logger.info('Processed: %s' % iobj)

        if self.cur_copy_fh is not None:
            self.cur_copy_fh.close()
        utils.try_touch(finish_file)
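The idiom shared by Examples #6 and #7, distilled (the function name is illustrative):

import lz4framed

def open_stream(iobj):
    # Swap the raw handle for a streaming Decompressor only when the
    # object's name marks it as LZ4-framed; callers then read from the
    # returned handle identically in both cases.
    handle = iobj
    if str(iobj).endswith('lz4'):
        handle = lz4framed.Decompressor(handle)
    return handle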