def read_featherlz4(pathstr):
    outfilename = os.path.join(tempfile.gettempdir(), '{}'.format(time.time()))
    with open(pathstr, 'rb') as file_in, open(outfilename, 'wb') as file_out:
        for chunk in lz4framed.Decompressor(file_in):
            file_out.write(chunk)
    df = feather.read_feather(outfilename, nthreads=multiprocessing.cpu_count())
    os.remove(outfilename)
    return df
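For reference, a rough sketch of the inverse helper under the same conventions. The name write_featherlz4, the use of feather.write_feather, and the 1 MiB block size are assumptions and not part of the original code; the Compressor context manager follows py-lz4framed's documented usage.

import os
import tempfile
import time

import feather
import lz4framed


def write_featherlz4(df, pathstr):
    # Write the DataFrame to a temporary uncompressed feather file first.
    tmpname = os.path.join(tempfile.gettempdir(), '{}'.format(time.time()))
    feather.write_feather(df, tmpname)
    # Re-compress the temporary file into an LZ4 frame at the target path.
    with open(tmpname, 'rb') as file_in, open(pathstr, 'wb') as file_out:
        with lz4framed.Compressor(file_out) as compressor:
            for block in iter(lambda: file_in.read(1 << 20), b''):
                compressor.update(block)
    os.remove(tmpname)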
def generator():
    try:
        url = urljoin(current_app.config["AUTOCLAVED_BASE_URL"], autoclaved_filename)
        # byte positions specified are inclusive -- https://tools.ietf.org/html/rfc7233#section-2.1
        headers = {
            "Range": "bytes={}-{}".format(frame_off, frame_off + total_frame_size - 1),
            REQID_HDR: request_id(),
        }
        r = requests.get(url, headers=headers, stream=True)
        r.raise_for_status()
        beginning = True
        # Create a copy because we are in a closure
        to_read = report_size
        while to_read > 1:
            for d in lz4framed.Decompressor(r.raw):
                if beginning and intra_off > 0:
                    d = d[intra_off:]
                if len(d) > to_read:
                    d = d[:to_read]
                # Sanity checks to ensure the streamed data start with
                # `{` and ends with `\n`
                if beginning and d[:1] != b"{":
                    raise HTTPException("Chunk starts with %r != {" % d[:1])
                if to_read == len(d) and d[-1:] != b"\n":
                    raise HTTPException("Chunk ends with %r != \\n" % d[-1:])
                yield d
                to_read -= len(d)
                if len(d):  # valid lz4 frame may have 0 bytes
                    beginning = False
        if r.raw.read(1) != b"":  # stream must be already EOFed
            raise HTTPException("Unprocessed LZ4 data left")
        # `autoclaved` file format may have `\n` in separate LZ4 frame,
        # database stores offset for JSON blobs without trailing newline,
        # here is hack adding newline as next frame boundaries are unknown.
        if to_read == 1:
            yield b"\n"
    except Exception as exc:
        raise HTTPException("Failed to fetch data: %s" % exc)
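In the surrounding Flask view (not shown), a generator like this would typically be handed to a streaming response. A minimal sketch; the wrapper name and mimetype are assumptions:

from flask import Response


def stream_measurement():
    # Hypothetical usage: stream the decompressed JSON blob to the client.
    return Response(generator(), mimetype='application/json')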
def receive(sock):
    unpickler = pickle.Unpickler(sock)
    result = OrderedDict([])
    keylist = unpickler.load()
    for col in keylist:
        data = ''
        try:
            for chunk in lz4framed.Decompressor(sock):
                data += chunk.decode('utf-8')
        except lz4framed.Lz4FramedNoDataError:
            pass
        except EOFError:
            pass
        result[col] = json.loads(data)
    return result
def receive(sock):
    unpickler = pickle.Unpickler(sock)
    result = OrderedDict([])
    keylist = unpickler.load()
    for col in keylist:
        colz = b''
        # The try must wrap the iteration itself: the Decompressor, not the
        # concatenation, raises Lz4FramedNoDataError / EOFError at end of data.
        try:
            for chunk in lz4framed.Decompressor(sock):
                colz += chunk
        except lz4framed.Lz4FramedNoDataError:
            pass
        except EOFError:
            pass
        result[col] = pickle.loads(colz)
    return result
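A minimal sketch of what the sending side might look like for the pickle-based receive() above, assuming sock is a writable file-like object and each column is sent as one LZ4 frame. The function name send and the columns parameter are assumptions; for the JSON-based variant the payload would be json.dumps(...).encode('utf-8') instead of a pickle.

import pickle

import lz4framed


def send(sock, columns):
    # `columns` is assumed to be an OrderedDict of column name -> value;
    # the key list goes out as one pickle, then each column as one LZ4 frame.
    keylist = list(columns.keys())
    pickle.Pickler(sock).dump(keylist)
    for col in keylist:
        sock.write(lz4framed.compress(pickle.dumps(columns[col])))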
def __iter__(self):
    if self._f is None:
        raise RuntimeError('lz4 file has not been opened')
    decoded = b''
    for chunk in lz4framed.Decompressor(self._f):
        decoded += chunk
        decoded = decoded.split(b'\n')
        for data in decoded[:-1]:
            if self.decode:
                yield data.decode() + '\n'
            else:
                yield data + b'\n'
        # Keep the trailing partial line for the next chunk.
        decoded = decoded[-1]
    if decoded:
        # Honour the decode flag for the final, newline-less record as well.
        yield decoded.decode() if self.decode else decoded
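This __iter__ clearly belongs to a file-like wrapper class that sets self._f and self.decode elsewhere. A minimal self-contained sketch of such a wrapper; the class name Lz4LineFile and the open()/close() methods are assumptions, not the original code:

import lz4framed


class Lz4LineFile(object):
    # Hypothetical wrapper; only `_f` and `decode` are implied by __iter__ above.
    def __init__(self, path, decode=True):
        self.path = path
        self.decode = decode
        self._f = None

    def open(self):
        self._f = open(self.path, 'rb')
        return self

    def close(self):
        if self._f is not None:
            self._f.close()
            self._f = None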
def process_iobj(self, iobj):
    """
    Processing
    :param iobj:
    :return:
    """
    input_name = self.iobj_name(iobj)
    logger.info('Processing: %s' % input_name)

    finish_file = self.get_finish_file(input_name)
    if os.path.exists(finish_file):
        logger.info('Finish indicator file exists, skipping: %s' % finish_file)
        return

    self.cur_decompressor = None
    self.cur_state_file = self.get_state_file(input_name)
    file_leafs = self.get_classification_leafs(input_name)
    file_roots = self.get_classification_roots(input_name)
    self.last_record_resumed = None
    self.processor = newline_reader.NewlineReader(is_json=False)

    handle = iobj
    name = str(iobj)

    if name.endswith('lz4'):
        self.cur_decompressor = lz4framed.Decompressor(handle)
        handle = self.cur_decompressor

    if not self.is_dry() and (not self.args.continue1
                              or not os.path.exists(file_leafs)
                              or not os.path.exists(file_roots)):
        utils.safely_remove(file_leafs)
        utils.safely_remove(file_roots)
        self.file_leafs_fh = utils.safe_open(file_leafs, mode='w', chmod=0o644)
        self.file_roots_fh = utils.safe_open(file_roots, mode='w', chmod=0o644)

    elif self.args.continue1:
        logger.info('Continuing with the started files')
        self.file_leafs_fh = open(file_leafs, mode='r+' if not self.is_dry() else 'r')
        self.file_roots_fh = open(file_roots, mode='r+' if not self.is_dry() else 'r')
        self.restore_checkpoint(iobj)
        self.continue_leafs(file_leafs)

    with iobj:
        resume_token_found = False
        resume_token = None
        resume_idx = 0
        record_ctr = -1
        already_processed = 0
        read_start = self.read_data

        for idx, record in self.processor.process(handle):
            try:
                record_ctr += 1
                self.read_data += len(record)

                # Check the checkpoint distance + boundary - process all newline chunks available
                if self.read_data - self.last_report >= 1024 * 1024 * 1024 and self.processor.step_cur_last_element:
                    logger.info('...progress: %s GB, idx: %s, pos: %s GB, '
                                'found: %s, mem: %04.8f MB, readpos: %s (%4.6f GB)'
                                % (self.read_data / 1024.0 / 1024.0 / 1024.0, idx, self.read_data,
                                   self.num_found, utils.get_mem_usage() / 1024.0, iobj.tell(),
                                   iobj.tell() / 1024.0 / 1024.0 / 1024.0))

                    self.last_report = self.read_data
                    self.try_store_checkpoint(iobj=iobj, idx=idx, resume_idx=resume_idx, resume_token=resume_token)

                    # Flush already seen IP database, not needed anymore
                    # we are too far from the resumed checkpoint
                    if read_start + 1024 * 1024 * 1024 * 2 > self.read_data:
                        self.state_loaded_ips = set()

                js = json.loads(record)
                self.process_record(idx, js)

            except Exception as e:
                logger.error('Exception in processing %d: %s' % (self.ctr, e))
                logger.debug(traceback.format_exc())
                logger.debug(record)

            self.ctr += 1

    logger.info('Total: %d' % self.ctr)
    logger.info('Total_chain: %d' % self.chain_ctr)
    logger.info('Not tls: %d' % self.not_tls)
    logger.info('Not cert ok: %d' % self.not_cert_ok)
    logger.info('Not chain ok: %d' % self.not_chain_ok)
    logger.info('Not parsed: %d' % self.not_parsed)
    logger.info('Not rsa: %d' % self.not_rsa)

    logger.info('Processed: %s' % iobj)
    if not self.is_dry():
        self.file_leafs_fh.close()
        self.file_roots_fh.close()
        utils.try_touch(finish_file)
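Both processing methods rely on newline_reader.NewlineReader, whose implementation is not shown. A rough, illustrative stand-in based only on how it is used above (process(handle) yields (index, record) pairs split on newlines); this is not the project's actual code:

def iter_newline_records(chunks):
    # `chunks` is any iterable of byte chunks, e.g. lz4framed.Decompressor(fh).
    # Accumulate decompressed data and yield (index, record) for every
    # newline-delimited record, plus a trailing partial record if any remains.
    buf = b''
    idx = 0
    for chunk in chunks:
        buf += chunk
        while b'\n' in buf:
            record, buf = buf.split(b'\n', 1)
            yield idx, record
            idx += 1
    if buf:
        yield idx, buf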
def process(self, iobj):
    """
    Process input object - read LZ4, produce metadata
    :param iobj:
    :return:
    """
    input_name = self.iobj_name(iobj)
    logger.info('Processing: %s' % input_name)

    finish_file = self.get_finish_file(input_name)
    if os.path.exists(finish_file):
        logger.info('Finish indicator file exists, skipping: %s' % finish_file)
        return

    self.cur_decompressor = None
    self.cur_state_file = self.get_state_file(input_name)
    self.processor = newline_reader.NewlineReader(is_json=False)

    if self.args.copy_dir is not None:
        copy_path = os.path.join(self.args.copy_dir, input_name)
        logger.info('Going to create a copy to %s' % copy_path)
        self.cur_copy_fh = open(copy_path, 'w')

    handle = iobj
    name = str(iobj)

    if self.cur_copy_fh is not None:
        handle = input_obj.TeeInputObject(parent_fh=handle, copy_fh=self.cur_copy_fh)

    if name.endswith('lz4'):
        self.cur_decompressor = lz4framed.Decompressor(handle)
        handle = self.cur_decompressor

    if self.args.continue1:
        logger.info('Continuing with the started files')
        self.restore_checkpoint(iobj)

    with iobj:
        record_ctr = -1
        read_start = self.read_data

        for idx, record in self.processor.process(handle):
            try:
                record_ctr += 1
                self.read_data += len(record)

                # Check the checkpoint distance + boundary - process all newline chunks available
                if self.read_data - self.last_report >= 1024 * 1024 * 1024 and self.processor.step_cur_last_element:
                    logger.info('...progress: %s GB, idx: %s, pos: %s GB, mem: %04.8f MB, readpos: %s (%4.6f GB)'
                                % (self.read_data / 1024.0 / 1024.0 / 1024.0, idx, self.read_data,
                                   utils.get_mem_usage() / 1024.0, iobj.tell(),
                                   iobj.tell() / 1024.0 / 1024.0 / 1024.0))

                    self.last_report = self.read_data
                    self.try_store_checkpoint(iobj=iobj, idx=idx)

                    # Flush already seen IP database, not needed anymore
                    # we are too far from the resumed checkpoint
                    if read_start + 1024 * 1024 * 1024 * 2 > self.read_data:
                        self.state_loaded_ips = set()

            except Exception as e:
                logger.error('Exception in processing %d: %s' % (self.ctr, e))
                logger.debug(traceback.format_exc())

            self.ctr += 1

    logger.info('Processed: %s' % iobj)
    if self.cur_copy_fh is not None:
        self.cur_copy_fh.close()

    utils.try_touch(finish_file)
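input_obj.TeeInputObject is likewise project-internal. Based only on how it is constructed above (parent_fh, copy_fh) and on the fact that the Decompressor reads from it, a plausible minimal sketch is a read-through wrapper that mirrors everything it reads into the copy handle; the class name TeeInput is an assumption:

class TeeInput(object):
    # Hypothetical stand-in for input_obj.TeeInputObject: read from parent_fh
    # and write a copy of every chunk read into copy_fh.
    def __init__(self, parent_fh, copy_fh):
        self.parent_fh = parent_fh
        self.copy_fh = copy_fh

    def read(self, size=-1):
        data = self.parent_fh.read(size)
        if data:
            self.copy_fh.write(data)
        return data

    def tell(self):
        return self.parent_fh.tell()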