def dataentry(self):
    self.toaster.msgblockbegin("writing to temporary file")
    f_tmp = TemporaryFile()
    try:
        total_padding = self.data.write(f_tmp)
        # comparing the files will usually be different because blocks may
        # have been written back in a different order, so cheaply just compare
        # file sizes
        self.toaster.msg("comparing file sizes")
        self.stream.seek(0, 2)
        f_tmp.seek(0, 2)
        if self.stream.tell() != f_tmp.tell():
            self.toaster.msg("original size: %i" % self.stream.tell())
            self.toaster.msg("written size: %i" % f_tmp.tell())
            self.toaster.msg("padding: %i" % total_padding)
            if self.stream.tell() > f_tmp.tell() \
                    or self.stream.tell() + total_padding < f_tmp.tell():
                f_tmp.seek(0)
                f_debug = open("debug.cgf", "wb")
                f_debug.write(f_tmp.read(-1))
                f_debug.close()
                raise Exception('write check failed: file sizes differ by more than padding')
    finally:
        f_tmp.close()
    self.toaster.msgblockend()
    # spell is finished: prevent recursing into the tree
    return False
def test_execute_non_blocking(self):
    self.sandbox = Sandbox()
    output = TemporaryFile(mode="w+b")
    start_time = perf_counter()
    self.sandbox.execute(command="sleep 0.2 ; echo foo",
                         stdin_fd=None, stdout_fd=output, stderr_fd=None,
                         blocking=False)
    self.assertLess(perf_counter() - start_time, 0.1)
    self.assertEqual(output.tell(), 0)  # hasn't printed anything yet
    sleep(0.3)
    self.assertEqual(output.tell(), 4)  # but prints it eventually
class T(threading.Thread):
    _shutdown_msg = "shutdown"

    def __init__(self):
        threading.Thread.__init__(self)
        self._fd = TemporaryFile()
        self._comm_fd = TemporaryFile()
        self._run = False

    def get_file_handle(self):
        return self._fd

    def run(self):
        self._run = True
        while self._run:
            t1 = time.time()
            r, _, _ = select.select([self._fd.fileno(), self._comm_fd.fileno()], [], [])
            print "select time:", time.time() - t1
            for elem in r:
                if elem == self._fd.fileno():
                    s = self._fd.tell()
                    self._fd.seek(0, os.SEEK_END)  # to the end
                    e = self._fd.tell()
                    if s == e:  # nothing new
                        continue
                    self._fd.seek(-(e - s), os.SEEK_END)
                    diff = self._fd.read(e - s)
                    if True:
                        sys.stdout.write(diff)
                        sys.stdout.flush()
                # exit
                elif elem == self._comm_fd.fileno():
                    self._comm_fd.seek(0, os.SEEK_END)
                    if self._comm_fd.tell() == len(T._shutdown_msg):
                        self._run = False
                        # acknowledge shutdown so stop() can stop polling
                        self._comm_fd.write(T._shutdown_msg)
                        self._comm_fd.flush()

    def stop(self):
        self._comm_fd.seek(0, os.SEEK_END)
        if self._comm_fd.tell() != 0:
            return
        self._comm_fd.write(T._shutdown_msg)
        self._comm_fd.flush()
        while self._comm_fd.tell() != 2 * len(T._shutdown_msg):
            self._comm_fd.seek(0, os.SEEK_END)

    def __del__(self):
        self._fd.close()
def books(self, oncard=False, end_session=True):
    """
    Return a list of ebooks on the device.

    @param oncard: If True return a list of ebooks on the storage card,
                   otherwise return list of ebooks in main memory of device
    @return: L{BookList}
    """
    root = "/Data/media/"
    tfile = TemporaryFile()
    if oncard:
        try:
            self.get_file("a:" + self.CACHE_XML, tfile, end_session=False)
            root = "a:/"
        except PathError:
            try:
                self.get_file("b:" + self.CACHE_XML, tfile, end_session=False)
                root = "b:/"
            except PathError:
                pass
        if tfile.tell() == 0:
            tfile = None
    else:
        self.get_file(self.MEDIA_XML, tfile, end_session=False)
    bl = BookList(root=root, sfile=tfile)
    paths = bl.purge_corrupted_files()
    for path in paths:
        try:
            self.del_file(path, end_session=False)
        except PathError:
            # In case this is a refetch without a sync in between
            continue
    return bl
def write_lines(self, key, lines):
    self._verify_key_format(key)
    storage = self.bucket.new_key(key + ".json.gz")

    buff = TemporaryFile()
    archive = gzip.GzipFile(fileobj=buff, mode='w')
    count = 0
    for l in lines:
        if hasattr(l, "__iter__"):
            for ll in l:
                archive.write(ll.encode("utf8"))
                archive.write(b"\n")
                count += 1
        else:
            archive.write(l.encode("utf8"))
            archive.write(b"\n")
            count += 1
    archive.close()
    file_length = buff.tell()

    retry = 3
    while retry:
        try:
            with Timer("Sending {{count}} lines in {{file_length|comma}} bytes",
                       {"file_length": file_length, "count": count},
                       debug=self.settings.debug):
                buff.seek(0)
                storage.set_contents_from_file(buff)
            break
        except Exception, e:
            Log.warning("could not push data to s3", cause=e)
            retry -= 1
class CandidateUploadFile(BaseHandler):
    def initialize(self):
        self.tempfile = TemporaryFile()

    @tornado.web.authenticated
    @granted()
    @tornado.web.asynchronous
    def post(self):
        fp_url = self.get_argument("url")
        mime_type = self.get_argument("data[type]")
        size = int(self.get_argument("data[size]"))
        candidate_id = self.get_argument("id")
        self.candidate = self.db.query(Candidate).get(int(candidate_id))
        logging.info("type: %s, size: %r", mime_type, size)
        if mime_type == "image/jpeg" and size < MAX_UPLOAD_SIZE:
            http_client = tornado.httpclient.AsyncHTTPClient()
            request = tornado.httpclient.HTTPRequest(
                url=fp_url, streaming_callback=self.streaming_callback)
            http_client.fetch(request, self.on_download)
        else:
            self.finish(dict(status=0))

    def streaming_callback(self, data):
        self.tempfile.write(data)
        logging.info("This is the streaming_callback file tell function: %r",
                     self.tempfile.tell())

    def on_download(self, response):
        img_path = os.path.join(
            os.path.dirname(__file__),
            "static/profiles/img/" + str(self.candidate.backup_id) + '.jpg')
        self.tempfile.seek(0)
        ptr = open(img_path, 'wb')
        ptr.write(self.tempfile.read())
        ptr.close()
        self.tempfile.close()
        self.finish(dict(
            src="/static/profiles/img/" + str(self.candidate.backup_id) + '.jpg',
            status=1))
def output(x, y, value):
    global img, index
    pixel = index // 3
    x = x  # pixel % W
    y = y  # pixel // W
    rgbindex = index % 3
    pixelvalue = list(img.getpixel((x, y)))
    pixelvalue[rgbindex] = value
    pixelvalue = tuple(pixelvalue)
    colorcounter[str((x, y))] += 1
    # if rgbindex == 2:
    #     colorcounter[pixelvalue] += 1
    img.putpixel((x, y), pixelvalue)
    index += 1
    if index // 3 >= W * H:
        tmpfile = TemporaryFile()
        img.save(tmpfile, "png")
        compressed_imgbytes = tmpfile.tell()
        imgbytes = W * H * 3
        ratio = compressed_imgbytes / imgbytes
        print("Result .png compression ratio:", ratio)
        img = img.resize((W * SCALE, H * SCALE))
        img.save(IMAGEDIR + "/" + str(int(time() * 1000)) + ".png")
        exit(0)
def retr(self, path, retry=3):
    """
    Retrieve a file into a temporary file.

    :param path:
    :param retry:
    :return:
    """
    tmp = TemporaryFile()
    try:
        size = self.ftp.size(path)
        self.ftp.retrbinary('RETR ' + path, tmp.write)
    except Exception:
        if retry > 0:
            tmp.close()
            # Recurse, with up to `retry` attempts
            return self.retr(path, retry - 1)
        else:
            tmp.close()
            return None
    # Compare the size of the temporary file we created with the size of the original file
    if size != tmp.tell():
        if retry > 0:
            tmp.close()
            return self.retr(path, retry - 1)
        else:
            tmp.close()
            return None
    return tmp
def read_file(self, data):
    temp_file = TemporaryFile(mode="w+b")
    if "content-length" in self.current_headers:
        temp_file.write(data.read(self.current_headers["content-length"]))
    else:
        bytes = data.readline()
        while not bytes[-2:] == "\r\n":
            temp_file.write(bytes)
            bytes = data.readline()
        temp_file.write(bytes.rstrip())
    filesize = temp_file.tell()
    if filesize == 0:
        self.read_boundry(data)
        return
    key = self.current_headers["content-disposition"]["name"]
    filename = self.current_headers["content-disposition"].get("filename", "")
    content_type = self.current_headers["content-type"]
    if key not in self.files:
        self.files[key] = []
    temp_file.seek(0)
    self.files[key].append({"filename": filename, "filesize": filesize,
                            "content-type": content_type, "data": temp_file})
    self.read_boundry(data)
def index_html(self, icon=0, preview=0, width=None, height=None, REQUEST=None):
    """ Return the file with its corresponding MIME-type """
    if REQUEST is not None:
        if self._if_modified_since_request_handler(REQUEST):
            self.ZCacheable_set(None)
            return ''
        if self._redirect_default_view_request_handler(icon, preview, REQUEST):
            return ''
    filename, content_type, icon, preview = self._get_file_to_serve(icon, preview)
    filename = self._get_fsname(filename)
    if _debug > 1:
        logger.info('serving %s, %s, %s, %s' % (filename, content_type, icon, preview))
    if filename:
        size = os.stat(filename)[6]
    else:
        filename = self._get_icon_file(broken=True)
        size = os.stat(filename)[6]
        content_type = 'image/gif'
        icon = 1
    if icon == 0 and width is not None and height is not None:
        data = TemporaryFile()  # hold resized image
        try:
            from PIL import Image
            im = Image.open(filename)
            if im.mode != 'RGB':
                im = im.convert('RGB')
            filter = Image.BICUBIC
            if hasattr(Image, 'ANTIALIAS'):  # PIL 1.1.3
                filter = Image.ANTIALIAS
            im = im.resize((int(width), int(height)), filter)
            im.save(data, 'JPEG', quality=85)
        except:
            data = open(filename, 'rb')
        else:
            data.seek(0, 2)
            size = data.tell()
            data.seek(0)
            content_type = 'image/jpeg'
    else:
        data = open(filename, 'rb')
    if REQUEST is not None:
        last_mod = rfc1123_date(self._p_mtime)
        REQUEST.RESPONSE.setHeader('Last-Modified', last_mod)
        REQUEST.RESPONSE.setHeader('Content-Type', content_type)
        REQUEST.RESPONSE.setHeader('Content-Length', size)
        self.ZCacheable_set(None)
        return stream_iterator(data)
    try:
        return data.read()
    finally:
        data.close()
def savefile(fd, fname, bfirmid, bclientid):
    # Encrypt each chunk from fd as it is read into a
    # tmpfile which will be uploaded to Dropbox using
    # the given filename.
    r = requests.get("%s/keyserv/key/%s/%s" % (app.config['KEYSERVER_URI'], bfirmid, bclientid))
    print "%s/keyserv/key/%s/%s" % (app.config['KEYSERVER_URI'], bfirmid, bclientid)
    keyobj = r.json()
    encrkey = keyobj['key']
    print "Got key %s" % encrkey

    # Carve out a 32byte/256 bit key from the keyserver
    # but convert base64 back to binary first
    bkey = binascii.a2b_base64(encrkey)
    key = bkey[0:32]

    try:
        print "Starting encryption"
        # Setup our AES cipher
        iv = Random.new().read(AES.block_size)
        cipher = AES.new(key, AES.MODE_CFB, iv)
        #cipher = XORCipher.new(key)
        print "Cipher created using iv %s" % binascii.hexlify(iv)
    except:
        raise

    try:
        f = TemporaryFile()
        f.write(iv)
        for chunk in chunkfd(fd, blocksize=4194304):
            f.write(cipher.encrypt(chunk))
        f.flush()
        f.seek(0, os.SEEK_END)
        fsize = f.tell()
        f.seek(0)
    except Exception as e:
        print e

    print "Getting ready for Dropbox upload"
    # Get a Dropbox uploader
    try:
        access_token = config.get('Credentials', 'access_token')
        dclient = DropboxClient(access_token)
        uploader = dclient.get_chunked_uploader(f, fsize)
        while uploader.offset < fsize:
            try:
                upload = uploader.upload_chunked()
            except Exception as e:
                print e
    except Exception as e:
        print e
    f.close()
    return uploader.finish(secure_filename("/%s_encr" % fname))
def thumb_img(img, width=None, height=None, name='thumb.jpg'):
    io = TemporaryFile()
    thumb = img.copy()
    thumb.thumbnail(image_width_height(img, width, height), Image.ANTIALIAS)
    thumb.save(io, format='JPEG', quality=100)
    del thumb
    size = io.tell()
    io.seek(0)
    return InMemoryUploadedFile(io, None, name, 'image/jpeg', size, None)
def exportContentInTempFile(self, context, obj_paths=None, filename=None):
    """ Export content to a zip file. """
    objects_list = self._createObjectList(context, obj_paths)
    tfile = TemporaryFile()
    self._getAllObjectsData(context, objects_list, tfile)
    size = tfile.tell()
    tfile.seek(0)
    return tfile, size
class GmlZBuffer(io.IOBase):
    '''
    '''

    def __init__(self):
        self.buffer = StringIO()
        self.compressor = zlib.compressobj(6, ZIP_DEFLATED, -9)
        self.overflow = False
        self.crc = 0
        self.uncompressed_size = 0
        self.compressed_size = 0
        self.compressed_chunk_mark = 0
        self.mark = 0  # flush() reads self.mark before assigning it, so it needs an initial value

    def write(self, data):
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        self.uncompressed_size += len(data)
        compressed_data = self.compressor.compress(data)
        if not (compressed_data and len(compressed_data)):
            return 0
        compressed_size = len(compressed_data)
        self._prepare_write(compressed_size)
        self.buffer.write(compressed_data)
        self.compressed_size += compressed_size
        return compressed_size

    def _prepare_write(self, length):
        if self.overflow:
            return
        if self.buffer.tell() + length > _GMLZBUFFER_MAX_SIZE:
            # spill the in-memory buffer to a temporary file once it grows too large
            data = self.buffer.getvalue()
            self.buffer.close()
            self.buffer = TemporaryFile()
            self.buffer.write(data)
            self.overflow = True

    def flush(self):
        prev_mark = self.mark
        compressed_data = self.compressor.flush(zlib.Z_FULL_FLUSH)
        compressed_size = len(compressed_data)
        self._prepare_write(compressed_size)
        self.buffer.write(compressed_data)
        self.compressed_size += compressed_size
        self.mark = self.buffer.tell()
        return prev_mark, self.mark
def test_execute_blocking(self):
    self.sandbox = Sandbox()
    output = TemporaryFile(mode="w+b")
    start_time = perf_counter()
    self.sandbox.execute(command="sleep 0.2 ; echo foo",
                         stdin_fd=None, stdout_fd=output, stderr_fd=None,
                         blocking=True)
    self.assertGreaterEqual(perf_counter() - start_time, 0.2)
    self.assertEqual(output.tell(), 4)  # Already printed "foo\n"
def handleExport(self, action):
    data, errors = self.extractData()
    if errors:
        self.status = self.formErrorsMessage
        return
    if data['paths']:
        objs = data['paths']
    else:
        objs = [self.context]

    message = model.MessageT1()
    message.action = data['action']
    message.recipient_id = data['recipients']
    if data['subject'] is not None:
        message.subjects = [data['subject']]
    if data['comment'] is not None:
        message.comments = [data['comment']]
    if data['directive']:
        directive = model.Directive(data['directive'])
        directive.priority = data['priority']
        directive.deadline = data['deadline']
        message.directive = directive

    journal_entry = _(u'label_exported_as_ech0147',
                      default=u'Exported as eCH-0147 message')
    for obj in objs:
        message.add_object(obj)
        journal_entry_factory(obj, 'eCH-0147 Export', journal_entry)

    header_dom = message.header().toDOM(element_name='eCH-0147T0:header')
    message_dom = message.binding().toDOM()

    tmpfile = TemporaryFile()
    with ZipFile(tmpfile, 'w', ZIP_DEFLATED, True) as zipfile:
        zipfile.writestr('header.xml', header_dom.toprettyxml(encoding='UTF-8'))
        zipfile.writestr('message.xml', message_dom.toprettyxml(encoding='UTF-8'))
        message.add_to_zip(zipfile)

    size = tmpfile.tell()
    response = self.request.response
    response.setHeader("Content-Disposition", 'inline; filename="message.zip"')
    response.setHeader("Content-type", "application/zip")
    response.setHeader("Content-Length", size)
    self.response_body = TempfileStreamIterator(tmpfile, size)
def check_requirements():
    "Check requirements"
    output = TemporaryFile(mode='rwt')
    pos = 0
    for req in REQUIREMENTS:
        if 0 != call(['which', req], stdout=output, stderr=output):
            # get call output
            output.seek(pos)
            err = output.read()
            print "ERROR: %s is not satisfied (%s)" % (req, err)
            sys.exit(1)
        pos = output.tell()
def test_dup_stdout(selenium):
    # Test redirecting stdout using low level os.dup operations.
    # This sort of redirection is used in pytest.
    import os
    import sys
    from tempfile import TemporaryFile

    tf = TemporaryFile(buffering=0)
    save_stdout = os.dup(sys.stdout.fileno())
    os.dup2(tf.fileno(), sys.stdout.fileno())
    print("hi!!")
    print("there...")
    assert tf.tell() == len("hi!!\nthere...\n")
    os.dup2(save_stdout, sys.stdout.fileno())
    print("not captured")
    os.dup2(tf.fileno(), sys.stdout.fileno())
    print("captured")
    assert tf.tell() == len("hi!!\nthere...\ncaptured\n")
    os.dup2(save_stdout, sys.stdout.fileno())
    os.close(save_stdout)
    tf.seek(0)
    assert tf.read(1000).decode() == "hi!!\nthere...\ncaptured\n"
class BadBoyResponseFilter:
    def __init__(self, client):
        if not os.path.exists(BAD_CONTENT_TMP_DIR):
            try:
                os.makedirs(BAD_CONTENT_TMP_DIR)
            except:
                pass
        self.fd_orig = TemporaryFile(mode='rw+b', dir=BAD_CONTENT_TMP_DIR)
        self.fd_filtered = TemporaryFile(mode='rw+b', dir=BAD_CONTENT_TMP_DIR)
        self.client = client

    def feed(self, data):
        self.fd_orig.write(data)

    def filter(self):
        pass

    def send_response(self):
        self.fd_orig.seek(0)
        self.filter()
        self.client.father.transport.write(self.client.bb_status)
        for key, value in self.client.bb_headers:
            if key.lower() == "content-length":
                value = self.fd_filtered.tell()
            self.client.father.transport.write("%s: %s\r\n" % (key, value))
        self.client.father.transport.write("\r\n")
        file_len = self.fd_filtered.tell()
        self.fd_filtered.seek(0)
        while self.fd_filtered.tell() < file_len:
            self.client.father.transport.write(self.fd_filtered.read(1024))
        self.fd_orig.close()
        self.fd_filtered.close()
def thumb_crop_img(img, width=None, height=None, name='thumb.jpg'):
    """ Resize the image, and crop it if the proportions require it """
    io = TemporaryFile()
    thumb = img.copy()
    thumb.thumbnail(image_width_height(img, width=width), Image.ANTIALIAS)
    if thumb.size[1] >= height:
        thumb = thumb.crop((0, 0, width, height))
    else:
        thumb = thumb.resize((width, height), Image.ANTIALIAS)
    thumb.save(io, format='JPEG', quality=100)
    del thumb
    size = io.tell()
    io.seek(0)
    return InMemoryUploadedFile(io, None, name, 'image/jpeg', size, None)
def write_lines(self, key, lines):
    self._verify_key_format(key)
    storage = self.bucket.new_key(key + ".json.gz")

    buff = TemporaryFile()
    archive = gzip.GzipFile(fileobj=buff, mode='w')
    count = 0
    for l in lines:
        if hasattr(l, "__iter__"):
            for ll in l:
                archive.write(ll.encode("utf8"))
                archive.write(b"\n")
                count += 1
        else:
            archive.write(l.encode("utf8"))
            archive.write(b"\n")
            count += 1
    archive.close()
    file_length = buff.tell()

    retry = 3
    while retry:
        try:
            with Timer(
                    "Sending {{count}} lines in {{file_length|comma}} bytes for {{key}}",
                    {"key": key, "file_length": file_length, "count": count},
                    verbose=self.settings.debug):
                buff.seek(0)
                storage.set_contents_from_file(buff)
            break
        except Exception as e:
            e = Except.wrap(e)
            retry -= 1
            if retry == 0 or 'Access Denied' in e or "No space left on device" in e:
                Log.error("could not push data to s3", cause=e)
            else:
                Log.warning("could not push data to s3", cause=e)

    if self.settings.public:
        storage.set_acl('public-read')
    return
def compress(self, path, writer, ext, mimetype):
    tf = TemporaryFile()
    writer(path, tf)
    tf.seek(0, io.SEEK_END)
    size = tf.tell()
    tf.seek(0)
    name = path.name if path.name else "top"
    resp = self.response_json(200, "OK")
    self.response = "200 OK"
    self.headers = [
        ("Content-length", str(size)),
        ("Content-type", mimetype),
        ("Content-disposition", "attachment; filename=" + name + ext),
    ]
    self.result = FileWrapper(tf)
def test_dup_temp_file(selenium):
    # See https://github.com/emscripten-core/emscripten/issues/15012
    import os
    from tempfile import TemporaryFile

    tf = TemporaryFile(buffering=0)
    fd1 = os.dup(tf.fileno())
    os.dup2(tf.fileno(), 50)
    s = b"hello there!"
    tf.write(s)
    tf2 = open(fd1, "w+")
    assert tf2.tell() == len(s)
    # This next assertion actually demonstrates a bug in dup: the correct value
    # to return should be b"".
    assert os.read(fd1, 50) == b""
    tf2.seek(1)
    assert tf.tell() == 1
    assert tf.read(100) == b"ello there!"
def retr(ftp, path, retry=3):
    # retrieve file via FTP and return it as a temporary file
    tmp = TemporaryFile()
    try:
        size = ftp.size(path)
        ftp.retrbinary('RETR ' + path, tmp.write)
    except:
        if retry > 0:
            tmp.close()
            # recursively call retr until retry reaches 0 (up to `retry` attempts)
            return retr(ftp, path, retry - 1)
        else:
            tmp.close()
            return None
    if size != tmp.tell():  # check if downloaded file size != file size on ftp
        if retry > 0:
            tmp.close()
            return retr(ftp, path, retry - 1)
        else:
            tmp.close()
            return None
    return tmp
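# A minimal, hypothetical driver for the retr() helper above. The host and
# remote path are placeholders; only the standard-library ftplib client is assumed.
from ftplib import FTP

ftp = FTP('ftp.example.com')           # placeholder host
ftp.login()                            # anonymous login
tmp = retr(ftp, 'pub/somefile.bin')    # a TemporaryFile, or None after the retries are exhausted
if tmp is not None:
    tmp.seek(0)                        # retr() leaves the position at end-of-file after its size check
    data = tmp.read()
    tmp.close()
ftp.quit()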
class ContentReceiver:
    "Write-only file object used to receive data from FTP"

    def __init__(self, callback, *args):
        from tempfile import TemporaryFile
        self.data = TemporaryFile('w+b')
        self.callback = callback
        self.args = args

    def write(self, data):
        self.data.write(data)

    def close(self):
        size = self.data.tell()
        self.data.seek(0)
        args = self.args + (self.data, size)
        c = self.callback
        self.callback = None
        self.args = None
        c(*args)
def write_lines(self, key, lines):
    self._verify_key_format(key)
    storage = self.bucket.new_key(key + ".json.gz")

    buff = TemporaryFile()
    archive = gzip.GzipFile(fileobj=buff, mode='w')
    count = 0
    for l in lines:
        if hasattr(l, "__iter__"):
            for ll in l:
                archive.write(ll.encode("utf8"))
                archive.write(b"\n")
                count += 1
        else:
            archive.write(l.encode("utf8"))
            archive.write(b"\n")
            count += 1
    archive.close()
    file_length = buff.tell()

    retry = 3
    while retry:
        try:
            with Timer("Sending {{count}} lines in {{file_length|comma}} bytes",
                       {"file_length": file_length, "count": count},
                       silent=not self.settings.debug):
                buff.seek(0)
                storage.set_contents_from_file(buff)
            break
        except Exception as e:
            e = Except.wrap(e)
            retry -= 1
            if retry == 0 or 'Access Denied' in e or "No space left on device" in e:
                Log.error("could not push data to s3", cause=e)
            else:
                Log.warning("could not push data to s3", cause=e)

    if self.settings.public:
        storage.set_acl('public-read')
    return
def run(self):
    while self.isRunning:
        tmp = TemporaryFile()
        try:
            masscan = 'masscan' if config.OS == 'windows' else 'sudo masscan'
            self.sub = Popen(f'{masscan} {" ".join(self.parameters)}',
                             stdout=tmp, stderr=STDOUT, shell=True)
            while self.sub.poll() is None:
                where = tmp.tell()
                lines = tmp.read()
                if not lines:
                    sleep(0.01)
                    tmp.seek(where)
                else:
                    lines = lines.decode().split('\n')
                    for line in lines:
                        if line.startswith('Discovered'):
                            self.analyse_queue.put(self._line_processing(line))
                            progress.increment('discover_total')
                        elif line.startswith('rate'):
                            state = re.findall(r'\d{1,2}.\d{1,2}%', line)
                            state = ''.join(state).replace('%', '')
                            self.sig.send_change_progressBar(int(float(state)))
                            self.sig.change_actual_action.emit(line)
            sys.__stdout__.write((tmp.read()).decode())
            sys.__stdout__.flush()
        except Exception as e:
            print(f'{__name__} - {e}')
        finally:
            self.sub.kill()
            progress.increment('action', value='discovering')
            self.sig.change_actual_action.emit('Discovering...')
            break
def index_html(self, icon=0, preview=0, width=None, height=None,
               as_attachment=False, REQUEST=None):
    """Return the file with its corresponding MIME-type.

    @param as_attachment: if not None, return the file as an attachment
        using its title or id as a suggested filename; see RFC 2616
        section 19.5.1 for more details
    """
    if REQUEST is not None:
        if self._if_modified_since_request_handler(REQUEST):
            self.ZCacheable_set(None)
            return ""
        if self._redirect_default_view_request_handler(icon, preview, REQUEST):
            return ""
    filename, content_type, icon, preview = self._get_file_to_serve(icon, preview)
    filename = self._get_fsname(filename)
    if _debug > 1:
        LOG(_SUBSYS, INFO, "serving %s, %s, %s, %s" % (filename, content_type, icon, preview))
    cant_read_exc = "Can't read: "
    if filename:
        try:
            size = os.stat(filename)[6]
        except:
            raise cant_read_exc, ("%s (%s)" % (self.id, filename))
    else:
        filename = join(package_home(globals()), "icons", "broken.gif")
        try:
            size = os.stat(filename)[6]
        except:
            raise cant_read_exc, ("%s (%s)" % (self.id, filename))
        content_type = "image/gif"
        icon = 1
    if icon == 0 and width is not None and height is not None:
        data = TemporaryFile()  # hold resized image
        try:
            from PIL import Image
            im = Image.open(filename)
            if im.mode != "RGB":
                im = im.convert("RGB")
            filter = Image.BICUBIC
            if hasattr(Image, "ANTIALIAS"):  # PIL 1.1.3
                filter = Image.ANTIALIAS
            im = im.resize((int(width), int(height)), filter)
            im.save(data, "JPEG", quality=85)
        except:
            data = open(filename, "rb")
        else:
            data.seek(0, 2)
            size = data.tell()
            data.seek(0)
            content_type = "image/jpeg"
    else:
        data = open(filename, "rb")

    close_data = 1
    try:
        if REQUEST is not None:
            last_mod = rfc1123_date(self._p_mtime)
            if as_attachment:
                REQUEST.RESPONSE.setHeader(
                    "Content-Disposition",
                    'attachment; filename="%s"' % (self.title_or_id(),))
            REQUEST.RESPONSE.setHeader("Last-Modified", last_mod)
            REQUEST.RESPONSE.setHeader("Content-Type", content_type)
            REQUEST.RESPONSE.setHeader("Content-Length", size)
            self.ZCacheable_set(None)
            # Support Zope 2.7.1 IStreamIterator
            if IStreamIterator is not None:
                close_data = 0
                return stream_iterator(data)
            blocksize = 2 << 16
            while 1:
                buffer = data.read(blocksize)
                REQUEST.RESPONSE.write(buffer)
                if len(buffer) < blocksize:
                    break
            return ""
        else:
            return data.read()
    finally:
        if close_data:
            data.close()
class TestZipSubFile(unittest.TestCase):
    """ Tests ZipSubFile """

    def setUp(self):
        self.zipper = ZipFile(ZIP_TEMP_FILE)
        self.subfile = ZipSubFile(self.zipper, FILE_NAME)
        self.subfile.open()

        # create a file in memory for comparison
        self.compare = TemporaryFile(prefix='oletools-test-ZipSubFile-',
                                     suffix='.bin')
        self.compare.write(FILE_CONTENTS)
        self.compare.seek(0)   # re-position to start

        self.assertEqual(self.subfile.tell(), 0)
        self.assertEqual(self.compare.tell(), 0)
        if DEBUG:
            print('created comparison file {0!r} in memory'
                  .format(self.compare.name))

    def tearDown(self):
        self.compare.close()
        self.subfile.close()
        self.zipper.close()
        if DEBUG:
            print('\nall files closed')

    def test_read(self):
        """ test reading """
        # read from start
        self.assertEqual(self.subfile.read(4), self.compare.read(4))
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # read a bit more
        self.assertEqual(self.subfile.read(4), self.compare.read(4))
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # create difference
        self.subfile.read(1)
        self.assertNotEqual(self.subfile.read(4), self.compare.read(4))
        self.compare.read(1)
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # read all the rest
        self.assertEqual(self.subfile.read(), self.compare.read())
        self.assertEqual(self.subfile.tell(), self.compare.tell())

    def test_seek_forward(self):
        """ test seeking forward """
        self.subfile.seek(10)
        self.compare.seek(10)
        self.assertEqual(self.subfile.read(1), self.compare.read(1))
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # seek 2 forward
        self.subfile.seek(2, os.SEEK_CUR)
        self.compare.seek(2, os.SEEK_CUR)
        self.assertEqual(self.subfile.read(1), self.compare.read(1))
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # seek backward (only implemented case: back to start)
        self.subfile.seek(-self.subfile.tell(), os.SEEK_CUR)
        self.compare.seek(-self.compare.tell(), os.SEEK_CUR)
        self.assertEqual(self.subfile.read(1), self.compare.read(1))
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # seek to end
        self.subfile.seek(0, os.SEEK_END)
        self.compare.seek(0, os.SEEK_END)
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # seek back to start
        self.subfile.seek(0)
        self.compare.seek(0)
        self.assertEqual(self.subfile.tell(), self.compare.tell())
        self.assertEqual(self.subfile.tell(), 0)

    def test_check_size(self):
        """ test usual size check: seek to end, tell, seek to start """
        # seek to end
        self.subfile.seek(0, os.SEEK_END)
        self.assertEqual(self.subfile.tell(), len(FILE_CONTENTS))

        # seek back to start
        self.subfile.seek(0)

        # read first few bytes
        self.assertEqual(self.subfile.read(10), FILE_CONTENTS[:10])

    def test_error_read(self):
        """ test correct behaviour if read beyond end (no exception) """
        self.subfile.seek(0, os.SEEK_END)
        self.compare.seek(0, os.SEEK_END)
        self.assertEqual(self.compare.read(10), self.subfile.read(10))
        self.assertEqual(self.compare.tell(), self.subfile.tell())

        self.subfile.seek(0)
        self.compare.seek(0)
        self.subfile.seek(len(FILE_CONTENTS) - 1)
        self.compare.seek(len(FILE_CONTENTS) - 1)
        self.assertEqual(self.compare.read(10), self.subfile.read(10))
        self.assertEqual(self.compare.tell(), self.subfile.tell())

    def test_error_seek(self):
        """ test correct behaviour if seek beyond end (no exception) """
        self.subfile.seek(len(FILE_CONTENTS) + 10)
        self.compare.seek(len(FILE_CONTENTS) + 10)
def results_page(request, campaign_id=None):
    error_title = None
    error_message = None
    result_filter = None
    if campaign_id is not None:
        campaign = models.campaign.objects.get(id=campaign_id)
    else:
        campaign = None
    if request.method == 'GET' and 'view_output' in request.GET and \
            'view_all' not in request.GET and 'select_box' in request.GET:
        result_ids = map(int, dict(request.GET)['select_box'])
        results = models.result.objects.filter(id__in=result_ids).order_by('-id')
    else:
        if campaign_id is not None:
            campaign_items_ = campaign_items
            output_file = 'campaign-data/{}/gold_{}'.format(
                campaign_id, campaign.output_file)
            if exists(output_file) and guess_type(output_file)[0] is not None:
                output_file = True
            else:
                output_file = False
            results = campaign.result_set.all()
        else:
            campaign_items_ = None
            output_file = True
            results = models.result.objects.all()
        result_filter = filters.result(request.GET, queryset=results)
        if not result_filter.qs.count() and results.count():
            error_title = 'Filter Error'
            error_message = 'Filter did not return any results and was ignored.'
            result_filter = filters.result(None, queryset=results)
        else:
            results = result_filter.qs.order_by('-id')
    if request.method == 'GET' and 'view_output' in request.GET:
        if 'view_dut_output' in request.GET:
            if 'view_download' in request.GET:
                temp_file = TemporaryFile()
                start = perf_counter()
                with open_tar(fileobj=temp_file, mode='w:gz') as archive:
                    for result in results:
                        with BytesIO(result.dut_output.encode('utf-8')) as byte_file:
                            info = TarInfo('{}_dut_output.txt'.format(result.id))
                            info.size = len(result.dut_output)
                            archive.addfile(info, byte_file)
                print('archive created', round(perf_counter() - start, 2), 'seconds')
                response = FileResponse(temp_file, content_type='application/x-compressed')
                response['Content-Disposition'] = 'attachment; filename=dut_outputs.tar.gz'
                response['Content-Length'] = temp_file.tell()
                temp_file.seek(0)
                return response
            else:
                return render(request, 'output.html', {
                    'campaign': campaign,
                    'campaign_items': campaign_items if campaign else None,
                    'navigation_items': navigation_items,
                    'results': results,
                    'type': 'dut_output'})
        elif 'view_aux_output' in request.GET:
            if 'view_download' in request.GET:
                temp_file = TemporaryFile()
                start = perf_counter()
                with open_tar(fileobj=temp_file, mode='w:gz') as archive:
                    for result in results:
                        with BytesIO(result.aux_output.encode('utf-8')) as byte_file:
                            info = TarInfo('{}_aux_output.txt'.format(result.id))
                            info.size = len(result.aux_output)
                            archive.addfile(info, byte_file)
                print('archive created', round(perf_counter() - start, 2), 'seconds')
                response = FileResponse(temp_file, content_type='application/x-compressed')
                response['Content-Disposition'] = 'attachment; filename=aux_outputs.tar.gz'
                response['Content-Length'] = temp_file.tell()
                temp_file.seek(0)
                return response
            else:
                return render(request, 'output.html', {
                    'campaign': campaign,
                    'campaign_items': campaign_items if campaign else None,
                    'navigation_items': navigation_items,
                    'results': results,
                    'type': 'aux_output'})
        elif 'view_debugger_output' in request.GET:
            if 'view_download' in request.GET:
                temp_file = TemporaryFile()
                start = perf_counter()
                with open_tar(fileobj=temp_file, mode='w:gz') as archive:
                    for result in results:
                        with BytesIO(result.debugger_output.encode('utf-8')) as byte_file:
                            info = TarInfo('{}_debugger_output.txt'.format(result.id))
                            info.size = len(result.debugger_output)
                            archive.addfile(info, byte_file)
                print('archive created', round(perf_counter() - start, 2), 'seconds')
                response = FileResponse(temp_file, content_type='application/x-compressed')
                response['Content-Disposition'] = 'attachment; filename=debugger_outputs.tar.gz'
                response['Content-Length'] = temp_file.tell()
                temp_file.seek(0)
                return response
            else:
                return render(request, 'output.html', {
                    'campaign': campaign,
                    'campaign_items': campaign_items if campaign else None,
                    'navigation_items': navigation_items,
                    'results': results,
                    'type': 'debugger_output'})
        elif 'view_output_file' in request.GET:
            result_ids = []
            for result in results:
                if exists('campaign-data/{}/results/{}/{}'.format(
                        result.campaign_id, result.id, result.campaign.output_file)):
                    result_ids.append(result.id)
            results = models.result.objects.filter(id__in=result_ids).order_by('-id')
            if 'view_download' in request.GET:
                temp_file = TemporaryFile()
                start = perf_counter()
                with open_tar(fileobj=temp_file, mode='w:gz') as archive:
                    for result in results:
                        archive.add(
                            'campaign-data/{}/results/{}/{}'.format(
                                result.campaign_id, result.id,
                                result.campaign.output_file),
                            '{}_{}'.format(result.id, result.campaign.output_file))
                print('archive created', round(perf_counter() - start, 2), 'seconds')
                response = FileResponse(temp_file, content_type='application/x-compressed')
                response['Content-Disposition'] = 'attachment; filename=output_files.tar.gz'
                response['Content-Length'] = temp_file.tell()
                temp_file.seek(0)
                return response
            else:
                return render(request, 'output.html', {
                    'campaign': campaign,
                    'campaign_items': campaign_items if campaign else None,
                    'navigation_items': navigation_items,
                    'results': results,
                    'type': 'output_file'})
        elif 'view_log_file' in request.GET:
            if 'view_download' in request.GET:
                temp_file = TemporaryFile()
                start = perf_counter()
                with open_tar(fileobj=temp_file, mode='w:gz') as archive:
                    for result in results:
                        for log_file in result.campaign.log_files:
                            archive.add(
                                'campaign-data/{}/results/{}/{}'.format(
                                    result.campaign_id, result.id, log_file),
                                '{}_{}'.format(result.id, log_file))
                print('archive created', round(perf_counter() - start, 2), 'seconds')
                response = FileResponse(temp_file, content_type='application/x-compressed')
                response['Content-Disposition'] = 'attachment; filename=log_files.tar.gz'
                response['Content-Length'] = temp_file.tell()
                temp_file.seek(0)
                return response
            else:
                return render(request, 'output.html', {
                    'campaign': campaign,
                    'campaign_items': campaign_items if campaign else None,
                    'navigation_items': navigation_items,
                    'results': results,
                    'type': 'log_file'})
    elif request.method == 'POST':
        if 'new_outcome_category' in request.POST:
            results.values('outcome_category').update(
                outcome_category=request.POST['new_outcome_category'])
        elif 'new_outcome' in request.POST:
            results.values('outcome').update(outcome=request.POST['new_outcome'])
        elif 'delete' in request.POST and 'results[]' in request.POST:
            result_ids = [int(result_id)
                          for result_id in dict(request.POST)['results[]']]
            results_to_delete = models.result.objects.filter(id__in=result_ids)
            for result in results_to_delete:
                if exists('campaign-data/{}/results/{}'.format(
                        result.campaign_id, result.id)):
                    rmtree('campaign-data/{}/results/{}'.format(
                        result.campaign_id, result.id))
            results_to_delete.delete()
        elif 'delete_all' in request.POST:
            for result in results:
                if exists('campaign-data/{}/results/{}'.format(
                        result.campaign_id, result.id)):
                    rmtree('campaign-data/{}/results/{}'.format(
                        result.campaign_id, result.id))
            results.delete()
            if campaign_id:
                return redirect('/campaign/{}/results'.format(campaign_id))
            else:
                return redirect('/results')
    result_table = tables.results(results)
    RequestConfig(request, paginate={'per_page': table_length}).configure(result_table)
    return render(request, 'results.html', {
        'campaign': campaign,
        'campaign_items': campaign_items_,
        'error_message': error_message,
        'error_title': error_title,
        'filter': result_filter,
        'filter_tabs': True,
        'navigation_items': navigation_items,
        'output_file': output_file,
        'result_count': '{:,}'.format(results.count()),
        'result_table': result_table})
class ZipNumClusterJob(MRJob):
    HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.CombineTextInputFormat'
    PARTITIONER = 'org.apache.hadoop.mapred.lib.TotalOrderPartitioner'

    INPUT_PROTOCOL = RawValueProtocol
    OUTPUT_PROTOCOL = RawValueProtocol
    INTERNAL_PROTOCOL = RawProtocol

    JOBCONF = {'mapreduce.task.timeout': '9600000',
               'mapreduce.input.fileinputformat.split.maxsize': '50000000',
               'mapreduce.map.speculative': 'false',
               'mapreduce.reduce.speculative': 'false',
               'mapreduce.output.fileoutputformat.compress': 'false',
               'mapreduce.job.reduce.slowstart.completedmaps': '0.8',
               'mapreduce.job.jvm.numtasks': '-1'}

    def configure_options(self):
        """Custom command line options for indexing"""
        super(ZipNumClusterJob, self).configure_options()

        self.add_passthrough_option('--numlines', dest='numlines',
                                    type=int, default=3000,
                                    help='Number of lines per gzipped block')
        self.add_passthrough_option('--splitfile', dest='splitfile',
                                    help='Split file to use for CDX shard split')
        self.add_passthrough_option('--convert', dest='convert',
                                    action='store_true', default=False,
                                    help='Convert CDX through _convert_line() function')
        self.add_passthrough_option('--shards', dest='shards', type=int,
                                    help='Num ZipNum Shards to create, ' +
                                         '= num of entries in splits + 1' +
                                         '= num of reducers used')

    def jobconf(self):
        orig_jobconf = super(ZipNumClusterJob, self).jobconf()
        custom_jobconf = {'mapreduce.job.reduces': self.options.shards,
                          'mapreduce.totalorderpartitioner.path': self.options.splitfile}
        combined = combine_dicts(orig_jobconf, custom_jobconf)
        return combined

    def mapper_init(self):
        pass

    def mapper(self, _, line):
        line = line.split('\t')[-1]
        if not line.startswith(' CDX'):
            if self.options.convert:
                line = self._convert_line(line)
            yield line, ''

    def _convert_line(self, line):
        key, ts, url, length, offset, warc = line.split(' ')
        key = key.replace(')', ',)', 1)
        vals = {'o': offset, 's': length, 'w': warc, 'u': url}
        return key + ' ' + ts + ' ' + json.dumps(vals)

    def _get_prop(self, proplist):
        for p in proplist:
            res = os.environ.get(p)
            if res:
                return res

    def reducer_init(self):
        self.curr_lines = []
        self.curr_key = ''

        self.part_num = self._get_prop(['mapreduce_task_partition',
                                        'mapred_task_partition'])
        assert(self.part_num)
        self.part_name = 'cdx-%05d.gz' % int(self.part_num)

        self.output_dir = self._get_prop(['mapreduce_output_fileoutputformat_outputdir',
                                          'mapred.output.dir',
                                          'mapred_work_output_dir'])
        assert(self.output_dir)

        self.gzip_temp = TemporaryFile(mode='w+b')

    def reducer(self, key, values):
        if key:
            self.curr_lines.append(key)
            for x in values:
                if x:
                    self.curr_lines.append(x)
            if len(self.curr_lines) == 1:
                self.curr_key = ' '.join(key.split(' ', 2)[0:2])
            if len(self.curr_lines) >= self.options.numlines:
                yield '', self._write_part()

    def reducer_final(self):
        if len(self.curr_lines) > 0:
            yield '', self._write_part()
        self._do_upload()

    def _do_upload(self):
        self.gzip_temp.flush()
        #TODO: move to generalized put() function
        if self.output_dir.startswith('s3://'):
            import boto
            conn = boto.connect_s3()
            parts = urlparse.urlsplit(self.output_dir)
            bucket = conn.lookup(parts.netloc)
            cdxkey = bucket.new_key(parts.path + '/' + self.part_name)
            cdxkey.set_contents_from_file(self.gzip_temp, rewind=True)
        else:
            path = os.path.join(self.output_dir, self.part_name)
            self.gzip_temp.seek(0)
            with open(path, 'w+b') as target:
                shutil.copyfileobj(self.gzip_temp, target)
        self.gzip_temp.close()

    def _write_part(self):
        z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
        offset = self.gzip_temp.tell()
        buff = '\n'.join(self.curr_lines) + '\n'
        self.curr_lines = []

        buff = z.compress(buff)
        self.gzip_temp.write(buff)

        buff = z.flush()
        self.gzip_temp.write(buff)
        self.gzip_temp.flush()

        length = self.gzip_temp.tell() - offset

        partline = '{0}\t{1}\t{2}\t{3}'.format(self.curr_key, self.part_name,
                                               offset, length)
        return partline
files = ''
dicts = []
for fil in files:
    dic = json.load(open(fil))
    dicts.append(dic)
    if 'dimensions' in dic:
        dim = dic['dimensions']
        # dim += np.random.randint(0,10000000000000,[len(dim)])  # just to check - should be 6-8x longer than dim
        if len(unique(diff(dim))) > 1:
            print('{fil} has at least {n:,}/{t:,} anomalies'
                  .format(fil=fil, n=len(unique(diff(dim))), t=len(dim)))
        else:
            print('{fil} is clean!!'.format(fil=fil))
        outfile = TemporaryFile()
        savez(outfile, dim=diff(dim))
        print('compressed file length = {l:,}/{raw:,} bytes'
              .format(l=outfile.tell(), raw=8 * len(dim)))

"""
#pyfusion.config.set('global','localdatapath','local_data')
# this request translates to a json file which is stored locally - see below for complete example
# need to disable all networks
xx=dev.acq.getdata([20160302,23],'W7X_L53_LP10_I',no_cache=1)
http://archive-webapi.ipp-hgw.mpg.de/ArchiveDB/codac/W7X/CoDaStationDesc.82/DataModuleDesc.190_DATASTREAM/5/Channel_5/scaled/_signal.json?from=1457536002136103981&upto=1457536069136103980

# complete example - assuming you have a cache under the working directory e.g.
# /home/bdb112/pyfusion/working/pyfusion/archive-webapi.ipp-hgw.mpg.de/ArchiveDB/codac/W7X/CoDaStationDesc.82/DataModuleDesc.190_DATASTREAM/5/Channel_5/scaled/_signal.json?from=1456930821345103981&upto=1456930888345103980
#
import pyfusion
pyfusion.LAST_DNS_TEST=-1
pyfusion.CACHE=1
def parseMultipart(fp, pdict, memfile_max=1024 * 1000, len_max=0):
    """ Parse multipart content """
    # TODO: Do not store whole part contents in memory
    boundary = ''
    if 'boundary' in pdict:
        boundary = pdict['boundary']
    if not isBoundaryValid(boundary):
        raise ValueError('Invalid boundary in multipart form: {0}'.format(boundary))

    maxlen = 0
    nextpart = b'--' + boundary.encode()
    lastpart = b'--' + boundary.encode() + b'--'
    partdict = {}
    terminator = b''

    while terminator != lastpart:
        nbytes = -1
        data = None
        if terminator:
            # At start of next part. Read headers first.
            headers = parse_headers(fp, memfile_max)
            clength = headers.get('content-length')
            if clength is not None:
                try:
                    nbytes = int(clength)
                except ValueError:
                    pass
            if nbytes > 0:
                if maxlen and nbytes > len_max:
                    raise ValueError('Maximum content length exceeded')
                data = fp.read(nbytes)
            else:
                data = b''
        # Read lines until end of part.
        part_fp = TemporaryFile(mode='w+b')
        while 1:
            line = fp.readline(memfile_max)
            if line == b'':
                terminator = lastpart  # End outer loop
                break
            if _is_termline(line, nextpart):
                terminator = nextpart
                break
            if _is_termline(line, lastpart):
                terminator = lastpart
                break
            part_fp.write(line)
            while not line.endswith(b"\n"):
                line = fp.readline(memfile_max)
                if line == b'':
                    break
                part_fp.write(line)
        # Done with part.
        if data is None:
            continue
        if nbytes < 0:
            last = pre_last = None
            # Strip final line terminator
            if part_fp.tell() >= 1:
                part_fp.seek(-1, os.SEEK_END)
                last = part_fp.read(1)
            if part_fp.tell() >= 2:
                part_fp.seek(-2, os.SEEK_END)
                pre_last = part_fp.read(1)
            trunc = 0
            if pre_last == b"\r" and last == b"\n":
                trunc = 2
            elif last == b"\n":
                trunc = 1  # was misspelled "trunk", so bare-LF terminators were never stripped
            if trunc > 0:
                part_fp.seek(-trunc, os.SEEK_END)
                part_fp.truncate()
        line = headers['content-disposition']
        if not line:
            continue
        key, params = parse_header(line)
        if key != 'form-data':
            continue
        if 'name' in params:
            name = params['name']
        else:
            continue
        part_fp.seek(0, os.SEEK_SET)
        part = {'fp': part_fp}
        if 'filename' in params:
            part['filename'] = params['filename']
        if name in partdict:
            partdict[name].append(part)
        else:
            partdict[name] = [part]

    return partdict
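# A hypothetical call site for parseMultipart() above, assuming the request
# body is available as a binary stream and the boundary was taken from the
# Content-Type header. The header value and file name below are made up, and
# isBoundaryValid/_is_termline/parse_headers come from the same module.
import cgi

ctype = 'multipart/form-data; boundary=----FormBoundaryABC123'
_, pdict = cgi.parse_header(ctype)
with open('request_body.bin', 'rb') as body:
    parts = parseMultipart(body, pdict)
for name, items in parts.items():
    for item in items:
        print(name, item.get('filename', ''), len(item['fp'].read()))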
class Agent:
    """
    Each agent object contains its own sampled log data. The Agent class is
    responsible for collecting and storing data. machine_id is used to
    identify the agent.

    machine_id is supplied by the client class. This allows for multiple
    agents if desired.
    """

    def __init__(self, arguments, machine_id):
        self.arguments = arguments
        self.my_uuid = machine_id
        self.track_process = ''

        # This log object is for stdout purposes
        self.log = TemporaryFile()
        self.log_position = 0

        # Discover if --recover is being used. If so, we need to obtain the
        # timestamp of the last entry in the outfile log... a little bulky
        # to do... and not a very good place to do it.
        if self.arguments.recover:
            if os.path.exists(self.arguments.outfile[-1]):
                memory_list = []
                history_file = open(self.arguments.outfile[-1], 'r')
                reader = csv.reader(history_file, delimiter=',', quotechar='|',
                                    escapechar='\\', quoting=csv.QUOTE_MINIMAL)
                # Get last item in list. Unfortunately, no way to do this until
                # we have read the entire file...? Lucky for us, most memory log
                # files are in the single digit megabytes
                for row in reader:
                    memory_list.append(row)
                history_file.close()
                last_entry = float(memory_list[-1][0]) + self.arguments.repeat_rate[-1]
                self.delta = (GetTime().now - last_entry)
            else:
                print 'Recovery options detected, but I could not find your previous memory log file.'
                sys.exit(1)
        else:
            self.delta = 0

        # Create the dictionary to which all sampled data will be stored
        # NOTE: REQUEST dictionary items are instructions (arguments) we will
        # ask the server to provide (if we are running with --pbs)
        # Simply add them here. We _can not_ make the arguments match the
        # server exactly, this would cause every agent launched to perform
        # like a server... bad stuff
        # Example: We added repeat_rate (see dictionary below). Now every
        # agent would update their repeat_rate according to what the user
        # supplied as an argument (--repeat_rate 0.02)
        self.agent_data = {self.my_uuid:
                           {'HOSTNAME': socket.gethostname(),
                            'STDOUT': '',
                            'STACK': '',
                            'MEMORY': 0,
                            'TIMESTAMP': GetTime().now - self.delta,
                            'REQUEST': {'run': '',
                                        'pstack': '',
                                        'repeat_rate': '',
                                        'cwd': ''},
                            'STOP': False,
                            'TOTAL': 0,
                            'DEBUG_LOG': ''}}

    # NOTE: This is the only function that should be called in this class
    def takeSample(self):
        if self.arguments.pstack:
            self.agent_data[self.my_uuid]['STACK'] = self._getStack()

        # Always do the following
        self.agent_data[self.my_uuid]['MEMORY'] = self._getMemory()
        self.agent_data[self.my_uuid]['STDOUT'] = self._getStdout()
        if self.arguments.recover:
            self.agent_data[self.my_uuid]['TIMESTAMP'] = GetTime().now - self.delta
        else:
            self.agent_data[self.my_uuid]['TIMESTAMP'] = GetTime().now

        # Return the data to whomever asked for it
        return self.agent_data

    def _getStdout(self):
        self.log.seek(self.log_position)
        output = self.log.read()
        self.log_position = self.log.tell()
        sys.stdout.write(output)
        return output

    def _getMemory(self):
        tmp_pids = self._getPIDs()
        memory_usage = 0
        if tmp_pids != {}:
            for single_pid in tmp_pids.iteritems():
                memory_usage += int(single_pid[1][0])
            if memory_usage == 0:
                # Memory usage hit zero? Then assume the binary being tracked has exited.
                # So lets begin doing the same.
                self.agent_data[self.my_uuid]['DEBUG_LOG'] = 'I found the total memory usage of all my processes hit 0. Stopping'
                self.agent_data[self.my_uuid]['STOP'] = True
                return 0
            return int(memory_usage)
        # No binary even detected? Lets assume it exited, so we should begin doing the same.
        self.agent_data[self.my_uuid]['STOP'] = True
        self.agent_data[self.my_uuid]['DEBUG_LOG'] = 'I found no processes running. Stopping'
        return 0

    def _getStack(self):
        if self._darwin() == True:
            stack_trace = LLDB()
        else:
            stack_trace = GDB()
        tmp_pids = self._getPIDs()
        if tmp_pids != {}:
            last_pid = sorted([x for x in tmp_pids.keys()])[-1]
            return stack_trace.getStackTrace(str(last_pid))
        else:
            return ''

    def _getPIDs(self):
        pid_list = {}

        # Determine the binary to sample and store it. Doing the findCommand is a little expensive.
        if self.track_process == '':
            self.track_process = self._findCommand(''.join(self.arguments.run))

        # A quick way to safely check for the availability of needed tools
        self._verifyCommand(['ps'])

        # If we are tracking a binary
        if self.arguments.run:
            command = [which('ps'), '-e', '-o', 'pid,rss,user,args']
            tmp_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            all_pids = tmp_proc.communicate()[0].split('\n')
            # Figure out what we are allowed to track (strip away mpiexec, processes not owned by us, etc)
            for single_pid in all_pids:
                if single_pid.find(self.track_process) != -1 and \
                   single_pid.find(__file__) == -1 and \
                   single_pid.find('mpirun') == -1 and \
                   single_pid.find(os.getenv('USER')) != -1 and \
                   single_pid.find('mpiexec') == -1:
                    pid_list[int(single_pid.split()[0])] = []
                    pid_list[int(single_pid.split()[0])].extend([single_pid.split()[1],
                                                                 single_pid.split()[3]])
        return pid_list

    def _verifyCommand(self, command_list):
        for command in command_list:
            if which(command) == None:
                print 'Command not found:', command
                sys.exit(1)

    # determine if we are running on a darwin kernel
    def _darwin(self):
        if platform.platform(0, 1).split('-')[:-1][0].find('Darwin') != -1:
            return True

    # Determine the command we are going to track
    # A few things are happening here; first we strip off any MPI commands
    # we then loop through the remaining items until we find a matching path
    # exp: mpiexec -n 12 ../../../moose_test-opt -i simple_diffusion.i -r 6
    # would first strip off mpiexec, check for the presence of -n in our
    # current directory, then 12, then ../../../moose_test-opt <- found. It would
    # stop and return the base name (moose_test-opt).
    def _findCommand(self, command):
        if command.find('mpiexec') == 0 or command.find('mpirun') == 0:
            for binary in command.split():
                if os.path.exists(binary):
                    return os.path.split(binary)[1]
        elif os.path.exists(command.split()[0]):
            return os.path.split(command.split()[0])[1]
def transfer_yaml():
    print(" * Transferring yml")
    upload_folder = os.path.join(app.root_path, app.config['UPLOAD_FOLDER'])
    if request.method == 'GET':
        tarfile_backend = TemporaryFile(mode='wb+')
        yamlfile = TemporaryFile(mode='wb+')
        tarball = tarfile.open(fileobj=tarfile_backend, mode='w')

        yamlfile.write(bytes(
            export_challenges('export.yaml', 'export.d', upload_folder, tarball, False),
            "UTF-8"))
        tarinfo = tarfile.TarInfo('export.yaml')
        tarinfo.size = yamlfile.tell()
        yamlfile.seek(0)
        tarball.addfile(tarinfo, yamlfile)
        tarball.close()
        yamlfile.close()

        gzipfile_backend = TemporaryFile(mode='wb+')
        gzipfile = GzipFile(fileobj=gzipfile_backend, mode='wb')
        tarfile_backend.seek(0)
        shutil.copyfileobj(tarfile_backend, gzipfile)
        tarfile_backend.close()
        gzipfile.close()
        gzipfile_backend.seek(0)

        return send_file(gzipfile_backend, as_attachment=True,
                         attachment_filename='export.tar.gz')
    if request.method == 'POST':
        if 'file' not in request.files:
            abort(400)
        file = request.files['file']

        readmode = 'r:gz'
        if file.filename.endswith('.tar'):
            readmode = 'r'
        if file.filename.endswith('.bz2'):
            readmode = 'r:bz2'

        tempdir = mkdtemp()
        try:
            archive = tarfile.open(fileobj=file.stream, mode=readmode)
            archive.extractall(path=tempdir)
            events = import_challenges(tempdir, upload_folder, move=True)
        except tarfile.TarError:
            print('b')
            abort(400)
        finally:
            shutil.rmtree(tempdir)

        return json.dumps({'log': [{'msg': e.msg, 'type': e.type} for e in events]})
with app.app_context():
    args = process_args(args)
    from CTFd.models import db
    app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
    db.init_app(app)
    app.db = db

    out_stream.write(export_challenges(args.out_file, args.dst_attachments,
                                       args.src_attachments, args.visible_only,
                                       args.remove_flags, tarfile))

    if args.tar:
        print("Tarballing exported files")
        tarinfo = TarInfo(args.out_file)
        tarinfo.size = out_stream.tell()
        out_stream.seek(0)
        tarfile.addfile(tarinfo, out_stream)
        tarfile.close()

    if args.gz:
        print("Compressing tarball with gzip")
        with gzip.open('export.tar.gz', 'wb') as gz:
            tempfile.seek(0)
            shutil.copyfileobj(tempfile, gz)

    out_stream.close()
class TestZipSubFile(unittest.TestCase):
    """ Tests ZipSubFile """

    def setUp(self):
        self.zipper = ZipFile(ZIP_TEMP_FILE)
        self.subfile = ZipSubFile(self.zipper, FILE_NAME)
        self.subfile.open()

        # create a temporary file with the same contents for comparison
        self.compare = TemporaryFile(prefix='oletools-test-ZipSubFile-',
                                     suffix='.bin')
        self.compare.write(FILE_CONTENTS)
        self.compare.seek(0)   # re-position to start
        self.assertEqual(self.subfile.tell(), 0)
        self.assertEqual(self.compare.tell(), 0)
        if DEBUG:
            print('created comparison file {0!r}'.format(self.compare.name))

    def tearDown(self):
        self.compare.close()
        self.subfile.close()
        self.zipper.close()
        if DEBUG:
            print('\nall files closed')

    def test_read(self):
        """ test reading """
        # read from start
        self.assertEqual(self.subfile.read(4), self.compare.read(4))
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # read a bit more
        self.assertEqual(self.subfile.read(4), self.compare.read(4))
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # create difference
        self.subfile.read(1)
        self.assertNotEqual(self.subfile.read(4), self.compare.read(4))
        self.compare.read(1)
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # read all the rest
        self.assertEqual(self.subfile.read(), self.compare.read())
        self.assertEqual(self.subfile.tell(), self.compare.tell())

    def test_seek_forward(self):
        """ test seeking forward """
        self.subfile.seek(10)
        self.compare.seek(10)
        self.assertEqual(self.subfile.read(1), self.compare.read(1))
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # seek 2 forward
        self.subfile.seek(2, os.SEEK_CUR)
        self.compare.seek(2, os.SEEK_CUR)
        self.assertEqual(self.subfile.read(1), self.compare.read(1))
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # seek backward (only implemented case: back to start)
        self.subfile.seek(-self.subfile.tell(), os.SEEK_CUR)
        self.compare.seek(-self.compare.tell(), os.SEEK_CUR)
        self.assertEqual(self.subfile.read(1), self.compare.read(1))
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # seek to end
        self.subfile.seek(0, os.SEEK_END)
        self.compare.seek(0, os.SEEK_END)
        self.assertEqual(self.subfile.tell(), self.compare.tell())

        # seek back to start
        self.subfile.seek(0)
        self.compare.seek(0)
        self.assertEqual(self.subfile.tell(), self.compare.tell())
        self.assertEqual(self.subfile.tell(), 0)

    def test_check_size(self):
        """ test usual size check: seek to end, tell, seek to start """
        # seek to end
        self.subfile.seek(0, os.SEEK_END)
        self.assertEqual(self.subfile.tell(), len(FILE_CONTENTS))

        # seek back to start
        self.subfile.seek(0)

        # read first few bytes
        self.assertEqual(self.subfile.read(10), FILE_CONTENTS[:10])

    def test_error_read(self):
        """ test correct behaviour if read beyond end (no exception) """
        self.subfile.seek(0, os.SEEK_END)
        self.compare.seek(0, os.SEEK_END)
        self.assertEqual(self.compare.read(10), self.subfile.read(10))
        self.assertEqual(self.compare.tell(), self.subfile.tell())

        self.subfile.seek(0)
        self.compare.seek(0)
        self.subfile.seek(len(FILE_CONTENTS) - 1)
        self.compare.seek(len(FILE_CONTENTS) - 1)
        self.assertEqual(self.compare.read(10), self.subfile.read(10))
        self.assertEqual(self.compare.tell(), self.subfile.tell())

    def test_error_seek(self):
        """ test correct behaviour if seek beyond end (no exception) """
        self.subfile.seek(len(FILE_CONTENTS) + 10)
        self.compare.seek(len(FILE_CONTENTS) + 10)
class S3File(io.IOBase): """File like proxy for s3 files, manages upload and download of locally managed temporary file """ def __init__(self, bucket, key, mode='w+b', *args, **kwargs): super(S3File, self).__init__(*args, **kwargs) self.bucket = bucket self.key = key self.mode = mode self.path = self.bucket + '/' + self.key # converts mode to readable/writable to enable the temporary file to have S3 data # read or written to it even if the S3File is read/write/append # i.e. "r" => "r+", "ab" => "a+b" updatable_mode = re.sub(r'^([rwa]+)(b?)$', r'\1+\2', mode) self._tempfile = TemporaryFile(updatable_mode) try: with s3errors(self.path): if 'a' in mode: # File is in an appending mode, start with the content in file s3.Object(bucket, key).download_fileobj(self._tempfile) self.seek(0, os.SEEK_END) elif 'a' not in mode and 'w' not in mode and 'x' not in mode: # file is not in a create mode, so it is in read mode # start with the content in the file, and seek to the beginning s3.Object(bucket, key).download_fileobj(self._tempfile) self.seek(0, os.SEEK_SET) except Exception: self.close() raise def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() def close(self): try: if self.writable(): self.seek(0) with s3errors(self.path): s3.Object(self.bucket, self.key).upload_fileobj(self._tempfile) finally: self._tempfile.close() @property def closed(self): return self._tempfile.closed def fileno(self): return self._tempfile.fileno() def flush(self): return self._tempfile.flush() def isatty(self): return self._tempfile.isatty() def readable(self): return 'r' in self.mode or '+' in self.mode def read(self, n=-1): if not self.readable(): raise IOError('not open for reading') return self._tempfile.read(n) def readinto(self, b): return self._tempfile.readinto(b) def readline(self, limit=-1): if not self.readable(): raise IOError('not open for reading') return self._tempfile.readline(limit) def readlines(self, hint=-1): if not self.readable(): raise IOError('not open for reading') return self._tempfile.readlines(hint) def seek(self, offset, whence=os.SEEK_SET): self._tempfile.seek(offset, whence) return self.tell() def seekable(self): return True def tell(self): return self._tempfile.tell() def writable(self): return 'w' in self.mode or 'a' in self.mode or '+' in self.mode or 'x' in self.mode def write(self, b): if not self.writable(): raise IOError('not open for writing') self._tempfile.write(b) return len(b) def writelines(self, lines): if not self.writable(): raise IOError('not open for writing') return self._tempfile.writelines(lines) def truncate(self, size=None): if not self.writable(): raise IOError('not open for writing') if size is None: size = self.tell() self._tempfile.truncate(size) return size
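# A usage sketch for S3File, assuming boto3 credentials are configured and
# that the module-level `s3` resource and `s3errors` context manager the
# class depends on are in scope; bucket and key names are placeholders.
with S3File('my-bucket', 'path/to/object.bin', mode='w+b') as f:
    f.write(b'hello')   # buffered locally in the TemporaryFile
    f.seek(0)
    assert f.read() == b'hello'
# leaving the block calls close(), which uploads the temporary file
# to s3://my-bucket/path/to/object.bin in a single upload_fileobj call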
def transfer_yaml():
    upload_folder = os.path.join(app.root_path, app.config['UPLOAD_FOLDER'])
    if request.method == 'GET':
        tarfile_backend = TemporaryFile(mode='wb+')
        yamlfile = TemporaryFile(mode='wb+')
        tarball = tarfile.open(fileobj=tarfile_backend, mode='w')

        yamlfile.write(export_challenges('export.yaml', 'export.d',
                                         upload_folder, tarball))
        tarinfo = tarfile.TarInfo('export.yaml')
        tarinfo.size = yamlfile.tell()
        yamlfile.seek(0)
        tarball.addfile(tarinfo, yamlfile)
        tarball.close()
        yamlfile.close()

        gzipfile_backend = TemporaryFile(mode='wb+')
        # pass the mode explicitly: the temp file reports mode 'rb+',
        # which GzipFile would otherwise treat as read mode
        gzipfile = GzipFile(fileobj=gzipfile_backend, mode='wb')
        tarfile_backend.seek(0)
        shutil.copyfileobj(tarfile_backend, gzipfile)
        tarfile_backend.close()
        gzipfile.close()

        gzipfile_backend.seek(0)
        return send_file(gzipfile_backend, as_attachment=True,
                         attachment_filename='export.tar.gz')

    if request.method == 'POST':
        if 'file' not in request.files:
            abort(400)
        file = request.files['file']

        readmode = 'r:gz'
        if file.filename.endswith('.tar'):
            readmode = 'r'
        if file.filename.endswith('.bz2'):
            readmode = 'r:bz2'

        tempdir = mkdtemp()
        try:
            archive = tarfile.open(fileobj=file.stream, mode=readmode)
            if 'export.yaml' not in archive.getnames():
                shutil.rmtree(tempdir)
                abort(400)

            # Check for attempts to escape to higher dirs
            for member in archive.getmembers():
                memberpath = os.path.normpath(member.name)
                if memberpath.startswith('/') or '..' in memberpath.split('/'):
                    shutil.rmtree(tempdir)
                    abort(400)
                if member.linkname:
                    linkpath = os.path.normpath(member.linkname)
                    if linkpath.startswith('/') or '..' in linkpath.split('/'):
                        shutil.rmtree(tempdir)
                        abort(400)

            archive.extractall(path=tempdir)
        except tarfile.TarError:
            shutil.rmtree(tempdir)
            abort(400)

        in_file = os.path.join(tempdir, 'export.yaml')
        import_challenges(in_file, upload_folder, move=True)
        shutil.rmtree(tempdir)

        return '1'
class ExpandableOutput(Reader, Writer, Seeker):
    """
    Output object that stores data in a BytesIO until more than ``bufsize``
    bytes are written, at which point it switches to storing data in a real
    file object.
    """

    def __init__(self, bufsize=16384):
        """
        Initialize an ``ExpandableOutput`` instance.
        """
        self._raw = BytesIO()
        self.bufsize = bufsize
        self.write = self.write_stringio
        self.exceeded_bufsize = False

    def getstorage(self):
        """
        Return the underlying stream (either a BytesIO or file object)
        """
        return self._raw

    def seek(self, pos, whence=0):
        return self._raw.seek(pos, whence)

    def tell(self):
        return self._raw.tell()

    def read(self, size=-1):
        return self._raw.read(size)

    def readline(self, size=-1):
        return self._raw.readline(size)

    def write_stringio(self, data):
        """
        ``write``, optimized for the BytesIO backend.
        """
        if isinstance(self._raw, BytesIO) \
                and self._raw.tell() + len(data) > self.bufsize:
            self.switch_to_file_storage()
            return self.write_file(data)
        return self._raw.write(data)

    def write_file(self, data):
        """
        ``write``, optimized for the TemporaryFile backend
        """
        return self._raw.write(data)

    def switch_to_file_storage(self):
        """
        Switch the storage backend to an instance of ``TemporaryFile``.
        """
        self.exceeded_bufsize = True
        oldio = self._raw
        try:
            self._raw.seek(0)
            self._raw = TemporaryFile()
            copyfileobj(oldio, self._raw)
        finally:
            oldio.close()

        self.write = self.write_file

    def __enter__(self):
        """
        Support for context manager ``__enter__``/``__exit__`` blocks
        """
        return self

    def __exit__(self, type, value, traceback):
        """
        Support for context manager ``__enter__``/``__exit__`` blocks
        """
        self._raw.close()
        # propagate exceptions
        return False
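# A small demonstration of the spill-to-disk behaviour: with a 4-byte buffer
# the second write exceeds bufsize, so storage silently switches from BytesIO
# to TemporaryFile while preserving the already-written bytes.
with ExpandableOutput(bufsize=4) as out:
    out.write(b'ab')     # still backed by BytesIO
    out.write(b'cdef')   # exceeds bufsize: data moves to a TemporaryFile
    assert out.exceeded_bufsize
    out.seek(0)
    assert out.read() == b'abcdef'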
def result_page(request, result_id): result = models.result.objects.get(id=result_id) if request.method == 'GET': if 'get_dut_output' in request.GET: response = HttpResponse(result.dut_output, content_type='text/plain') response['Content-Disposition'] = \ 'attachment; filename="{}_dut_output.txt"'.format( result_id) return response elif 'get_debugger_output' in request.GET: response = HttpResponse(result.debugger_output, content_type='text/plain') response['Content-Disposition'] = \ 'attachment; filename="{}_debugger_output.txt"'.format( result_id) return response elif 'get_aux_output' in request.GET: response = HttpResponse(result.aux_output, content_type='text/plain') response['Content-Disposition'] = \ 'attachment; filename="{}_aux_output.txt"'.format( result_id) return response elif 'get_output_file' in request.GET: response = get_file(result.campaign.output_file, result_id) response['Content-Disposition'] = \ 'attachment; filename={}_{}'.format( result_id, result.campaign.output_file) return response elif 'get_log_file' in request.GET: temp_file = TemporaryFile() with open_tar(fileobj=temp_file, mode='w:gz') as archive: for log_file in result.campaign.log_files: archive.add( 'campaign-data/{}/results/{}/{}'.format( result.campaign_id, result.id, log_file), '{}_{}'.format(result.id, log_file)) response = FileResponse(temp_file, content_type='application/x-compressed') response['Content-Disposition'] = \ 'attachment; filename={}_log_files.tar.gz'.format(result.id) response['Content-Length'] = temp_file.tell() temp_file.seek(0) return response campaign_items_ = [(item[0], '/campaign/{}/{}'.format(result.campaign_id, item[1]), item[2], item[3]) for item in campaign_items] if result.campaign.output_file: output_file = 'campaign-data/{}/results/{}/{}'.format( result.campaign_id, result_id, result.campaign.output_file) output_file = \ exists(output_file) and guess_type(output_file)[0] is not None else: output_file = False result_table = tables.result(models.result.objects.filter(id=result_id)) events = result.event_set.all() event_table = tables.event(events) if request.method == 'POST' and 'launch' in request.POST: Popen([ argv[0], '--campaign_id', str(result.campaign_id), 'regenerate', result_id ]) if request.method == 'POST' and 'save' in request.POST: result.outcome = request.POST['outcome'] result.outcome_category = request.POST['outcome_category'] result.save() elif request.method == 'POST' and 'delete' in request.POST: if exists('campaign-data/{}/results/{}'.format(result.campaign_id, result.id)): rmtree('campaign-data/{}/results/{}'.format( result.campaign_id, result.id)) result.delete() return HttpResponse('Result deleted') injections = result.injection_set.all() if result.campaign.simics: if injections.count(): injection_table = tables.injection(injections) else: injection_table = None register_diffs = result.simics_register_diff_set.all() register_filter = filters.simics_register_diff(request.GET, queryset=register_diffs) register_diff_count = register_filter.qs.count() register_table = tables.simics_register_diff(register_filter.qs) RequestConfig(request, paginate={ 'per_page': table_length }).configure(register_table) memory_diffs = result.simics_memory_diff_set.all() memory_diff_count = memory_diffs.count() memory_table = tables.simics_memory_diff(memory_diffs) RequestConfig(request, paginate={ 'per_page': table_length }).configure(memory_table) else: register_filter = None memory_diff_count = 0 memory_table = None register_diff_count = 0 register_table = None if 
injections.count(): injection_table = tables.injection(injections) else: injection_table = None RequestConfig(request, paginate=False).configure(result_table) RequestConfig(request, paginate=False).configure(event_table) if injection_table: RequestConfig(request, paginate=False).configure(injection_table) return render( request, 'result.html', { 'campaign_items': campaign_items_, 'event_count': '{:,}'.format(events.count()), 'event_table': event_table, 'filter': register_filter, 'injection_table': injection_table, 'memory_diff_count': '{:,}'.format(memory_diff_count), 'memory_table': memory_table, 'navigation_items': navigation_items, 'output_file': output_file, 'register_diff_count': '{:,}'.format(register_diff_count), 'register_table': register_table, 'result': result, 'result_table': result_table })
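# The log-file download above uses a pattern worth isolating: build the
# archive in a TemporaryFile, take tell() as Content-Length, then rewind
# before handing the file to the response. A minimal sketch assuming Django's
# FileResponse; tar_response is a hypothetical helper.
from tempfile import TemporaryFile
from tarfile import open as open_tar
from django.http import FileResponse

def tar_response(paths):
    temp_file = TemporaryFile()
    with open_tar(fileobj=temp_file, mode='w:gz') as archive:
        for path in paths:
            archive.add(path)
    response = FileResponse(temp_file,
                            content_type='application/x-compressed')
    response['Content-Length'] = temp_file.tell()  # size of finished archive
    temp_file.seek(0)   # rewind so the response streams from the start
    return response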
class ZipNumClusterJob(MRJob): HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.CombineTextInputFormat' PARTITIONER = 'org.apache.hadoop.mapred.lib.TotalOrderPartitioner' INPUT_PROTOCOL = RawValueProtocol OUTPUT_PROTOCOL = RawValueProtocol INTERNAL_PROTOCOL = RawProtocol JOBCONF = {'mapreduce.task.timeout': '9600000', 'mapreduce.input.fileinputformat.split.maxsize': '50000000', 'mapreduce.map.speculative': 'false', 'mapreduce.reduce.speculative': 'false', 'mapreduce.output.fileoutputformat.compress': 'false', 'mapreduce.job.reduce.slowstart.completedmaps': '1.0', 'mapreduce.job.jvm.numtasks': '-1' } def configure_args(self): """Custom command line options for indexing""" super(ZipNumClusterJob, self).configure_args() self.add_passthru_arg('--numlines', dest='numlines', type=int, default=3000, help='Number of lines per gzipped block') self.add_passthru_arg('--splitfile', dest='splitfile', help='Split file to use for CDX shard split') self.add_passthru_arg('--convert', dest='convert', action='store_true', default=False, help='Convert CDX through _convert_line() function') self.add_passthru_arg('--shards', dest='shards', type=int, help='Num ZipNum Shards to create, ' + '= num of entries in splits + 1' + '= num of reducers used') self.add_passthru_arg('--s3-upload-acl', dest='s3acl', help='S3 access permissions (ACL) to be applied to CDX files') def jobconf(self): orig_jobconf = super(ZipNumClusterJob, self).jobconf() custom_jobconf = {'mapreduce.job.reduces': self.options.shards, 'mapreduce.totalorderpartitioner.path': self.options.splitfile} combined = combine_dicts(orig_jobconf, custom_jobconf) return combined def mapper_init(self): pass def mapper(self, _, line): line = line.split('\t')[-1] if not line.startswith(' CDX'): if self.options.convert: line = self._convert_line(line) yield line, '' def _convert_line(self, line): key, ts, url, length, offset, warc = line.split(' ') key = key.replace(')', ',)', 1) vals = {'o': offset, 's': length, 'w': warc, 'u': url} return key + ' ' + ts + ' ' + json.dumps(vals) def _get_prop(self, proplist): for p in proplist: res = os.environ.get(p) if res: return res def reducer_init(self): self.curr_lines = [] self.curr_key = '' self.part_num = self._get_prop(['mapreduce_task_partition', 'mapred_task_partition']) assert(self.part_num) self.part_name = 'cdx-%05d.gz' % int(self.part_num) self.output_dir = self._get_prop(['mapreduce_output_fileoutputformat_outputdir', 'mapred.output.dir', 'mapred_work_output_dir']) assert(self.output_dir) self.gzip_temp = TemporaryFile(mode='w+b') def reducer(self, key, values): if key: self.curr_lines.append(key) for x in values: if x: self.curr_lines.append(x) if len(self.curr_lines) == 1: self.curr_key = ' '.join(key.split(' ', 2)[0:2]) if len(self.curr_lines) >= self.options.numlines: yield '', self._write_part() def reducer_final(self): if len(self.curr_lines) > 0: yield '', self._write_part() self._do_upload() def _do_upload(self): self.gzip_temp.flush() self.gzip_temp.seek(0) #TODO: move to generalized put() function if self.output_dir.startswith('s3://') or self.output_dir.startswith('s3a://'): import boto3 import botocore boto_config = botocore.client.Config( read_timeout=180, retries={'max_attempts' : 20}) s3client = boto3.client('s3', config=boto_config) s3args = None if self.options.s3acl: s3args = {'ACL': self.options.s3acl} parts = urlparse.urlsplit(self.output_dir) s3key = parts.path.strip('/') + '/' + self.part_name s3url = parts.scheme + '://' + parts.netloc + '/' + s3key LOG.info('Uploading index to ' + 
s3url)
            try:
                s3client.upload_fileobj(self.gzip_temp,
                                        parts.netloc,
                                        s3key,
                                        ExtraArgs=s3args)
            except botocore.exceptions.ClientError as exception:
                LOG.error('Failed to upload {}: {}'.format(s3url, exception))
                return

            LOG.info('Successfully uploaded index file: ' + s3url)
        else:
            path = os.path.join(self.output_dir, self.part_name)
            with open(path, 'w+b') as target:
                shutil.copyfileobj(self.gzip_temp, target)

        self.gzip_temp.close()

    def _write_part(self):
        z = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)

        offset = self.gzip_temp.tell()

        buff = '\n'.join(self.curr_lines) + '\n'
        self.curr_lines = []

        # zlib compresses bytes, not str, so encode before writing
        self.gzip_temp.write(z.compress(buff.encode('utf-8')))
        self.gzip_temp.write(z.flush())
        self.gzip_temp.flush()

        length = self.gzip_temp.tell() - offset

        partline = '{0}\t{1}\t{2}\t{3}'.format(self.curr_key, self.part_name,
                                               offset, length)
        return partline
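# Each block that _write_part() appends is an independent gzip member, so a
# reader can serve one block with a single ranged read: seek to the recorded
# offset, read `length` bytes, and decompress. A sketch for a local part file;
# read_block is a hypothetical helper name.
import zlib

def read_block(part_path, offset, length):
    with open(part_path, 'rb') as f:
        f.seek(offset)
        compressed = f.read(length)
    # MAX_WBITS + 16 tells zlib to expect a gzip wrapper, matching the
    # compressobj() arguments used in _write_part()
    return zlib.decompress(compressed, zlib.MAX_WBITS + 16).decode('utf-8')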