def read_lines(self, key):
    """
    Return the content of the S3 object at `key` as an iterable of lines.

    :param key: S3 key to read (looked up via self.get_meta)
    :return: a list of str for small plaintext objects, otherwise a
             LazyLines stream (gzip objects are always streamed)
    :raises: via Log.error when the key does not exist
    """
    source = self.get_meta(key)
    if source is None:
        Log.error("{{key}} does not exist", key=key)

    # gzip content is decompressed as a stream regardless of object size;
    # this single branch replaces the duplicated gz-handling in both the
    # small- and large-object paths
    if source.key.endswith(".gz"):
        return LazyLines(ibytes2ilines(scompressed2ibytes(source)))

    if source.size < MAX_STRING_SIZE:
        # small enough to pull into memory in one read
        return source.read().decode('utf8').split("\n")

    # large plaintext object: stream lazily
    return LazyLines(source)
def write_lines(self, key, lines):
    """
    Gzip `lines` and upload them to S3 as <key>.json.gz.

    :param key: destination key (validated by self._verify_key_format);
                ".json.gz" is appended
    :param lines: iterable of str, or nested iterables of str (see is_many)
    :raises: via Log.error on repeated upload failure, fatal S3 errors,
             or (when VERIFY_UPLOAD) failed round-trip verification
    """
    self._verify_key_format(key)
    storage = self.bucket.new_key(str(key + ".json.gz"))

    if VERIFY_UPLOAD:
        # materialize so we can compare against what was written later
        lines = list(lines)

    with mo_files.TempFile() as tempfile:
        with open(tempfile.abspath, "wb") as buff:
            # BUGFIX: template was a garbled "{(unknown)}" placeholder;
            # moustache style with the `filename` kwarg is intended
            DEBUG and Log.note("Temp file {{filename}}", filename=tempfile.abspath)
            archive = gzip.GzipFile(
                filename=str(key + ".json"), fileobj=buff, mode="w"
            )
            count = 0
            for l in lines:
                if is_many(l):
                    # flatten one level of nesting
                    for ll in l:
                        archive.write(ll.encode("utf8"))
                        archive.write(b"\n")
                        count += 1
                else:
                    archive.write(l.encode("utf8"))
                    archive.write(b"\n")
                    count += 1
            archive.close()

        # upload with a small retry budget; some errors are not retryable
        retry = 3
        while retry:
            try:
                with Timer(
                    "Sending {{count}} lines in {{file_length|comma}} bytes for {{key}}",
                    {
                        "key": key,
                        "file_length": tempfile.length,
                        "count": count
                    },
                    verbose=self.settings.debug,
                ):
                    storage.set_contents_from_filename(
                        tempfile.abspath,
                        headers={"Content-Type": mimetype.GZIP}
                    )
                break
            except Exception as e:
                e = Except.wrap(e)
                retry -= 1
                if (
                    retry == 0
                    or "Access Denied" in e
                    or "No space left on device" in e
                ):
                    # out of retries, or a permanent failure
                    Log.error("could not push data to s3", cause=e)
                else:
                    Log.warning("could not push data to s3, will retry", cause=e)

        if self.settings.public:
            storage.set_acl("public-read")

        if VERIFY_UPLOAD:
            try:
                # round-trip 1: local file must decompress to the same lines
                with open(tempfile.abspath, mode="rb") as source:
                    result = list(ibytes2ilines(scompressed2ibytes(source)))
                    assertAlmostEqual(result, lines, msg="file is different")

                # round-trip 2: S3 itself must return the same lines
                result = list(self.read_lines(strip_extension(key)))
                assertAlmostEqual(result, lines, result, msg="S3 is different")
            except Exception as e:
                from activedata_etl.transforms import TRY_AGAIN_LATER
                Log.error(
                    TRY_AGAIN_LATER,
                    reason="did not pass verification",
                    cause=e
                )
    return