Example #1
    def get_all_lines(self, encoding='utf8', flexible=False):
        try:
            iterator = self.raw.stream(4096, decode_content=False)

            # Any of three signals marks the stream as compressed: the
            # Content-Encoding header, the Content-Type header, or a ".gz"
            # suffix on the URL; all take the same decompression path
            if (self.headers.get('content-encoding') == 'gzip'
                    or self.headers.get('content-type') in [mimetype.ZIP, mimetype.GZIP]
                    or self.url.endswith('.gz')):
                return ibytes2ilines(icompressed2ibytes(iterator),
                                     encoding=encoding,
                                     flexible=flexible)
            else:
                # Plain stream; pass the response's close() so the
                # connection is released once the iterator is exhausted
                return ibytes2ilines(iterator,
                                     encoding=encoding,
                                     flexible=flexible,
                                     closer=self.close)
        except Exception as e:
            Log.error(u"Can not read content", cause=e)
Example #2
    def read_lines(self, key):
        source = self.get_meta(key)
        if source is None:
            Log.error("{{key}} does not exist", key=key)

        # Gzipped objects are decompressed lazily, regardless of size
        if source.key.endswith(".gz"):
            return LazyLines(ibytes2ilines(scompressed2ibytes(source)))
        # Small plain objects are read whole; large ones are streamed
        if source.size < MAX_STRING_SIZE:
            return source.read().decode('utf8').split("\n")
        return LazyLines(source)
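
A minimal usage sketch, assuming `bucket` is an instance of the class that defines this method and that "some-key" is a placeholder for an existing object key:

    for line in bucket.read_lines("some-key"):
        print(line)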
Example #3
    def write_lines(self, key, lines):
        self._verify_key_format(key)
        storage = self.bucket.new_key(str(key + ".json.gz"))

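        # Materialize the generator so the same lines remain available
        # for comparison during verification after the upload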
        if VERIFY_UPLOAD:
            lines = list(lines)

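        # Gzip the lines into a local temp file first, then upload the file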
        with mo_files.TempFile() as tempfile:
            with open(tempfile.abspath, "wb") as buff:
                DEBUG and Log.note("Temp file {{filename}}",
                                   filename=tempfile.abspath)
                archive = gzip.GzipFile(filename=str(key + ".json"),
                                        fileobj=buff,
                                        mode="w")
                count = 0
                for l in lines:
                    if is_many(l):
                        for ll in l:
                            archive.write(ll.encode("utf8"))
                            archive.write(b"\n")
                            count += 1
                    else:
                        archive.write(l.encode("utf8"))
                        archive.write(b"\n")
                        count += 1
                archive.close()

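            # Upload with up to 3 attempts; errors that cannot succeed on
            # retry (denied access, full disk) abort immediately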
            retry = 3
            while retry:
                try:
                    with Timer(
                            "Sending {{count}} lines in {{file_length|comma}} bytes for {{key}}",
                        {
                            "key": key,
                            "file_length": tempfile.length,
                            "count": count
                        },
                            verbose=self.settings.debug,
                    ):
                        storage.set_contents_from_filename(
                            tempfile.abspath,
                            headers={"Content-Type": mimetype.GZIP})
                    break
                except Exception as e:
                    e = Except.wrap(e)
                    retry -= 1
                    if (retry == 0 or "Access Denied" in e
                            or "No space left on device" in e):
                        Log.error("could not push data to s3", cause=e)
                    else:
                        Log.warning("could not push data to s3, will retry",
                                    cause=e)

            if self.settings.public:
                storage.set_acl("public-read")

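            # Re-read both the local archive and the S3 object to confirm
            # the upload round-trips to the same lines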
            if VERIFY_UPLOAD:
                try:
                    with open(tempfile.abspath, mode="rb") as source:
                        result = list(ibytes2ilines(
                            scompressed2ibytes(source)))
                        assertAlmostEqual(result,
                                          lines,
                                          msg="file is different")

                    # full_url = "https://"+self.name+".s3-us-west-2.amazonaws.com/"+storage.key.replace(":", "%3A")
                    # https://active-data-test-result.s3-us-west-2.amazonaws.com/tc.1524896%3A152488763.0.json.gz

                    # dest_bucket = s3.MultiBucket(bucket="self.name", kwargs=self.settings.aws)

                    result = list(self.read_lines(strip_extension(key)))
                    assertAlmostEqual(result,
                                      lines,
                                      msg="S3 is different")

                except Exception as e:
                    from activedata_etl.transforms import TRY_AGAIN_LATER

                    Log.error(TRY_AGAIN_LATER,
                              reason="did not pass verification",
                              cause=e)
        return
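
A minimal usage sketch, assuming `bucket` is an instance of the class that defines this method and that "some-key" is a placeholder that satisfies `_verify_key_format`; the lines must be unicode strings, since each is UTF-8 encoded before it is written:

    bucket.write_lines("some-key", ['{"a": 1}', '{"b": 2}'])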