Example #1
File: models.py Project: Ilhasoft/rapidpro
    def release(self):

        # detach us from our rollups
        Archive.objects.filter(rollup=self).update(rollup=None)

        # delete our archive file from storage
        if self.url:
            bucket, key = self.get_storage_location()
            s3.client().delete_object(Bucket=bucket, Key=key)

        # and lastly delete ourselves
        self.delete()
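
A minimal usage sketch for release(), assuming RapidPro's Django environment is loaded; the import path and the archive id below are assumptions, not part of the example.

# usage sketch -- import path and id are assumed
from temba.archives.models import Archive

archive = Archive.objects.get(id=1234)  # hypothetical archive
archive.release()  # detaches rollups, deletes the S3 object (if any), then deletes the row
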
Example #2
File: models.py Project: Ilhasoft/rapidpro
    def rewrite(self, transform, delete_old=False):
        s3_client = s3.client()
        bucket, key = self.get_storage_location()

        s3_obj = s3_client.get_object(Bucket=bucket, Key=key)
        old_file = s3_obj["Body"]

        new_file = tempfile.TemporaryFile()
        new_hash, new_size = jsonlgz_rewrite(old_file, new_file, transform)

        new_file.seek(0)

        match = KEY_PATTERN.match(key)
        new_key = f"{self.org.id}/{match.group('type')}_{match.group('period')}_{new_hash.hexdigest()}.jsonl.gz"
        new_url = f"https://{bucket}.s3.amazonaws.com/{new_key}"
        new_hash_base64 = base64.standard_b64encode(new_hash.digest()).decode()

        s3_client.put_object(
            Bucket=bucket,
            Key=new_key,
            Body=new_file,
            ContentType="application/json",
            ContentEncoding="gzip",
            ACL="private",
            ContentMD5=new_hash_base64,
            Metadata={"md5chksum": new_hash_base64},
        )

        self.url = new_url
        self.hash = new_hash.hexdigest()
        self.size = new_size
        self.save(update_fields=("url", "hash", "size"))

        if delete_old:
            s3_client.delete_object(Bucket=bucket, Key=key)
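
A usage sketch for rewrite(). Judging from the call to jsonlgz_rewrite above, transform appears to be applied per decoded record; that one-dict-in, one-dict-out reading and the field rewritten below are assumptions.

# usage sketch -- transform semantics are inferred from the call above; the field is hypothetical
from temba.archives.models import Archive

def redact_record(record: dict) -> dict:
    record["urn"] = "deleted"  # hypothetical per-record rewrite
    return record

archive = Archive.objects.get(id=1234)  # hypothetical archive
archive.rewrite(redact_record, delete_old=True)  # uploads the rewritten file, then removes the old key
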
Example #3
File: models.py Project: Ilhasoft/rapidpro
    def iter_records(self, *, where: dict = None):
        """
        Creates an iterator for the records in this archive, streaming and decompressing on the fly
        """

        s3_client = s3.client()

        if where:
            bucket, key = self.get_storage_location()
            response = s3_client.select_object_content(
                Bucket=bucket,
                Key=key,
                ExpressionType="SQL",
                Expression=s3.compile_select(where=where),
                InputSerialization={"CompressionType": "GZIP", "JSON": {"Type": "LINES"}},
                OutputSerialization={"JSON": {"RecordDelimiter": "\n"}},
            )

            def generator():
                for record in EventStreamReader(response["Payload"]):
                    yield record

            return generator()

        else:
            bucket, key = self.get_storage_location()
            s3_obj = s3_client.get_object(Bucket=bucket, Key=key)
            return jsonlgz_iterate(s3_obj["Body"])
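
A usage sketch for iter_records(). The example only shows that `where` is a dict compiled by s3.compile_select into an S3 Select expression; the filter key used below is an assumption.

# usage sketch -- the filter key is hypothetical
from temba.archives.models import Archive

archive = Archive.objects.get(id=1234)  # hypothetical archive

# with a filter the read goes through S3 Select; without one the whole gzipped JSONL file is streamed
for record in archive.iter_records(where={"contact__uuid": "0f68e64c-..."}):
    print(record)
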
Example #4
File: models.py Project: Ilhasoft/rapidpro
    @classmethod
    def release_org_archives(cls, org):
        """
        Deletes all the archives for an org, and also removes any files that remain under the org's S3 prefix.
        """
        # release all of our archives in turn
        for archive in Archive.objects.filter(org=org):
            archive.release()

        # find any remaining S3 files and remove them for this org
        s3_client = s3.client()
        archive_files = s3_client.list_objects_v2(Bucket=settings.ARCHIVE_BUCKET, Prefix=f"{org.id}/").get(
            "Contents", []
        )
        for archive_file in archive_files:
            s3_client.delete_object(Bucket=settings.ARCHIVE_BUCKET, Key=archive_file["Key"])
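
A usage sketch for release_org_archives(); the Org import path and the org id are assumptions.

# usage sketch -- import paths and the org id are assumed
from temba.archives.models import Archive
from temba.orgs.models import Org

org = Org.objects.get(id=1234)  # hypothetical org
Archive.release_org_archives(org)  # releases each archive, then sweeps leftover keys under f"{org.id}/"
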
Example #5
File: models.py Project: Ilhasoft/rapidpro
    def get_download_link(self):
        if self.url:
            s3_client = s3.client()
            bucket, key = self.get_storage_location()
            s3_params = {
                "Bucket": bucket,
                "Key": key,
                # force browser to download and not uncompress our gzipped files
                "ResponseContentDisposition": "attachment;",
                "ResponseContentType": "application/octet",
                "ResponseContentEncoding": "none",
            }

            return s3_client.generate_presigned_url(
                "get_object",
                Params=s3_params,
                ExpiresIn=Archive.DOWNLOAD_EXPIRES)
        else:
            return ""
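
A usage sketch for get_download_link(). The returned URL is a presigned S3 GET; since ExpiresIn is given in seconds, Archive.DOWNLOAD_EXPIRES is a lifetime in seconds.

# usage sketch -- import path and archive id are assumed
from temba.archives.models import Archive

archive = Archive.objects.get(id=1234)  # hypothetical archive
link = archive.get_download_link()  # "" when the archive has no stored file
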
Example #6
    def handle(self, org_id: int, archive_type: str, fix: bool, run_counts: bool, **options):
        org = Org.objects.filter(id=org_id).first()
        if not org:
            raise CommandError(f"No such org with id {org_id}")

        self.stdout.write(
            f"Auditing {archive_type} archives for org '{org.name}'...")

        s3_client = s3.client()
        flow_run_counts = defaultdict(int)

        for archive in Archive._get_covering_period(org, archive_type):
            bucket, key = archive.get_storage_location()
            s3_obj = s3_client.get_object(Bucket=bucket, Key=key)
            stream = gzip.GzipFile(fileobj=s3_obj["Body"])

            num_records = 0
            num_too_long = 0

            for line in stream:
                num_records += 1
                record = line.decode("utf-8")
                if len(record) > S3_RECORD_MAX_CHARS:
                    num_too_long += 1

                if archive_type == Archive.TYPE_FLOWRUN and run_counts:
                    parsed = json.loads(record)
                    flow_run_counts[parsed["flow"]["uuid"]] += 1

            self.stdout.write(
                f" > id={archive.id} start_date={archive.start_date.isoformat()} bucket={bucket} key={key} "
                f"records={num_records} num_too_long={num_too_long}")

            if archive.record_count != num_records:
                self.stdout.write(
                    f"   ⚠️ record count mismatch, db={archive.record_count} file={num_records}"
                )

            if num_too_long > 0 and archive_type == Archive.TYPE_FLOWRUN and fix:
                self.fix_run_archive(archive)

        if archive_type == Archive.TYPE_FLOWRUN and run_counts:
            flows = org.flows.filter(is_active=True, is_system=False)
            self.stdout.write(
                f"Checking flow run counts for {flows.count()} flows...")

            db_counts = org.runs.values("flow_id").annotate(
                count=Count("id")).order_by("flow_id")
            db_counts = {f["flow_id"]: f["count"] for f in db_counts}

            for flow in flows.order_by("-created_on"):
                squashed_count = flow.get_run_stats()["total"]
                db_count = db_counts.get(flow.id, 0)
                archive_count = flow_run_counts[str(flow.uuid)]

                if squashed_count != (db_count + archive_count):
                    self.stdout.write(
                        f" ⚠️ count mismatch for flow '{flow.name}' ({flow.uuid}) "
                        f"squashed={squashed_count} db={db_count} archives={archive_count}"
                    )
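
A usage sketch for the command above via Django's call_command. The command name "audit_archives" and the archive_type value are assumptions inferred from the handle() signature; only the option names come from the code shown.

# usage sketch -- command name and archive_type value are assumed
from django.core.management import call_command

call_command("audit_archives", org_id=1234, archive_type="run", fix=False, run_counts=True)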