def release(self):
    # detach us from our rollups
    Archive.objects.filter(rollup=self).update(rollup=None)

    # delete our archive file from storage
    if self.url:
        bucket, key = self.get_storage_location()
        s3.client().delete_object(Bucket=bucket, Key=key)

    # and lastly delete ourselves
    self.delete()
def rewrite(self, transform, delete_old=False):
    """
    Rewrites the records in this archive using the given transform, uploading the result as a new S3 object
    """
    s3_client = s3.client()
    bucket, key = self.get_storage_location()

    # stream the existing archive file through the transform into a local temp file
    s3_obj = s3_client.get_object(Bucket=bucket, Key=key)
    old_file = s3_obj["Body"]
    new_file = tempfile.TemporaryFile()
    new_hash, new_size = jsonlgz_rewrite(old_file, new_file, transform)
    new_file.seek(0)

    # build the new key from the old key's type and period plus the new content hash
    match = KEY_PATTERN.match(key)
    new_key = f"{self.org.id}/{match.group('type')}_{match.group('period')}_{new_hash.hexdigest()}.jsonl.gz"
    new_url = f"https://{bucket}.s3.amazonaws.com/{new_key}"
    new_hash_base64 = base64.standard_b64encode(new_hash.digest()).decode()

    # upload the rewritten file, including the MD5 checksum so S3 can verify the upload
    s3_client.put_object(
        Bucket=bucket,
        Key=new_key,
        Body=new_file,
        ContentType="application/json",
        ContentEncoding="gzip",
        ACL="private",
        ContentMD5=new_hash_base64,
        Metadata={"md5chksum": new_hash_base64},
    )

    self.url = new_url
    self.hash = new_hash.hexdigest()
    self.size = new_size
    self.save(update_fields=("url", "hash", "size"))

    if delete_old:
        s3_client.delete_object(Bucket=bucket, Key=key)
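
# Usage sketch (an assumption, not part of this model): jsonlgz_rewrite above is assumed to
# call the transform with each parsed record dict and write back whatever it returns. A
# transform that redacts a hypothetical "urn" field might look like the function below; the
# field name is illustrative only.
def _example_redact_urn(record: dict) -> dict:
    # overwrite the assumed "urn" field if present, leaving everything else untouched
    if "urn" in record:
        record["urn"] = "deleted"
    return record


# An archive could then be rewritten in place, replacing the old S3 object:
#   archive.rewrite(_example_redact_urn, delete_old=True)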
def iter_records(self, *, where: dict = None):
    """
    Creates an iterator for the records in this archive, streaming and decompressing on the fly
    """
    s3_client = s3.client()

    if where:
        # filter records server side using S3 Select over the gzipped JSONL file
        bucket, key = self.get_storage_location()
        response = s3_client.select_object_content(
            Bucket=bucket,
            Key=key,
            ExpressionType="SQL",
            Expression=s3.compile_select(where=where),
            InputSerialization={"CompressionType": "GZIP", "JSON": {"Type": "LINES"}},
            OutputSerialization={"JSON": {"RecordDelimiter": "\n"}},
        )

        def generator():
            for record in EventStreamReader(response["Payload"]):
                yield record

        return generator()
    else:
        # no filtering, so fetch the whole file and decompress it as we iterate
        bucket, key = self.get_storage_location()
        s3_obj = s3_client.get_object(Bucket=bucket, Key=key)
        return jsonlgz_iterate(s3_obj["Body"])
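
# Usage sketch (an assumption, not part of this model): both branches above are assumed to
# yield parsed record dicts, so callers can stream an archive without loading it all into
# memory. The helper below simply counts records, optionally filtered server side via S3 Select.
def _example_count_records(archive, where: dict = None) -> int:
    count = 0
    for _record in archive.iter_records(where=where):
        count += 1
    return count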
@classmethod
def release_org_archives(cls, org):
    """
    Deletes all of the archives for an org, and removes any files remaining in S3 under the org's prefix
    """
    # release all of our archives in turn
    for archive in Archive.objects.filter(org=org):
        archive.release()

    # find any remaining S3 files for this org and remove them
    s3_client = s3.client()
    archive_files = s3_client.list_objects_v2(Bucket=settings.ARCHIVE_BUCKET, Prefix=f"{org.id}/").get("Contents", [])
    for archive_file in archive_files:
        s3_client.delete_object(Bucket=settings.ARCHIVE_BUCKET, Key=archive_file["Key"])
def get_download_link(self):
    if self.url:
        s3_client = s3.client()
        bucket, key = self.get_storage_location()
        s3_params = {
            "Bucket": bucket,
            "Key": key,
            # force browser to download and not uncompress our gzipped files
            "ResponseContentDisposition": "attachment;",
            "ResponseContentType": "application/octet",
            "ResponseContentEncoding": "none",
        }

        return s3_client.generate_presigned_url("get_object", Params=s3_params, ExpiresIn=Archive.DOWNLOAD_EXPIRES)
    else:
        return ""
def handle(self, org_id: int, archive_type: str, fix: bool, run_counts: bool, **options):
    org = Org.objects.filter(id=org_id).first()
    if not org:
        raise CommandError(f"No such org with id {org_id}")

    self.stdout.write(f"Auditing {archive_type} archives for org '{org.name}'...")

    s3_client = s3.client()
    flow_run_counts = defaultdict(int)

    # walk each archive file for this org and type, counting records and flagging overly long lines
    for archive in Archive._get_covering_period(org, archive_type):
        bucket, key = archive.get_storage_location()
        s3_obj = s3_client.get_object(Bucket=bucket, Key=key)
        stream = gzip.GzipFile(fileobj=s3_obj["Body"])

        num_records = 0
        num_too_long = 0

        for line in stream:
            num_records += 1
            record = line.decode("utf-8")
            if len(record) > S3_RECORD_MAX_CHARS:
                num_too_long += 1

            if archive_type == Archive.TYPE_FLOWRUN and run_counts:
                parsed = json.loads(record)
                flow_run_counts[parsed["flow"]["uuid"]] += 1

        self.stdout.write(
            f" > id={archive.id} start_date={archive.start_date.isoformat()} bucket={bucket} key={key} "
            f"records={num_records} num_too_long={num_too_long}"
        )

        if archive.record_count != num_records:
            self.stdout.write(f"   ⚠️ record count mismatch, db={archive.record_count} file={num_records}")

        if num_too_long > 0 and archive_type == Archive.TYPE_FLOWRUN and fix:
            self.fix_run_archive(archive)

    # optionally reconcile per-flow run counts against the database and the archives
    if archive_type == Archive.TYPE_FLOWRUN and run_counts:
        flows = org.flows.filter(is_active=True, is_system=False)
        self.stdout.write(f"Checking flow run counts for {flows.count()} flows...")

        db_counts = org.runs.values("flow_id").annotate(count=Count("id")).order_by("flow_id")
        db_counts = {f["flow_id"]: f["count"] for f in db_counts}

        for flow in flows.order_by("-created_on"):
            squashed_count = flow.get_run_stats()["total"]
            db_count = db_counts.get(flow.id, 0)
            archive_count = flow_run_counts[str(flow.uuid)]

            if squashed_count != (db_count + archive_count):
                self.stdout.write(
                    f"   ⚠️ count mismatch for flow '{flow.name}' ({flow.uuid}) "
                    f"squashed={squashed_count} db={db_count} archives={archive_count}"
                )
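
# Note (illustrative, not used by the command): the per-flow reconciliation above reduces to
# the check below, i.e. the squashed run count should equal runs still in the database plus
# runs already moved into archives; any difference is reported as a mismatch.
def _counts_reconcile(squashed_count: int, db_count: int, archive_count: int) -> bool:
    return squashed_count == db_count + archive_count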