Exemplo n.º 1
0
    def get_latest_aws_manifest_key(self):
        """Return the key of the top-level manifest for the configured period.

        Lists the objects of ``self.sourceBucket`` whose key starts with
        ``self.sourcePrefix`` plus the period prefix that
        ``utils.get_period_prefix`` builds from ``self.year`` and
        ``self.month``. The first listed key that contains ``-Manifest.json``
        and has no ``/`` after that prefix is returned.

        ``list_objects_v2`` returns a limited number of keys per call, so the
        listing is followed page by page until a match is found or the
        listing is exhausted.

        Returns:
            The S3 key of the manifest file.

        Raises:
            ManifestNotFoundError: if no matching key was found. This includes
                the case where listing the bucket failed: that error is
                printed together with its traceback, but not re-raised.
        """
        manifestprefix = self.sourcePrefix + utils.get_period_prefix(
            self.year, self.month)
        print("Getting Manifest key for acccount:[{}] - bucket:[{}] - prefix:[{}]".format(
            self.accountId, self.sourceBucket, manifestprefix))
        manifest_key = ''
        try:
            list_args = {'Bucket': self.sourceBucket, 'Prefix': manifestprefix}
            while not manifest_key:
                response = self.s3sourceclient.list_objects_v2(**list_args)
                for o in response.get('Contents', []):
                    key = o['Key']
                    post_prefix = key[key.find(manifestprefix) +
                                      len(manifestprefix):]
                    # Only accept a manifest that sits directly under the
                    # prefix, not one inside a sub-folder.
                    if '-Manifest.json' in key and "/" not in post_prefix:
                        manifest_key = key
                        break
                if not response.get('IsTruncated'):
                    break
                # More keys are available: request the next page.
                list_args['ContinuationToken'] = response[
                    'NextContinuationToken']
        except Exception as e:
            # Best effort: report the failure here and let the "not found"
            # check below raise for the caller.
            print("Error when getting manifest key for acccount:[{}] - bucket:[{}] - key:[{}]".format(
                self.accountId, self.sourceBucket, manifest_key))
            print(e.message)
            traceback.print_exc()

        if not manifest_key:
            # manifest_key is always empty here, so report the prefix that
            # was searched instead.
            raise ManifestNotFoundError(
                "Could not find manifest file in bucket:[{}], prefix:[{}]".format(
                    self.sourceBucket, manifestprefix))

        return manifest_key
 def test_role(self):
     """Print a success message if report keys can be listed from the source.

     Calls ``self.get_latest_aws_cur_keys`` with the source bucket, the source
     prefix (extended with the period prefix when both ``self.year`` and
     ``self.month`` are set) and the source S3 client. Nothing is printed
     when that call returns an empty result.
     """
     if self.year and self.month:
         period = utils.get_period_prefix(self.year, self.month)
     else:
         period = ""

     found_keys = self.get_latest_aws_cur_keys(
         self.sourceBucket, self.sourcePrefix + period, self.s3sourceclient)
     if not found_keys:
         return
     print("xAccount Source test passed!")
    def create_manifest(self, type, bucket, prefix, report_keys):
        """Build a Redshift or QuickSight manifest and upload it to S3.

        Args:
            type: ``consts.MANIFEST_TYPE_REDSHIFT`` or
                ``consts.MANIFEST_TYPE_QUICKSIGHT``; selects the manifest
                layout and its file name.
            bucket: bucket that holds the report files and receives the
                manifest.
            prefix: key prefix inside ``bucket``. The period prefix is
                appended to it when both ``self.year`` and ``self.month``
                are set.
            report_keys: keys to reference in the manifest. When empty, the
                keys returned by ``self.get_latest_aws_cur_keys`` for the
                destination client are used instead.

        The manifest is uploaded with ``self.s3destclient.put_object`` only
        when it references at least one file; otherwise a message is printed.
        """
        # The period sub-prefix is only used when both year and month are set.
        period = ""
        if self.year and self.month:
            period = utils.get_period_prefix(self.year, self.month)

        # No explicit keys given: fall back to the keys found under the prefix.
        if not report_keys:
            report_keys = self.get_latest_aws_cur_keys(
                bucket, prefix + period, self.s3destclient)

        redshift_entries = []
        file_uris = []
        for report_key in report_keys:
            #TODO: manifest cannot point to more than 1000 files (add validation)
            s3_url = "s3://" + bucket + "/" + report_key
            file_uris.append(s3_url)
            if type == consts.MANIFEST_TYPE_REDSHIFT:
                redshift_entries.append({"url": s3_url, "mandatory": True})
            # NOTE(review): the limit is compared against the Redshift entries
            # only, so it does not cap the URI list of a QuickSight manifest.
            if len(redshift_entries) == self.limit:
                break

        # NOTE(review): for a type that matches neither constant the document
        # stays empty and the file name stays "", yet it is still uploaded
        # below when report keys exist - confirm whether that is intended.
        document = {}
        file_name = ""
        if type == consts.MANIFEST_TYPE_REDSHIFT:
            document['entries'] = redshift_entries
            file_name = "billing-redshift-manifest-concurrencylabs.json"
        if type == consts.MANIFEST_TYPE_QUICKSIGHT:
            document['fileLocations'] = [{"URIs": file_uris}]
            file_name = "billing-quicksight-manifest-concurrencylabs.json"

        body = json.dumps(document, indent=4, sort_keys=False)
        print("Manifest ({}):{}".format(type, body))

        # Redshift entries take precedence; otherwise count the plain URIs.
        file_count = len(redshift_entries) or len(file_uris)
        print("Number of files in manifest: [{}]".format(file_count))

        #TODO: validate that no athena files exist in S3 destination, before creating manifest
        target_key = prefix + period + file_name
        if not file_count:
            print("No entries found - did not write manifest")
            return

        self.s3destclient.put_object(Bucket=bucket,
                                     Key=target_key,
                                     ACL='private',
                                     Body=body)
        print("Manifest S3 URL (this is the URL you provide in {}): [https://s3.amazonaws.com/{}/{}]".format(
            type, bucket, target_key))
    def get_all_aws_manifest_keys(self):
        """Return every manifest key found under the configured period prefix.

        Lists, across all result pages, the objects of ``self.sourceBucket``
        whose key starts with ``self.sourcePrefix`` plus the period prefix
        built by ``utils.get_period_prefix`` from ``self.year`` and
        ``self.month``. Every key containing ``-Manifest.json`` is collected,
        at any depth below the prefix.

        On every failure path ``self.status`` is set to
        ``consts.CUR_PROCESSOR_STATUS_ERROR`` and ``self.statusDetails``
        describes the failure.

        Returns:
            A non-empty list of manifest keys, in listing order.

        Raises:
            CurBucketNotFoundError: the listing failed with ``NoSuchBucket``.
            BotoClientError: any other client error, re-raised unchanged.
            ManifestNotFoundError: no manifest key was found. This includes
                the case where listing failed with some other exception,
                which is printed with its traceback but not re-raised.
        """
        manifestprefix = self.sourcePrefix + utils.get_period_prefix(
            self.year, self.month)
        print("Getting Manifest key for acccount:[{}] - bucket:[{}] - prefix:[{}]".format(
            self.accountId, self.sourceBucket, manifestprefix))
        manifest_key = []
        try:
            # Collect into a separate list and publish it only after the whole
            # listing succeeded, so a failure on a later page cannot leave a
            # partial result behind.
            found = []
            list_args = {'Bucket': self.sourceBucket, 'Prefix': manifestprefix}
            while True:
                # No Delimiter is passed, so keys inside sub-folders are
                # listed as well.
                response = self.s3sourceclient.list_objects_v2(**list_args)
                for o in response.get('Contents', []):
                    key = o['Key']
                    if '-Manifest.json' in key:
                        found.append(key)
                if not response.get('IsTruncated'):
                    break
                # More keys are available: request the next page.
                list_args['ContinuationToken'] = response[
                    'NextContinuationToken']
            manifest_key = found

        except BotoClientError as bce:
            self.status = consts.CUR_PROCESSOR_STATUS_ERROR
            if bce.response['Error']['Code'] == 'NoSuchBucket':
                self.statusDetails = bce.response['Error']['Code']
                raise CurBucketNotFoundError("{} - bucket:[{}]".format(
                    bce.message, self.sourceBucket))
            else:
                self.statusDetails = 'BotoClientError_' + bce.response[
                    'Error']['Code']
                raise

        except Exception as e:
            # Best effort: record and print the failure; the empty result
            # below turns it into ManifestNotFoundError.
            self.status = consts.CUR_PROCESSOR_STATUS_ERROR
            self.statusDetails = e.message
            print("Error when getting manifest key for acccount:[{}] - bucket:[{}] - key:[{}]".format(
                self.accountId, self.sourceBucket, manifest_key))
            print(e.message)
            traceback.print_exc()

        if not manifest_key:
            self.status = consts.CUR_PROCESSOR_STATUS_ERROR
            self.statusDetails = "ManifestNotFoundError - key:[{}]".format(
                manifest_key)
            raise ManifestNotFoundError(
                "Could not find manifest file in bucket:[{}]".format(
                    self.sourceBucket))

        print("Manifest Keys: [{}]".format(manifest_key))
        return manifest_key
    def process_latest_aws_cur(self, action):
        """Copy the latest report files of the period to the destination bucket.

        For every key returned by ``self.get_latest_aws_cur_keys`` the object
        is downloaded to a local temporary file, optionally rewritten, and
        uploaded to ``self.destBucket``:

        * ``consts.ACTION_PREPARE_ATHENA``: the first line of the gzip file is
          dropped and the rest is uploaded under a key that contains a fresh
          ``uuid.uuid1()`` value.
        * ``consts.ACTION_PREPARE_QUICKSIGHT``: the downloaded file is
          uploaded unchanged.

        Local temporary files are removed after each report, including when
        the download or upload fails. ``self.status`` is set to
        ``consts.CUR_PROCESSOR_STATUS_OK`` once all reports are processed.

        Args:
            action: one of the ``consts.ACTION_PREPARE_*`` values above.

        Returns:
            The list of destination S3 keys that were written.

        Raises:
            Exception: if ``self.destPrefix`` is rejected by
                ``utils.is_valid_prefix`` for one of the two actions above.
        """
        if action in (consts.ACTION_PREPARE_ATHENA,
                      consts.ACTION_PREPARE_QUICKSIGHT):
            if not utils.is_valid_prefix(self.destPrefix):
                raise Exception(
                    "Invalid Destination S3 Bucket prefix: [{}]".format(
                        self.destPrefix))

        period_prefix = utils.get_period_prefix(self.year, self.month)
        monthSourcePrefix = self.sourcePrefix + period_prefix
        monthDestPrefix = '{}{}/{}'.format(self.destPrefix, self.accountId,
                                           period_prefix)
        report_keys = self.get_latest_aws_cur_keys(self.sourceBucket,
                                                   monthSourcePrefix,
                                                   self.s3sourceclient)
        destS3keys = []

        # The working directory does not change between reports, so the local
        # folder is chosen once.
        if '/var/task' in os.getcwd():  #executing as a Lambda function
            tmpLocalFolder = '/tmp'
        else:
            tmpLocalFolder = os.getcwd() + '/tmp'

        for rk in report_keys:

            # Second-to-last path segment of the key; used as the local file
            # name and as the 'reportId' metadata value. The explicit index
            # (rather than [-2]) keeps keys without any "/" working.
            tokens = rk.split("/")
            report_id = tokens[len(tokens) - 2]

            if not os.path.isdir(tmpLocalFolder):
                os.mkdir(tmpLocalFolder)
            #temporary file that is downloaded from S3, before any modifications take place
            tmpLocalKey = tmpLocalFolder + '/tmp_' + rk.replace(
                "/", "-") + '.csv.gz'
            #final local file after any modifications take place
            finalLocalKey = tmpLocalFolder + '/' + report_id + '.csv.gz'
            fileToUpload = ''
            finalS3Key = ''

            try:
                #Download latest report as a tmp local file
                with open(tmpLocalKey, 'wb') as report:
                    self.s3resource.Bucket(self.sourceBucket).download_fileobj(
                        rk, report)

                #Read through the tmp local file and skip first line (for Athena)
                record_count = 0
                if action == consts.ACTION_PREPARE_ATHENA:
                    fileToUpload = finalLocalKey
                    finalS3Key = monthDestPrefix + str(
                        uuid.uuid1()) + "cost-and-usage-athena.csv.gz"
                    with gzip.open(tmpLocalKey, 'rb') as f:
                        # Skip the first line; the default keeps an empty
                        # report from raising StopIteration.
                        next(f, None)
                        # 'wb' rather than 'ab': never append to a stale file
                        # left behind by an earlier, interrupted run.
                        with gzip.open(finalLocalKey, 'wb') as no_header:
                            for line in f:
                                no_header.write(line)
                                record_count = record_count + 1

                    print("Number of records: [{}]".format(record_count))

                #TODO:if we're using the files for QuickSight, do a Copy operation and don't download.
                # NOTE(review): this key is the same for every report file of
                # the period, so a later file replaces an earlier one - confirm.
                if action == consts.ACTION_PREPARE_QUICKSIGHT:
                    fileToUpload = tmpLocalKey
                    finalS3Key = monthDestPrefix + "cost-and-usage-quicksight.csv.gz"

                print("Putting: [{}/{}] in [{}/{}]".format(self.sourceBucket, rk,
                                                           self.destBucket,
                                                           finalS3Key))

                # NOTE(review): for any other action fileToUpload is still ''
                # here and open() fails.
                with open(fileToUpload, 'rb') as data:
                    self.s3destclient.upload_fileobj(data,
                                                     self.destBucket,
                                                     finalS3Key,
                                                     ExtraArgs={
                                                         'Metadata': {
                                                             'reportId': report_id
                                                         },
                                                         'StorageClass':
                                                         'REDUCED_REDUNDANCY'
                                                     })
                    destS3keys.append(finalS3Key)
            finally:
                #Remove temporary files. This is also important to avoid Lambda errors where the local Lambda storage limit can be easily reached after a few executions
                # finalLocalKey only exists for the Athena action, so remove
                # only what is actually there.
                for local_path in (tmpLocalKey, finalLocalKey):
                    if os.path.exists(local_path):
                        os.remove(local_path)

        self.status = consts.CUR_PROCESSOR_STATUS_OK

        return destS3keys