Example #1
    def _determin_s3_path_for_gcp(self, file_type, gcp_file_name):
        """Determine the S3 path based on the invoice month."""
        invoice_month = gcp_file_name.split("_")[0]
        dh = DateHelper()
        start_of_invoice = dh.gcp_invoice_month_start(invoice_month)
        if file_type == DAILY_FILE_TYPE:
            # Daily files fall back to the "raw" report type when none is set.
            report_type = self.report_type
            if report_type is None:
                report_type = "raw"
            return get_path_prefix(
                self.account,
                self.provider_type,
                self.provider_uuid,
                start_of_invoice,
                Config.PARQUET_DATA_TYPE,
                report_type=report_type,
                daily=True,
            )
        if self.report_type == OPENSHIFT_REPORT_TYPE:
            return get_path_prefix(
                self.account,
                self.provider_type,
                self.provider_uuid,
                start_of_invoice,
                Config.PARQUET_DATA_TYPE,
                report_type=self.report_type,
                daily=True,
            )
        return get_path_prefix(
            self.account, self.provider_type, self.provider_uuid, start_of_invoice, Config.PARQUET_DATA_TYPE
        )
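The path choice above hinges on parsing the invoice month out of the GCP file name. A minimal sketch of that parsing step, assuming the file name begins with the invoice month in YYYYMM form; the helper below is a stand-in for DateHelper.gcp_invoice_month_start, whose behavior here is an assumption:

    from datetime import datetime

    def gcp_invoice_month_start(invoice_month: str) -> datetime:
        # Stand-in for DateHelper.gcp_invoice_month_start: normalize a
        # "YYYYMM" string to midnight on the first day of that month.
        return datetime.strptime(invoice_month, "%Y%m")

    gcp_file_name = "202208_1234_2022-08-01.csv"   # hypothetical file name
    invoice_month = gcp_file_name.split("_")[0]    # -> "202208"
    print(gcp_invoice_month_start(invoice_month))  # -> 2022-08-01 00:00:00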
Example #2
    def _get_or_create_cost_entry_bill(self, row, report_db_accessor):
        """Get or Create a GCP cost entry bill object.

        Args:
            row (OrderedDict): A dictionary representation of a CSV file row.

        Returns:
            (str) An id of a GCP bill.

        """
        table_name = GCPCostEntryBill
        dh = DateHelper()
        invoice_month = row["invoice.month"]
        start_time = dh.gcp_invoice_month_start(invoice_month)
        report_date_range = utils.month_date_range(start_time)
        start_date, end_date = report_date_range.split("-")

        start_date_utc = parser.parse(start_date).replace(hour=0, minute=0, tzinfo=pytz.UTC)
        end_date_utc = parser.parse(end_date).replace(hour=0, minute=0, tzinfo=pytz.UTC)

        # Both caches are keyed on (start datetime, provider uuid); check them
        # before touching the database.
        key = (start_date_utc, self._provider_uuid)
        if key in self.processed_report.bills:
            return self.processed_report.bills[key]

        if key in self.existing_bill_map:
            return self.existing_bill_map[key]

        data = {
            "billing_period_start": datetime.strftime(start_date_utc, "%Y-%m-%d %H:%M%z"),
            "billing_period_end": datetime.strftime(end_date_utc, "%Y-%m-%d %H:%M%z"),
            "provider_id": self._provider_uuid,
        }

        with transaction.atomic():
            bill_id = report_db_accessor.insert_on_conflict_do_nothing(
                table_name, data, conflict_columns=["billing_period_start", "provider_id"]
            )
            self.processed_report.bills[key] = bill_id

        return bill_id
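The method's core is a get-or-create pattern with two in-memory caches consulted before the database upsert. A minimal sketch of the same pattern with plain dicts standing in for the report accessor (all names below are hypothetical):

    from datetime import datetime

    processed_bills = {}  # bills created during this processing run
    existing_bills = {}   # bills already present in the database

    def get_or_create_bill(start_date_utc, provider_uuid, insert_fn):
        key = (start_date_utc, provider_uuid)
        if key in processed_bills:  # already created in this run
            return processed_bills[key]
        if key in existing_bills:   # already in the database
            return existing_bills[key]
        bill_id = insert_fn()       # stands in for insert_on_conflict_do_nothing
        processed_bills[key] = bill_id
        return bill_id

    bill_id = get_or_create_bill(datetime(2022, 8, 1), "uuid-1", lambda: 42)
    print(bill_id)  # -> 42; a second call with the same key skips insert_fn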
Example #3
    def _generate_monthly_pseudo_manifest(self, start_date):
        """
        Generate a dict representing an analog to other providers' "manifest" files.

        GCP does not produce a manifest file for monthly periods. So, we check for
        files in the bucket that match dates within the monthly period starting on
        the requested start_date.

        Args:
            start_date (datetime.datetime): when to start gathering reporting data

        Returns:
            Manifest-like dict with list of relevant found files.

        """
        etag = None
        invoice_month = start_date.strftime("%Y%m")
        etags = self.file_mapping.get(str(invoice_month), {})
        # Pick the first etag in this invoice month that has no manifest yet,
        # i.e. a report file that has not been processed.
        for etag_key in etags:
            with ReportManifestDBAccessor() as manifest_accessor:
                assembly_id = ":".join([str(self._provider_uuid), etag_key, str(invoice_month)])
                manifest = manifest_accessor.get_manifest(assembly_id, self._provider_uuid)
            if manifest:
                continue
            etag = etag_key
            break
        if not etag:
            return {}
        dh = DateHelper()
        start_date = dh.gcp_invoice_month_start(str(invoice_month))
        end_date = self.file_mapping[invoice_month][etag]["end"]
        file_names = [self.file_mapping[invoice_month][etag]["filename"]]
        manifest_data = {
            "assembly_id": assembly_id,
            "compression": UNCOMPRESSED,
            "start_date": start_date,
            "end_date": end_date,  # inclusive end date
            "file_names": file_names,
        }
        return manifest_data
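The assembly id is what makes the pseudo-manifest idempotent: it joins the provider UUID, the file's etag, and the invoice month, so re-running the check for an already-ingested file finds the existing manifest and skips it. For illustration, with hypothetical values:

    provider_uuid = "3c6e687e-1a09-4a05-970c-2ccf44b0952e"  # hypothetical
    etag_key = "9d5e6a1f"
    invoice_month = "202208"
    assembly_id = ":".join([str(provider_uuid), etag_key, str(invoice_month)])
    print(assembly_id)  # -> 3c6e687e-1a09-4a05-970c-2ccf44b0952e:9d5e6a1f:202208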
Example #4
def create_daily_archives(
    tracing_id, account, provider_uuid, filename, filepath, manifest_id, start_date, last_export_time, context=None
):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        tracing_id (str): The tracing id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        last_export_time (str): The last export time of the report data;
            hashed into a suffix for daily file names
        context (Dict): Logging context dictionary
    """
    context = context or {}  # replaces the None default with an empty dict
    download_hash = None
    daily_file_names = []
    if last_export_time:
        # Hash the export time so re-exported data gets distinct daily file names.
        download_hash = hashlib.md5(str(last_export_time).encode()).hexdigest()
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_GCP, account):
        dh = DateHelper()
        directory = os.path.dirname(filepath)
        try:
            data_frame = pd.read_csv(filepath)
        except Exception as error:
            LOG.error(f"File {filepath} could not be parsed. Reason: {str(error)}")
            raise
        for invoice_month in data_frame["invoice.month"].unique():
            invoice_filter = data_frame["invoice.month"] == invoice_month
            invoice_data = data_frame[invoice_filter]
            unique_times = invoice_data.partition_date.unique()
            # Collapse timestamps to their YYYY-MM-DD prefix to get the set of days.
            days = list({cur_dt[:10] for cur_dt in unique_times})
            daily_data_frames = [
                {"data_frame": invoice_data[invoice_data.partition_date.str.contains(cur_day)], "date": cur_day}
                for cur_day in days
            ]
            start_of_invoice = dh.gcp_invoice_month_start(invoice_month)
            s3_csv_path = get_path_prefix(
                account, Provider.PROVIDER_GCP, provider_uuid, start_of_invoice, Config.CSV_DATA_TYPE
            )
            for daily_data in daily_data_frames:
                day = daily_data.get("date")
                df = daily_data.get("data_frame")
                if download_hash:
                    day_file = f"{invoice_month}_{day}_{download_hash}.csv"
                else:
                    day_file = f"{invoice_month}_{day}.csv"
                day_filepath = f"{directory}/{day_file}"
                df.to_csv(day_filepath, index=False, header=True)
                copy_local_report_file_to_s3_bucket(
                    tracing_id, s3_csv_path, day_filepath, day_file, manifest_id, start_date, context
                )
                daily_file_names.append(day_filepath)
        return daily_file_names
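The heart of the function is the pandas split: filter rows per invoice month, derive the set of days from the first ten characters of partition_date, and write one CSV per day. A self-contained sketch of just that step, with toy data and none of the S3 or Django dependencies:

    import pandas as pd

    df = pd.DataFrame({
        "invoice.month": ["202208", "202208"],
        "partition_date": ["2022-08-01 00:00:00", "2022-08-02 00:00:00"],
        "cost": [1.5, 2.0],
    })
    for invoice_month in df["invoice.month"].unique():
        invoice_data = df[df["invoice.month"] == invoice_month]
        # One day per distinct YYYY-MM-DD prefix of partition_date.
        days = {cur_dt[:10] for cur_dt in invoice_data.partition_date.unique()}
        for day in sorted(days):
            daily = invoice_data[invoice_data.partition_date.str.contains(day)]
            daily.to_csv(f"{invoice_month}_{day}.csv", index=False, header=True)
            # -> writes 202208_2022-08-01.csv and 202208_2022-08-02.csv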