Пример #1
0
class ADH(object):
  """Run ADH queries.

  This class runs ADH queries. Where they output is determined by the query
  itself, in ADH. All we can specify here is the date range - and we do this
  by accepting a lookback window and doing "yesterday - lookback".

  TODO (davidharcombe@) ADH Query Parameters
  """

  # Lookback window (in days) applied when the caller supplies none.
  DEFAULT_LOOKBACK_DAYS = 60

  # Dataset used for results when the report config has no 'dest_dataset'.
  DEFAULT_DATASET = 'adh_results'

  def __init__(self,
    email: str, project: str, adh_customer: str,
    adh_query: str, api_key: str, days: int=DEFAULT_LOOKBACK_DAYS,
    dest_project: str=None, dest_dataset: str=None):
    """Constructor

    Sets up the ADH helper: stores the settings and creates the Credentials,
    Cloud_Storage and Firestore helpers from `email` and `project`.

    Arguments:
        email {str} -- authenticated user email (for the token)
        project {str} -- GCP project
        adh_customer {str} -- ADH customer id, 9-digit number, NO DASHES
        adh_query {str} -- ADH query id
        api_key {str} -- API Key (has to be set up in APIs and Libraries in GCP)
        days {int} -- Lookback window (default: 60)
        dest_project {str} -- target GCP project for results
        dest_dataset {str} -- target BQ dataset for results
    """
    self.email = email
    self.project = project
    self.adh_customer = adh_customer
    self.adh_query = adh_query
    self.api_key = api_key
    self.days = days
    self.dest_project = dest_project
    self.dest_dataset = dest_dataset

    self.credentials = Credentials(email=email, project=project)
    self.storage = Cloud_Storage(email=email, project=project)
    self.firestore = Firestore(email=email, project=project)


  def run(self, unattended: bool=True):
    """Run the ADH query

    Fetch the query definition, store the report config in Firestore, start
    the query and store the config again with the run result under 'last_run'.
    The data itself will be written to Big Query by ADH.
    Remember that ADH queries have many, many constraints so use this wisely:
    DON'T set up an hourly run - check with ADH.

    Nothing is stored or started when no query details are returned.

    Keyword Arguments:
        unattended {bool} -- run unattended. Unused, but there for compatibility (default: {True})
    """
    query_details = self.fetch_query_details()
    if not query_details:
      return

    report = {
      'id': self.adh_query,
      'details': query_details,
      'customer_id': self.adh_customer,
      'table_name': self._sanitize_string(query_details['title']),
    }
    if self.dest_project:
      report['dest_project'] = self.dest_project

    if self.dest_dataset:
      report['dest_dataset'] = self.dest_dataset

    # Persist the config before starting the query so it is recorded even if
    # the start call fails ...
    self.firestore.store_report_config(
      type=Type.ADH,
      report_data=report,
      id=self.adh_query)

    # ... then persist it again with the outcome of the start call.
    result = self.run_query(report)
    report['last_run'] = result
    self.firestore.store_report_config(
      type=Type.ADH,
      report_data=report,
      id=self.adh_query)

    logging.info('Result: {result}'.format(result=result))


  def _get_adh_service(self) -> 'Resource':
    """Create the ADH Service

    Use the discovery API to create the ADH service

    Returns:
        Resource -- ADH service
    """
    return DiscoverService.get_service(Service.ADH, self.credentials, self.api_key)


  def _sanitize_string(self, original: str) -> str:
    """Sanitize Strings

    Convert any non alphanumeric into an '_' as per BQ requirements. Commas
    are replaced as well: they are not valid in a BQ table name.

    Arguments:
        original {str} -- the text to sanitize (here: the query title)

    Returns:
        str -- the text with every character outside a-z, A-Z, 0-9 replaced by '_'
    """
    return re.sub('[^a-zA-Z0-9]', '_', original)


  def fetch_query_details(self) -> Dict[str, Any]:
    """Get the Query details

    Returns:
        Dict[str, Any] -- response of the analysisQueries 'get' call for
            customers/<adh_customer>/analysisQueries/<adh_query>
    """
    service = self._get_adh_service()

    query_name = 'customers/{customer_id}/analysisQueries/{query_id}'.format(
      customer_id=self.adh_customer,
      query_id=self.adh_query)

    return service.customers().analysisQueries().get(name=query_name).execute()


  def _lookback_days(self) -> int:
    """Return the lookback window in days.

    Falls back to DEFAULT_LOOKBACK_DAYS when no window was supplied, and
    coerces numeric strings so that a value such as '30' is still usable.
    """
    if self.days is None:
      return self.DEFAULT_LOOKBACK_DAYS
    return int(self.days)


  @staticmethod
  def _date_parts(day) -> Dict[str, int]:
    """Split a date/datetime into the year/month/day dict used in the request."""
    return {'year': day.year, 'month': day.month, 'day': day.day}


  def _build_run_body(self, query_details: Dict[str, Any], end_date) -> Dict[str, Any]:
    """Build the body of the analysisQueries 'start' request.

    Arguments:
        query_details {Dict[str, Any]} -- report config; reads 'table_name',
            'customer_id' and the optional 'dest_project' / 'dest_dataset'
        end_date {date or datetime} -- last day of the range; the first day is
            end_date minus the lookback window

    Returns:
        Dict[str, Any] -- request body with 'spec', 'destTable' and 'customerId'
    """
    start_date = end_date - timedelta(days=self._lookback_days())

    dest_table = '{project}.{dataset}.{table_name}'.format(
      project=query_details.get('dest_project', self.project),
      dataset=query_details.get('dest_dataset', self.DEFAULT_DATASET),
      table_name=query_details['table_name'])

    return {
      "spec": {
        "startDate": self._date_parts(start_date),
        "endDate": self._date_parts(end_date),
      },
      "destTable": dest_table,
      "customerId": query_details['customer_id'],
    }


  def run_query(self, query_details: Dict[str, Any]) -> Dict[str, Any]:
    """Run the ADH query

    The date range ends yesterday (US/Eastern) and starts `days` before that.

    Arguments:
        query_details {Dict[str, Any]} -- the details of the query job

    Returns:
        Dict[str, Any] -- result of the query run directive
    """
    service = self._get_adh_service()

    yesterday = datetime.now(tz=pytz.timezone('US/Eastern')) - timedelta(days=1)
    body = self._build_run_body(query_details, yesterday)

    return service.customers().analysisQueries().start(
      name=query_details['details']['name'], body=body).execute()
Пример #2
0
class Report2BQ(object):
    """Route a product's report through its fetcher/processor.

    DV360 and CM reports go through a ReportFetcher; SA360 reports go through
    the SA360 helper. In both cases the resulting report config is stored in
    Firestore.
    """

    def __init__(self,
                 product: 'Type',
                 email=None,
                 project=None,
                 report_id=None,
                 profile=None,
                 sa360_url=None,
                 force: bool = False,
                 append: bool = False,
                 infer_schema: bool = False,
                 dest_project: str = None,
                 dest_dataset: str = 'report2bq',
                 notify_topic: str = None,
                 notify_message: str = None):
        """Store the run settings and create the Firestore helper.

        Args:
            product: product type; selects the handler used by run().
            email: user email, passed to Firestore, the fetcher and SA360.
            project: GCP project; also the prefix of the upload bucket name.
            report_id: id of the DV360/CM report to fetch.
            profile: passed to the fetcher factory as `profile`.
            sa360_url: SA360 report URL; stored URL-unquoted. The report id
                is taken from its `rid=` parameter.
            force: process the report even if 'last_updated' is unchanged.
            append: stored in the report config as 'append'.
            infer_schema: when true, the CSV types read from the report are
                passed to the schema builder; otherwise None is passed.
            dest_project: if set, stored in the report config.
            dest_dataset: if set, stored in the report config.
            notify_topic: if set, stored as report_data['notifier']['topic'].
            notify_message: if set (and a topic is set), stored as
                report_data['notifier']['message'].
        """
        self.product = product

        self.force = force
        self.email = email
        self.append = append
        self.infer_schema = infer_schema

        self.report_id = report_id

        self.sa360_url = unquote(sa360_url) if sa360_url else None

        self.cm_profile = profile

        self.project = project

        self.dest_project = dest_project
        self.dest_dataset = dest_dataset

        self.notify_topic = notify_topic
        self.notify_message = notify_message

        self.firestore = Firestore(email=email, project=project)

    def _apply_destination_and_notifier(self, report_data):
        """Copy the destination and notifier settings into `report_data`.

        Only settings that are truthy are written; existing keys for unset
        settings are left as they are. The dict is modified in place.
        """
        if self.dest_project:
            report_data['dest_project'] = self.dest_project
        if self.dest_dataset:
            report_data['dest_dataset'] = self.dest_dataset
        if self.notify_topic:
            notifier = {'topic': self.notify_topic}
            if self.notify_message:
                notifier['message'] = self.notify_message
            report_data['notifier'] = notifier

    @staticmethod
    def _sa360_report_id(url):
        """Return the digits following `rid=` in an SA360 report URL.

        Raises:
            ValueError: if `url` is empty or contains no `rid=<digits>`.
        """
        match = re.match(r'^.*rid=([0-9]+).*$', url) if url else None
        if match is None:
            raise ValueError(
                'SA360 URL has no "rid=<report id>" parameter: {url!r}'.format(
                    url=url))
        return match.group(1)

    def handle_report_fetcher(self, fetcher: 'ReportFetcher'):
        """Fetch the latest report file via `fetcher` and queue it for upload.

        Returns early, without storing anything, when a stored config exists
        with the same 'last_updated' value and `force` is not set.
        """
        # Get Latest Report
        report_object = fetcher.get_latest_report_file(self.report_id)

        # Current config, compared against the one stored by the last run.
        report_data = fetcher.fetch_report_config(report_object=report_object,
                                                  report_id=self.report_id)
        last_report = self.firestore.get_report_config(fetcher.report_type,
                                                       self.report_id)

        if (last_report
                and report_data['last_updated'] == last_report['last_updated']
                and not self.force):
            logging.info('No change: ignoring.')
            return

        # Normalize Report Details
        report_data = fetcher.normalize_report_details(
            report_object=report_object, report_id=self.report_id)

        report_data['email'] = self.email
        report_data['append'] = self.append
        self._apply_destination_and_notifier(report_data)

        if report_object:
            csv_header, csv_types = fetcher.read_header(report_data)
            if csv_header:
                # Column types are only passed on when schema inference is on.
                report_data['schema'] = CSVHelpers.create_table_schema(
                    csv_header, csv_types if self.infer_schema else None)
                fetcher.stream_to_gcs(f'{self.project}-report2bq-upload',
                                      report_data)

        self.firestore.store_report_config(fetcher.report_type, self.report_id,
                                           report_data)

    def handle_sa360(self):
        """Process the SA360 report named by `sa360_url` and store its config.

        Raises:
            ValueError: if the URL carries no `rid=<digits>` report id.
        """
        sa360 = SA360(project=self.project,
                      email=self.email,
                      infer_schema=self.infer_schema,
                      append=self.append)
        logging.info(self.sa360_url)
        report_id = self._sa360_report_id(self.sa360_url)
        report_data = self.firestore.get_report_config(Type.SA360, report_id)

        if not report_data:
            # Create new report details structure
            report_data = {
                'id': report_id,
                'url': self.sa360_url,
                'table_name': 'SA360_{id}'.format(id=report_id),
                'email': self.email,
            }

        self._apply_destination_and_notifier(report_data)
        sa360.process(
            bucket='{project}-report2bq-upload'.format(project=self.project),
            report_details=report_data)

        self.firestore.store_report_config(Type.SA360, report_id, report_data)

    def run(self):
        """Dispatch to the handler for `self.product`.

        DV360 and CM use a fetcher from FetcherFactory; SA360 uses
        handle_sa360(). Any other product is logged and ignored.
        """
        logging.info(f'Product: {self.product}')
        if self.product in [Type.DV360, Type.CM]:
            fetcher = FetcherFactory.create_fetcher(self.product,
                                                    email=self.email,
                                                    project=self.project,
                                                    profile=self.cm_profile)
            self.handle_report_fetcher(fetcher=fetcher)

        elif self.product == Type.SA360:
            self.handle_sa360()

        else:
            logging.warning('Unsupported product: %s', self.product)