def add_check(
        self,
        database: str,
        urlset: str,
        check: str,
        value: str,
        valid: bool,
        diff: str,
        error: str,
        url_protocol: str,
        url_domain: str,
        url_path: str,
        url_query: str
):
    if 'bigquery' == database:
        if type(self._bigquery) is not BigQuery:
            raise ConfigurationMissingError('Missing a bigquery connection')

        self._bigquery.add_check(
            urlset,
            check,
            str(value),
            valid,
            diff,
            error,
            url_protocol,
            url_domain,
            url_path,
            url_query
        )
    else:
        if type(self._orm) is not ORM:
            raise ConfigurationMissingError('Missing an ORM connection')

        url = url_protocol + '://' + url_domain + url_path + url_query

        if url in self._cached_url_ids:
            url_id = self._cached_url_ids[url]
        else:
            url_id = self._urlset_urls_table.add(urlset, url_protocol, url_domain, url_path, url_query)
            # cache the freshly created id so the same URL is not inserted again within this run
            self._cached_url_ids[url] = url_id

        self._urlset_checks_table.add(urlset, url_id, check, valid, value, diff, error)
def _process_configuration(self, configuration: dict, database: str):
    if 'apiKey' in configuration and type(configuration['apiKey']) is str:
        api_key = configuration['apiKey']
    else:
        raise ConfigurationMissingError('Missing api key for configuration')

    if 'username' in configuration and type(configuration['username']) is str:
        username = configuration['username']
    else:
        raise ConfigurationMissingError('Missing username for configuration')

    if 'projectId' in configuration and type(configuration['projectId']) is str:
        project_id = int(configuration['projectId'])
    else:
        raise ConfigurationMissingError('Missing project id for configuration')

    table_reference = None

    if 'bigquery' == database:
        if 'tablename' in configuration and type(configuration['tablename']) is str:
            table_name = configuration['tablename']
        else:
            raise ConfigurationMissingError('Missing tablename for rankalys import to bigquery')

        dataset_name = None

        if 'dataset' in configuration and type(configuration['dataset']) is str:
            dataset_name = configuration['dataset']

        table_reference = self.connection.bigquery.table_reference(table_name, dataset_name)

    print('Project: {:d}'.format(project_id), end='')

    if 'bigquery' == database and self._bigquery_check_has_existing_data(project_id, table_reference):
        raise _DataAlreadyExistError()
    elif 'mongodb' == database and self._mongodb_check_has_existing_data(project_id):
        raise _DataAlreadyExistError()

    client = RankalysApiClient(api_key, username)
    rankings = self._process_keyword_ranking(client, project_id)

    if 'bigquery' == database:
        self._process_response_for_bigquery(table_reference, rankings)
    else:
        self._process_responses_for_mongodb(rankings)
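
# Illustrative sketch only (not part of the original module): a configuration dict
# with the keys that _process_configuration above validates. All values are
# placeholders; 'projectId' must be a string and is converted with int().
EXAMPLE_RANKALYS_CONFIGURATION = {
    'apiKey': 'your-rankalys-api-key',
    'username': 'your-rankalys-username',
    'projectId': '12345',
    # only evaluated when the configured database is 'bigquery':
    'tablename': 'rankalys_keyword_rankings',
    'dataset': 'seo',      # optional; when omitted, dataset_name stays None
}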
@staticmethod
def _process_matches_configuration(configuration_matches: list) -> list:
    matches = []

    for configuration_match in configuration_matches:
        expressions = []
        match = deepcopy(configuration_match)

        if 'fallback' not in match:
            match['fallback'] = ''

        if 'inputField' not in match:
            raise ConfigurationMissingError('missing inputField for match configuration')

        if 'outputField' not in match:
            raise ConfigurationMissingError('missing outputField for match configuration')

        if 'expressions' in match and type(match['expressions']) is list:
            for expression in match['expressions']:
                case_sensitive = True

                if 'caseSensitive' in expression:
                    case_sensitive = bool(expression['caseSensitive'])

                expression['caseSensitive'] = case_sensitive

                if 'regex' not in expression and 'csv' not in expression:
                    raise ConfigurationMissingError('Missing regex or csv for expression')
                elif 'csv' in expression:
                    csv_file_path = realpath(expression['csv'])

                    if not isfile(csv_file_path):
                        raise ConfigurationMissingError(
                            'CSV path "{:s}" does not exist'.format(expression['csv'])
                        )

                    expression['csv'] = read_csv(csv_file_path)
                    expression['useRegex'] = bool(expression['useRegex']) \
                        if 'useRegex' in expression else False
                elif 'regex' in expression:
                    expression['regex'] = re.compile(expression['regex']) \
                        if case_sensitive else \
                        re.compile(expression['regex'], re.IGNORECASE)

                expressions.append(expression)
        else:
            raise ConfigurationMissingError('missing expressions for match configuration')

        match['expressions'] = expressions
        matches.append(match)

    return matches
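
# Illustrative sketch only (values are invented): one entry of the list consumed by
# _process_matches_configuration above. An expression needs either 'regex' or 'csv';
# 'caseSensitive' defaults to True and 'fallback' to an empty string.
EXAMPLE_MATCHES_CONFIGURATION = [
    {
        'inputField': 'query',
        'outputField': 'query_category',
        'fallback': 'other',
        'expressions': [
            {'regex': r'^brand', 'caseSensitive': False},
            {'csv': 'category_mapping.csv', 'useRegex': False},
        ],
    },
]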
def __init__(self, configuration: Configuration):
    if type(configuration.databases.bigquery) is not ConfigurationBigQuery:
        raise ConfigurationMissingError('No bigquery connection configured')

    self._configuration = configuration
    self._client = None
    self._dataset = None
    self._additional_datasets = {}
    self._connected = False
    self._insert_batch = {}
def run(self):
    print('Running XPath Module:')
    timer_run = time()
    table_reference = None

    if 'bigquery' == self.module_configuration.database:
        self.bigquery = self.connection.bigquery

        if 'dataset' in self.module_configuration.settings and \
                type(self.module_configuration.settings['dataset']) is str:
            dataset = self.module_configuration.settings['dataset']
        else:
            raise ConfigurationMissingError('Missing dataset for xpath module settings')

        if 'tablename' in self.module_configuration.settings and \
                type(self.module_configuration.settings['tablename']) is str:
            tablename = self.module_configuration.settings['tablename']
        else:
            raise ConfigurationMissingError('Missing tablename for xpath module settings')

        table_reference = self.bigquery.table_reference(tablename, dataset)
    else:
        self.mongodb = self.connection.mongodb

    clusters = {}

    if 'clusters' in self.module_configuration.settings and \
            type(self.module_configuration.settings['clusters']) is dict:
        clusters = self._process_clusters(self.module_configuration.settings['clusters'])

    if 'configurations' in self.module_configuration.settings and \
            type(self.module_configuration.settings['configurations']) is list:
        self._process_configurations(
            self.module_configuration.settings['configurations'],
            clusters,
            self.module_configuration.database,
            table_reference
        )

    print('\ncompleted: {:s}'.format(str(timedelta(seconds=int(time() - timer_run)))))
def __init__(self, configuration: Configuration, configuration_key: str, connection: Connection):
    if not connection.has_bigquery() and not connection.has_orm():
        raise ConfigurationMissingError('Missing a database configuration for this operation')

    self.configuration = configuration
    self.module_configuration = configuration.operations.get_custom_configuration_operation(configuration_key)
    self.mongodb = connection.mongodb
    self.check_service = Check(connection)
def __init__(self, configuration: Configuration, configuration_key: str, connection: Connection):
    if not connection.has_bigquery() and not connection.has_mongodb():
        raise ConfigurationMissingError('Missing a database configuration for this operation')

    self.configuration = configuration
    self.module_configuration = configuration.operations.get_custom_configuration_operation(configuration_key)
    self.connection = connection
    self.mongodb = connection.mongodb
    self.bigquery = None
    self.matching_group_regex = re.compile(r'\$(\d+)')
def __init__(self, configuration: Configuration):
    self._engine = None
    self._connection = None
    self._configuration = configuration
    self.tables = None
    self._connected = False

    if type(configuration.databases.orm) is ConfigurationORM:
        self._engine = create_engine(configuration.databases.orm.connection_url)
    else:
        raise ConfigurationMissingError('No orm database connection configured')
@staticmethod
def _regex_matches(content: str, options: dict) -> list:
    if 'expression' in options and type(options['expression']) is str:
        case_sensitive = False

        if 'caseSensitive' in options and type(options['caseSensitive']) is bool:
            case_sensitive = options['caseSensitive']

        regex = re.compile(options['expression'], 0 if case_sensitive else re.IGNORECASE)
    else:
        raise ConfigurationMissingError('Missing expression for regex operation')

    matches = regex.findall(content)
    processed_matches = []

    for match in matches:
        if type(match) is str:
            processed_matches.append(match)
        elif type(match) is tuple:
            # findall returns tuples when the pattern contains multiple groups
            processed_matches.append('(' + '),('.join(match) + ')')

    return processed_matches
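
# Illustrative usage sketch (not part of the original source); the class owning
# _regex_matches is assumed here, so the call is shown generically:
#
#   options = {'expression': r'href="([^"]+)"', 'caseSensitive': False}
#   _regex_matches('<a href="/a">A</a> <a HREF="/b">B</a>', options)
#   # -> ['/a', '/b']  (matching is case-insensitive because caseSensitive is False)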
def run(self):
    print('Running operation GSC Matching:')
    processing_configurations = []

    if self.mongodb.has_collection(AggregationGoogleSearchConsole.COLLECTION_NAME_RETRY):
        for retry in self.mongodb.find(
                AggregationGoogleSearchConsole.COLLECTION_NAME_RETRY,
                {'module': 'operation'},
                True
        ):
            del retry['module']
            retry['requestDate'] = retry['requestDate'].date()
            processing_configurations.append(retry)

    if 'properties' in self.module_configuration.settings and \
            type(self.module_configuration.settings['properties']) is list:
        for property_configuration in self.module_configuration.settings['properties']:
            input_dataset = None
            output_dataset = None
            exclude_input_fields = []
            matches = []
            request_days_ago = 3

            if 'property' in property_configuration and type(property_configuration['property']) is str:
                gsc_property = property_configuration['property']
            else:
                raise ConfigurationMissingError('property is missing')

            if 'inputTable' in property_configuration and type(property_configuration['inputTable']) is str:
                input_table = property_configuration['inputTable']
            else:
                raise ConfigurationMissingError('input table is missing')

            if 'outputTable' in property_configuration and type(property_configuration['outputTable']) is str:
                output_table = property_configuration['outputTable']
            else:
                raise ConfigurationMissingError('output table is missing')

            if 'inputDataset' in property_configuration and type(property_configuration['inputDataset']) is str:
                input_dataset = property_configuration['inputDataset']
            elif 'bigquery' == self.module_configuration.database:
                raise ConfigurationMissingError('input dataset is missing')

            if 'outputDataset' in property_configuration and type(property_configuration['outputDataset']) is str:
                output_dataset = property_configuration['outputDataset']
            elif 'bigquery' == self.module_configuration.database:
                raise ConfigurationMissingError('output dataset is missing')

            if 'excludeInputFields' in property_configuration and \
                    type(property_configuration['excludeInputFields']) is list:
                exclude_input_fields = property_configuration['excludeInputFields']

            if 'matches' in property_configuration and type(property_configuration['matches']) is list:
                matches = property_configuration['matches']

            if 'dateDaysAgo' in property_configuration and type(property_configuration['dateDaysAgo']) is int:
                request_days_ago = property_configuration['dateDaysAgo']

            request_date = date.today() - timedelta(days=request_days_ago)

            processing_configuration = {
                'database': self.module_configuration.database,
                'property': gsc_property,
                'requestDate': request_date,
                'inputTable': input_table,
                'inputDataset': input_dataset,
                'outputTable': output_table,
                'outputDataset': output_dataset,
                'excludeInputFields': exclude_input_fields,
                'matches': matches,
            }

            if 0 == len(list(filter(lambda x: x == processing_configuration, processing_configurations))):
                processing_configurations.append(processing_configuration)

    for processing_configuration in processing_configurations:
        if 'bigquery' == processing_configuration['database'] and type(self.bigquery) is not BigQuery:
            self.bigquery = self.connection.bigquery

        try:
            self._process_property(
                processing_configuration['database'],
                processing_configuration['property'],
                processing_configuration['requestDate'],
                processing_configuration['inputTable'],
                processing_configuration['outputTable'],
                processing_configuration['inputDataset'],
                processing_configuration['outputDataset'],
                processing_configuration['excludeInputFields'],
                self._process_matches_configuration(processing_configuration['matches'])
            )

            if '_id' in processing_configuration:
                self.mongodb.delete_one(
                    AggregationGoogleSearchConsole.COLLECTION_NAME_RETRY,
                    processing_configuration['_id']
                )
        except _DataNotAvailableYet:
            existing_retry = None

            if self.mongodb.has_collection(AggregationGoogleSearchConsole.COLLECTION_NAME_RETRY):
                existing_retry = self.mongodb.find_one(
                    AggregationGoogleSearchConsole.COLLECTION_NAME_RETRY,
                    {
                        'module': 'operation',
                        'property': processing_configuration['property'],
                        'requestDate': datetime.combine(
                            processing_configuration['requestDate'],
                            datetime.min.time()
                        ),
                        'inputTable': processing_configuration['inputTable'],
                        'outputTable': processing_configuration['outputTable'],
                    }
                )

            if existing_retry is None:
                self.mongodb.insert_document(
                    AggregationGoogleSearchConsole.COLLECTION_NAME_RETRY,
                    {
                        'module': 'operation',
                        'database': processing_configuration['database'],
                        'property': processing_configuration['property'],
                        'requestDate': datetime.combine(
                            processing_configuration['requestDate'],
                            datetime.min.time()
                        ),
                        'inputTable': processing_configuration['inputTable'],
                        'inputDataset': processing_configuration['inputDataset'],
                        'outputTable': processing_configuration['outputTable'],
                        'outputDataset': processing_configuration['outputDataset'],
                        'excludeInputFields': processing_configuration['excludeInputFields'],
                        'matches': processing_configuration['matches']
                    }
                )
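
# Illustrative sketch only (table and dataset names are invented): one entry of the
# 'properties' list read by the GSC Matching run() above. 'inputDataset' and
# 'outputDataset' are only mandatory when the module database is 'bigquery';
# 'dateDaysAgo' defaults to 3.
EXAMPLE_GSC_MATCHING_PROPERTY = {
    'property': 'https://www.example.com/',
    'inputTable': 'searchconsole',
    'inputDataset': 'raw',
    'outputTable': 'searchconsole_matched',
    'outputDataset': 'aggregated',
    'excludeInputFields': ['position'],
    'matches': [],      # same structure as in _process_matches_configuration
    'dateDaysAgo': 3,
}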
def run(self):
    print('Running aggregation GSC Importer:')
    timer_run = time()
    import_properties = []

    if self.mongodb.has_collection(GoogleSearchConsole.COLLECTION_NAME_RETRY):
        for retry in self.mongodb.find(
                GoogleSearchConsole.COLLECTION_NAME_RETRY,
                {'module': 'aggregation'},
                True
        ):
            del retry['module']
            retry['requestDate'] = retry['requestDate'].date()
            import_properties.append(retry)

    if 'properties' in self.module_configuration.settings and \
            type(self.module_configuration.settings['properties']) is list:
        for property_configuration in self.module_configuration.settings['properties']:
            credentials = None
            request_days_ago = 3
            dimensions = GoogleSearchConsole.DEFAULT_DIMENSIONS
            search_types = GoogleSearchConsole.DEFAULT_SEARCHTYPES
            previous_data = []

            if 'property' in property_configuration and type(property_configuration['property']) is str:
                gsc_property = property_configuration['property']
            else:
                raise ConfigurationMissingError('property is missing')

            if 'credentials' in property_configuration and type(property_configuration['credentials']) is str:
                credentials = property_configuration['credentials']

            if 'dateDaysAgo' in property_configuration and type(property_configuration['dateDaysAgo']) is int:
                request_days_ago = property_configuration['dateDaysAgo']

            if 'dimensions' in property_configuration and type(property_configuration['dimensions']) is list:
                dimensions = property_configuration['dimensions']

            if 'searchTypes' in property_configuration and type(property_configuration['searchTypes']) is list:
                search_types = property_configuration['searchTypes']

            if 'previousData' in property_configuration and \
                    type(property_configuration['previousData']) is list:
                previous_data = property_configuration['previousData']

            if 'aggregationType' in property_configuration and \
                    type(property_configuration['aggregationType']) is str:
                aggregation_type = property_configuration['aggregationType']
            else:
                aggregation_type = ''

            request_date = date.today() - timedelta(days=request_days_ago)
            table_name = None
            dataset_name = None

            if 'bigquery' == self.module_configuration.database:
                if 'tablename' in property_configuration and type(property_configuration['tablename']) is str:
                    table_name = property_configuration['tablename']
                else:
                    raise ConfigurationMissingError('Missing tablename for gsc import to bigquery')

                if 'dataset' in property_configuration and type(property_configuration['dataset']) is str:
                    dataset_name = property_configuration['dataset']

                if type(self.bigquery) is not BigQuery:
                    self.bigquery = self.connection.bigquery

            import_property = {
                'credentials': credentials,
                'property': gsc_property,
                'requestDate': request_date,
                'dimensions': dimensions,
                'searchTypes': search_types,
                'previousData': previous_data,
                'aggregationType': aggregation_type,
                'database': self.module_configuration.database,
                'tableName': table_name,
                'datasetName': dataset_name,
            }

            if 0 == len(list(filter(lambda x: x == import_property, import_properties))):
                import_properties.append(import_property)

    for import_property in import_properties:
        try:
            credentials = None

            if 'credentials' in import_property and type(import_property['credentials']) is str:
                credentials = service_account.Credentials.from_service_account_file(
                    abspath(import_property['credentials']),
                    scopes=['https://www.googleapis.com/auth/webmasters.readonly']
                )

            api_service = build('webmasters', 'v3', credentials=credentials, cache_discovery=False)

            self.import_property(
                api_service,
                import_property['property'],
                import_property['requestDate'],
                import_property['dimensions'],
                import_property['searchTypes'],
                import_property['previousData'],
                import_property['aggregationType'],
                import_property['database'],
                import_property['tableName'],
                import_property['datasetName']
            )

            if '_id' in import_property:
                self.mongodb.delete_one(GoogleSearchConsole.COLLECTION_NAME_RETRY, import_property['_id'])
        except _DataAlreadyExistError:
            if '_id' in import_property:
                self.mongodb.delete_one(GoogleSearchConsole.COLLECTION_NAME_RETRY, import_property['_id'])
            print(' !!! already exists')
        except _DataNotAvailableYet:
            print(' !!! not available yet')
        except (UnknownApiNameOrVersion, HttpError) as api_error:
            print(' !!! ERROR')
            print(api_error)

            existing_retry = None

            if self.mongodb.has_collection(GoogleSearchConsole.COLLECTION_NAME_RETRY):
                existing_retry = self.mongodb.find_one(
                    GoogleSearchConsole.COLLECTION_NAME_RETRY,
                    {
                        'property': import_property['property'],
                        'requestDate': datetime.combine(import_property['requestDate'], datetime.min.time()),
                    }
                )

            if existing_retry is None:
                self.mongodb.insert_document(
                    GoogleSearchConsole.COLLECTION_NAME_RETRY,
                    {
                        'module': 'aggregation',
                        'credentials': import_property['credentials'],
                        'property': import_property['property'],
                        'requestDate': datetime.combine(import_property['requestDate'], datetime.min.time()),
                        'dimensions': import_property['dimensions'],
                        'searchTypes': import_property['searchTypes'],
                        'previousData': import_property['previousData'],
                        'aggregationType': import_property['aggregationType'],
                        'database': import_property['database'],
                        'tableName': import_property['tableName'],
                        'datasetName': import_property['datasetName'],
                    }
                )

    print('\ncompleted: {:s}'.format(str(timedelta(seconds=int(time() - timer_run)))))
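
# Illustrative sketch only (paths and names are invented): one entry of the
# 'properties' list read by the GSC Importer run() above. 'dimensions' and
# 'searchTypes' fall back to the class defaults, 'dateDaysAgo' to 3, and
# 'tablename'/'dataset' are only evaluated for the bigquery database.
EXAMPLE_GSC_IMPORT_PROPERTY = {
    'property': 'https://www.example.com/',
    'credentials': 'credentials/service-account.json',
    'dateDaysAgo': 3,
    'dimensions': ['date', 'query', 'page'],
    'searchTypes': ['web'],
    'previousData': [],
    'aggregationType': '',
    'tablename': 'searchconsole',
    'dataset': 'raw',
}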
def _process_configuration(self, configuration: dict, database: str):
    parameters = {}
    dataset = None
    table_reference = None

    if 'apiKey' in configuration and type(configuration['apiKey']) is str:
        api_key = configuration['apiKey']
    else:
        raise ConfigurationMissingError('Missing API Key for configuration')

    if 'projects' in configuration and type(configuration['projects']) is list:
        projects = configuration['projects']
    else:
        raise ConfigurationMissingError('Missing project for configuration')

    if 'method' in configuration and type(configuration['method']) is str:
        method = configuration['method']

        if not method.startswith('optimizer.'):
            method = 'optimizer.' + configuration['method']

        if SistrixApiClient.ENDPOINT_OPTIMIZER_VISIBILITY == method:
            method = SistrixApiClient.ENDPOINT_OPTIMIZER_VISIBILITY
            schema = (
                SchemaField('request_date', SqlTypeNames.DATETIME, 'REQUIRED'),
                SchemaField('date', SqlTypeNames.DATETIME, 'REQUIRED'),
                SchemaField('source', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('type', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('value', SqlTypeNames.FLOAT, 'REQUIRED'),
            )
        elif SistrixApiClient.ENDPOINT_OPTIMIZER_RANKING == method:
            method = SistrixApiClient.ENDPOINT_OPTIMIZER_RANKING
            schema = (
                SchemaField('request_date', SqlTypeNames.DATETIME, 'REQUIRED'),
                SchemaField('keyword', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('position', SqlTypeNames.INTEGER, 'REQUIRED'),
                SchemaField('positionOverflow', SqlTypeNames.BOOL, 'REQUIRED'),
                SchemaField('url', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('tags', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('device', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('country', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('traffic', SqlTypeNames.INTEGER, 'REQUIRED'),
                SchemaField('searchengine', SqlTypeNames.STRING, 'REQUIRED'),
            )
        else:
            raise ConfigurationInvalidError(
                'Invalid method "{}" in configuration'.format(configuration['method'])
            )
    else:
        raise ConfigurationMissingError('Missing method for configuration')

    if 'parameters' in configuration and type(configuration['parameters']) is dict:
        parameters = configuration['parameters']

    if 'dataset' in configuration and type(configuration['dataset']) is str:
        dataset = configuration['dataset']

    if 'bigquery' == database:
        if type(self.bigquery) is not BigQuery:
            self.bigquery = self.connection.bigquery

        if 'table' in configuration and type(configuration['table']) is str:
            table_reference = self.bigquery.table_reference(configuration['table'], dataset)
        else:
            raise ConfigurationMissingError('You have to set at least a table if you want to use bigquery')

    api_client = SistrixApiClient(api_key)
    responses = []

    request_date = datetime.utcnow().replace(tzinfo=timezone('UTC'))

    request = {
        'date': request_date.astimezone(timezone('Europe/Berlin')),
        **parameters
    }

    for project in projects:
        request['project'] = project

        try:
            if SistrixApiClient.ENDPOINT_OPTIMIZER_VISIBILITY == method:
                responses.extend(
                    self._process_visibility_response(api_client.request(method, request), request_date)
                )
            elif SistrixApiClient.ENDPOINT_OPTIMIZER_RANKING == method:
                if 'limit' not in request:
                    request['limit'] = self.DEFAULT_API_RANKING_LIMIT

                responses.extend(
                    self._process_ranking_response(api_client.request(method, request), request_date)
                )
        except SistrixApiError as error:
            print('API Error: ' + error.message)

    if 'bigquery' == self.module_configuration.database:
        self._process_responses_for_bigquery(responses, schema, table_reference)
    else:
        self._process_responses_for_mongodb(responses)
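
# Illustrative sketch only (project hashes and table names are invented): a
# configuration dict for the Sistrix Optimizer import above. 'method' may be given
# with or without the 'optimizer.' prefix and must resolve to either the visibility
# or the ranking endpoint; 'table' is required for bigquery, 'parameters' and
# 'dataset' are optional.
EXAMPLE_SISTRIX_OPTIMIZER_CONFIGURATION = {
    'apiKey': 'your-sistrix-api-key',
    'projects': ['abcdef1234567890'],
    'method': 'ranking',       # assumption: resolves to the optimizer ranking endpoint
    'parameters': {'limit': 1000},
    'table': 'sistrix_optimizer_ranking',
    'dataset': 'seo',
}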
def _process_request_configuration(self, configuration: dict, database: str):
    api_key = ''
    domain = None
    host = None
    paths = None
    urls = None
    daily = False
    on_weekday = None
    add_parameters_to_result = False
    methods = []
    dataset = None
    table_reference = None
    request_date = datetime.now().date()
    requests = []

    if 'apiKey' in configuration and type(configuration['apiKey']) is str:
        api_key = configuration['apiKey']

    if 'domain' in configuration and type(configuration['domain']) is str:
        domain = configuration['domain']

    if 'host' in configuration and type(configuration['host']) is str:
        host = configuration['host']

    if 'paths' in configuration and type(configuration['paths']) is list:
        paths = configuration['paths']

    if 'urls' in configuration and type(configuration['urls']) is list:
        urls = configuration['urls']

    if 'onlyOnWeekday' in configuration and (
            type(configuration['onlyOnWeekday']) is str or
            type(configuration['onlyOnWeekday']) is int
    ):
        on_weekday = configuration['onlyOnWeekday']
    else:
        daily = True

    if 'addParametersToResult' in configuration and type(configuration['addParametersToResult']) is bool:
        add_parameters_to_result = configuration['addParametersToResult']

    if 'methods' in configuration and type(configuration['methods']) is list:
        for method in configuration['methods']:
            if 'method' not in method or type(method['method']) is not str:
                raise ConfigurationMissingError('Missing api method')
            elif not method['method'].startswith('domain.'):
                method['method'] = 'domain.' + method['method']

            if method['method'] not in SistrixDomain.METHODS_PARAMETERS_ALLOWED.keys():
                raise ConfigurationInvalidError('The method "{}" is not allowed'.format(method['method']))

            if 'fieldName' not in method or type(method['fieldName']) is not str:
                raise ConfigurationMissingError('Missing a field name in api method')

            if 'parameters' not in method:
                method['parameters'] = {}
            elif type(method['parameters']) is not dict:
                raise ConfigurationInvalidError('Method parameters must be type of dictionary')

            for parameter in method['parameters']:
                if parameter not in SistrixDomain.METHODS_PARAMETERS_ALLOWED[method['method']]:
                    raise ConfigurationInvalidError(
                        'The parameter "{}" for "{}" is not allowed in this module'.format(
                            parameter,
                            method['method']
                        )
                    )

            if method['method'] in SistrixDomain.DAILY_PARAMETER_ALLOWED:
                method['parameters']['daily'] = daily

            methods.append(method)

    if 0 == len(methods):
        raise ConfigurationMissingError('Missing methods to request')

    if 'dataset' in configuration and type(configuration['dataset']) is str:
        dataset = configuration['dataset']

    if 'bigquery' == database:
        if type(self.bigquery) is not BigQuery:
            self.bigquery = self.connection.bigquery

        if 'table' in configuration and type(configuration['table']) is str:
            table_reference = self.bigquery.table_reference(configuration['table'], dataset)
        else:
            raise ConfigurationMissingError('You have to set at least a table if you want to use bigquery')

    if (domain is not None and (host is not None or paths is not None or urls is not None)) or \
            (host is not None and (domain is not None or paths is not None or urls is not None)) or \
            (paths is not None and (host is not None or domain is not None or urls is not None)) or \
            (urls is not None and (host is not None or paths is not None or domain is not None)):
        raise ConfigurationInvalidError('You can\'t use domain, host, path or url parallel to each other')

    if domain is None and host is None and paths is None and urls is None:
        raise ConfigurationInvalidError('You need one of these parameters: "domain, host, path, url"')

    if on_weekday is not None and (
            # weekday format may get influenced by locale
            on_weekday != '{:%a}'.format(datetime.now()) and
            on_weekday != '{:%A}'.format(datetime.now()) and
            on_weekday != datetime.now().isoweekday()
    ):
        return

    if domain is not None:
        requests.append({'domain': domain})

    if host is not None:
        requests.append({'host': host})

    if paths is not None:
        for path in paths:
            requests.append({'path': path})

    if urls is not None:
        for url in urls:
            requests.append({'url': url})

    responses = []
    sistrix_api_client = SistrixApiClient(api_key)

    for request in requests:
        for key, value in request.items():
            response_row = {}

            for method in methods:
                if 'bigquery' == database:
                    if self._bigquery_check_has_existing_data(
                            table_reference,
                            request_date,
                            add_parameters_to_result,
                            method['parameters']
                    ):
                        continue
                else:
                    if self._mongodb_check_has_existing_data(request_date, method['parameters']):
                        continue

                response_row = self._sistrix_api_requests(
                    sistrix_api_client,
                    method,
                    response_row,
                    add_parameters_to_result,
                    **{key: value}
                )

                # one row per method when the method parameters are part of the result
                if add_parameters_to_result:
                    responses.append({key: value, 'date': request_date, **response_row})

            # otherwise all methods are merged into a single row per request
            if not add_parameters_to_result:
                responses.append({key: value, 'date': request_date, **response_row})

    if 0 < len(responses):
        if table_reference is None and 'mongodb' == database:
            self.mongodb.insert_documents(SistrixDomain.COLLECTION_NAME, responses)
        elif 'bigquery' == database:
            self._process_response_rows_for_bigquery(responses, methods, table_reference)
        else:
            raise ConfigurationInvalidError('Invalid database configuration for this module')
def _process_configuration(self, configuration: dict, database: str):
    credentials = None
    dimensions = None
    metrics = None
    segment_id = None
    dimension_filter_clauses = {}
    metric_filter_clauses = {}
    table_reference = None

    if 'credentials' in configuration and type(configuration['credentials']) is str:
        credentials = service_account.Credentials.from_service_account_file(
            abspath(configuration['credentials']),
            scopes=['https://www.googleapis.com/auth/analytics.readonly']
        )

    logging.getLogger('googleapiclient.discovery').setLevel(logging.ERROR)

    self.api_service = build('analyticsreporting', 'v4', credentials=credentials, cache_discovery=False)

    if 'bigquery' == database:
        dataset = None

        if type(self.bigquery) is not BigQuery:
            self.bigquery = self.connection.bigquery

        if 'dataset' in configuration and type(configuration['dataset']) is str:
            dataset = configuration['dataset']

        if 'tablename' in configuration and type(configuration['tablename']) is str:
            table_reference = self.bigquery.table_reference(configuration['tablename'], dataset)
        else:
            raise ConfigurationMissingError('You have to set at least a table if you want to use bigquery')

    if 'dimensions' in configuration and type(configuration['dimensions']) is list:
        dimensions = configuration['dimensions']

    if 'metrics' in configuration and type(configuration['metrics']) is list:
        metrics = configuration['metrics']

    if 'dateDaysAgo' in configuration and type(configuration['dateDaysAgo']) is int:
        request_date = date.today() - timedelta(days=configuration['dateDaysAgo'])
    else:
        request_date = date.today() - timedelta(days=GoogleAnalytics.DEFAULT_DAYS_AGO)

    if 'segmentId' in configuration and (
            type(configuration['segmentId']) is str or
            type(configuration['segmentId']) is int or
            type(configuration['segmentId']) is float
    ):
        segment_id = str(configuration['segmentId'])

    if 'dimensionFilterClauses' in configuration and type(configuration['dimensionFilterClauses']) is dict:
        dimension_filter_clauses = configuration['dimensionFilterClauses']

    if 'metricFilterClauses' in configuration and type(configuration['metricFilterClauses']) is dict:
        metric_filter_clauses = configuration['metricFilterClauses']

    if 'views' in configuration and type(configuration['views']) is list:
        for view in configuration['views']:
            try:
                self._import_view(
                    int(view),
                    dimensions,
                    metrics,
                    segment_id,
                    dimension_filter_clauses,
                    metric_filter_clauses,
                    request_date,
                    database,
                    table_reference
                )
                print(' - OK')
                sleep(10)
            except _DataAlreadyExistError:
                print(' - EXISTS')
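
# Illustrative sketch only (ids and names are invented): a configuration dict for
# the Google Analytics import above. 'views' holds Analytics view ids; dimension
# and metric names follow the Reporting API v4 naming; 'tablename' is only
# required for bigquery.
EXAMPLE_GOOGLE_ANALYTICS_CONFIGURATION = {
    'credentials': 'credentials/service-account.json',
    'views': [12345678],
    'dimensions': ['ga:date', 'ga:landingPagePath'],
    'metrics': ['ga:sessions', 'ga:users'],
    'dateDaysAgo': 3,
    'segmentId': 'gaid::-5',    # optional; str, int or float are accepted
    'dimensionFilterClauses': {},
    'metricFilterClauses': {},
    'tablename': 'google_analytics',
    'dataset': 'raw',
}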
def _process_configurations(
        self,
        configurations: list,
        clusters: dict,
        database: str,
        table_reference: TableReference
):
    data = []

    for configuration in configurations:
        operation = None
        operation_options = None

        if 'query' in configuration and type(configuration['query']) is str:
            query = configuration['query']
        else:
            raise ConfigurationMissingError('Missing xpath query for configuration')

        if 'operation' in configuration and type(configuration['operation']) is str:
            operation = configuration['operation'].lower()

            if operation not in self.SUPPORTED_OPERATIONS:
                raise ConfigurationInvalidError('Invalid operation for xpath configuration')

            if 'operationOptions' in configuration and type(configuration['operationOptions']) is dict:
                operation_options = configuration['operationOptions']

        if 'name' in configuration and type(configuration['name']) is str:
            name = configuration['name']
        else:
            raise ConfigurationMissingError('Missing xpath name for configuration')

        if 'url' in configuration and type(configuration['url']) is str:
            if not Validator.validate_url(configuration['url']):
                raise ConfigurationInvalidError('Invalid url in xpath configuration')

            data.append({
                'url': configuration['url'],
                'query': query,
                'name': name,
                'cluster': None,
                'date': datetime.utcnow(),
                'elements': self._run_operation_on_elements(
                    self._xpath_query_on_html(
                        self._get_html_from_url(configuration['url']),
                        query
                    ),
                    operation,
                    operation_options
                )
            })
        elif 'cluster' in configuration:
            clusters_configuration = None

            if type(configuration['cluster']) is dict:
                clusters_configuration = configuration['cluster']
            elif type(configuration['cluster']) is str:
                if configuration['cluster'] in clusters:
                    clusters_configuration = clusters[configuration['cluster']]
                else:
                    cluster, subcluster = configuration['cluster'].split(sep=Xpath.DEFAULT_MATCH_SEPERATOR)

                    if cluster in clusters and subcluster in clusters[cluster]:
                        clusters_configuration = {subcluster: clusters[cluster][subcluster]}

            if type(clusters_configuration) is not dict:
                raise ConfigurationMissingError('Missing cluster configuration')

            for cluster, urls in clusters_configuration.items():
                for url in urls:
                    if type(url) is not str:
                        raise ConfigurationInvalidError('Invalid url')
                    elif not Validator.validate_url(url):
                        raise ConfigurationInvalidError('Invalid url')

                    data.append({
                        'url': url,
                        'query': query,
                        'name': name,
                        'cluster': cluster,
                        'date': datetime.utcnow(),
                        'elements': self._run_operation_on_elements(
                            self._xpath_query_on_html(
                                self._get_html_from_url(url),
                                query
                            ),
                            operation,
                            operation_options
                        )
                    })
        else:
            raise ConfigurationMissingError('Missing url parameter for xpath configuration')

    if 'bigquery' == database:
        self._process_responses_for_bigquery(data, table_reference)
    else:
        self._process_responses_for_mongodb(data)
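
# Illustrative sketch only (the query and names are invented): one entry of the
# 'configurations' list handled by _process_configurations above. Either 'url' or
# 'cluster' must be present; 'operation' and 'operationOptions' are optional and
# the operation must be listed in SUPPORTED_OPERATIONS.
EXAMPLE_XPATH_CONFIGURATION = {
    'name': 'page_title',
    'query': '//title/text()',
    'url': 'https://www.example.com/',
    # alternatively reference a cluster defined in the module settings:
    # 'cluster': 'products',
    'operation': 'regex',        # assumption: a supported operation name
    'operationOptions': {'expression': r'.+', 'caseSensitive': False},
}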