Exemplo n.º 1
0
    def add_check(
            self,
            database: str,
            urlset: str,
            check: str,
            value: str,
            valid: bool,
            diff: str,
            error: str,
            url_protocol: str,
            url_domain: str,
            url_path: str,
            url_query: str
    ):
        """Persist one check result through the backend named by ``database``.

        ``'bigquery'`` forwards the result, with ``value`` converted via
        ``str``, to the BigQuery connection. Any other value stores it through
        the ORM tables: the URL id is read from ``self._cached_url_ids`` when
        the assembled URL is a key there, otherwise it is obtained from
        ``self._urlset_urls_table.add``.

        Raises:
            ConfigurationMissingError: if the connection needed for the
                selected backend is not of the expected type.
        """
        if 'bigquery' == database:
            if type(self._bigquery) is not BigQuery:
                raise ConfigurationMissingError('Missing a bigquery connection')

            self._bigquery.add_check(
                urlset,
                check,
                str(value),
                valid,
                diff,
                error,
                url_protocol,
                url_domain,
                url_path,
                url_query,
            )
            return

        if type(self._orm) is not ORM:
            raise ConfigurationMissingError('Missing a orm connection')

        # The cache lookup uses the URL assembled from its four parts.
        full_url = ''.join(
            (url_protocol, '://', url_domain, url_path, url_query))

        url_id = (
            self._cached_url_ids[full_url]
            if full_url in self._cached_url_ids
            else self._urlset_urls_table.add(
                urlset, url_protocol, url_domain, url_path, url_query)
        )

        self._urlset_checks_table.add(
            urlset, url_id, check, valid, value, diff, error)
    def _process_configuration(self, configuration: dict, database: str):
        """Fetch keyword rankings for one configuration and store them.

        Reads ``apiKey``, ``username`` and ``projectId`` (all required
        strings; the project id is converted with ``int``) from
        ``configuration``. For the ``'bigquery'`` database a ``tablename``
        string is also required and ``dataset`` is optional. The rankings
        returned by ``_process_keyword_ranking`` are written to BigQuery when
        ``database`` is ``'bigquery'`` and handed to the MongoDB handler
        otherwise.

        Raises:
            ConfigurationMissingError: if a required string entry is absent
                or not a ``str``.
            _DataAlreadyExistError: if the existing-data check of the
                selected backend reports existing data for the project.
        """
        def has_string(key: str) -> bool:
            # A key only counts when it is present and exactly of type str.
            return key in configuration and type(configuration[key]) is str

        if not has_string('apiKey'):
            raise ConfigurationMissingError(
                'Missing api key for configuration')

        if not has_string('username'):
            raise ConfigurationMissingError(
                'Missing username for configuration')

        if not has_string('projectId'):
            raise ConfigurationMissingError(
                'Missing project id for configuration')

        api_key = configuration['apiKey']
        username = configuration['username']
        project_id = int(configuration['projectId'])

        use_bigquery = 'bigquery' == database
        table_reference = None

        if use_bigquery:
            if not has_string('tablename'):
                raise ConfigurationMissingError(
                    'Missing tablename for pagespeed to bigquery')

            dataset_name = (
                configuration['dataset'] if has_string('dataset') else None)

            table_reference = self.connection.bigquery.table_reference(
                configuration['tablename'], dataset_name)

        print('Project: {:d}'.format(project_id), end='')

        # Only the bigquery and mongodb backends have an existing-data check.
        if use_bigquery:
            already_imported = self._bigquery_check_has_existing_data(
                project_id, table_reference)
        elif 'mongodb' == database:
            already_imported = self._mongodb_check_has_existing_data(
                project_id)
        else:
            already_imported = False

        if already_imported:
            raise _DataAlreadyExistError()

        rankings = self._process_keyword_ranking(
            RankalysApiClient(api_key, username), project_id)

        if use_bigquery:
            self._process_response_for_bigquery(table_reference, rankings)
            return

        self._process_responses_for_mongodb(rankings)
Exemplo n.º 3
0
    def _process_matches_configuration(configuration_matches: list) -> list:
        matches = []

        for configuration_match in configuration_matches:
            expressions = []
            match = deepcopy(configuration_match)

            if 'fallback' not in match:
                match['fallback'] = ''

            if 'inputField' not in match:
                raise ConfigurationMissingError(
                    'missing inputField for match configuration')

            if 'outputField' not in match:
                raise ConfigurationMissingError(
                    'missing outputField for match configuration')

            if 'expressions' in match and type(match['expressions']) is list:
                for expression in match['expressions']:
                    case_sensitive = True

                    if 'caseSensitive' in expression:
                        case_sensitive = bool(expression['caseSensitive'])

                    expression['caseSensitive'] = case_sensitive

                    if 'regex' not in expression and 'csv' not in expression:
                        raise ConfigurationMissingError(
                            'Missing expression or csv')
                    elif 'csv' in expression:
                        csv_file_path = realpath(expression['csv'])

                        if not isfile(csv_file_path):
                            raise ConfigurationMissingError(
                                'CSV path "{:s}" does not exist'.format(
                                    expression['csv']))

                        expression['csv'] = read_csv(csv_file_path)
                        expression['useRegex'] = bool(expression['useRegex']) \
                            if 'useRegex' in expression else False
                    elif 'regex' in expression:
                        expression['regex'] = re.compile(expression['regex']) \
                            if case_sensitive else \
                            re.compile(expression['regex'], re.IGNORECASE)

                    expressions.append(expression)
            else:
                raise ConfigurationMissingError(
                    'missing expressions for match configuration')

            match['expressions'] = expressions

            matches.append(match)

        return matches
Exemplo n.º 4
0
    def __init__(self, configuration: Configuration):
        """Set up an unconnected BigQuery wrapper for ``configuration``.

        Raises:
            ConfigurationMissingError: if ``configuration.databases.bigquery``
                is not a ``ConfigurationBigQuery`` instance.
        """
        bigquery_configuration = configuration.databases.bigquery

        if type(bigquery_configuration) is not ConfigurationBigQuery:
            raise ConfigurationMissingError('No bigquery connection configured')

        # Nothing is connected yet: no client, no dataset.
        self._connected = False
        self._client = None
        self._dataset = None

        # Both start out as empty dictionaries.
        self._additional_datasets = {}
        self._insert_batch = {}

        self._configuration = configuration
Exemplo n.º 5
0
    def run(self):
        """Run the XPath module with the settings of its module configuration.

        For the ``'bigquery'`` database the BigQuery connection is taken from
        ``self.connection`` and a table reference is built from the required
        ``dataset`` and ``tablename`` settings; for any other database the
        MongoDB connection is used and no table reference is created.
        Optional ``clusters`` (dict) are prepared by ``_process_clusters`` and
        optional ``configurations`` (list) are handed to
        ``_process_configurations``. The elapsed time is printed at the end.

        Raises:
            ConfigurationMissingError: if ``dataset`` or ``tablename`` is
                missing or not a string while the database is bigquery.
        """
        print('Running XPath Module:')
        timer_run = time()
        table_reference = None
        settings = self.module_configuration.settings

        if 'bigquery' == self.module_configuration.database:
            self.bigquery = self.connection.bigquery

            if 'dataset' in settings and type(settings['dataset']) is str:
                dataset = settings['dataset']
            else:
                raise ConfigurationMissingError('Missing dataset for xpath module settings')

            if 'tablename' in settings and type(settings['tablename']) is str:
                tablename = settings['tablename']
            else:
                # This message used to name the dataset, which pointed users
                # at the wrong setting.
                raise ConfigurationMissingError('Missing tablename for xpath module settings')

            table_reference = self.bigquery.table_reference(tablename, dataset)
        else:
            self.mongodb = self.connection.mongodb

        clusters = {}

        if 'clusters' in settings and type(settings['clusters']) is dict:
            clusters = self._process_clusters(settings['clusters'])

        if 'configurations' in settings and \
                type(settings['configurations']) is list:
            self._process_configurations(
                settings['configurations'],
                clusters,
                self.module_configuration.database,
                table_reference
            )

        print('\ncompleted: {:s}'.format(str(timedelta(seconds=int(time() - timer_run)))))
Exemplo n.º 6
0
    def __init__(self, configuration: Configuration, configuration_key: str,
                 connection: Connection):
        """Bind the operation to its configuration and database connection.

        The module configuration is looked up through
        ``configuration.operations.get_custom_configuration_operation`` using
        ``configuration_key``, and a ``Check`` service is created from
        ``connection``.

        Raises:
            ConfigurationMissingError: if ``connection`` reports neither a
                bigquery nor an orm database.
        """
        has_database = connection.has_bigquery() or connection.has_orm()

        if not has_database:
            raise ConfigurationMissingError(
                'Missing a database configuration for this operation')

        operations = configuration.operations

        self.configuration = configuration
        self.module_configuration = \
            operations.get_custom_configuration_operation(configuration_key)
        # NOTE(review): mongodb is read here although the guard above checks
        # for bigquery/orm - confirm this attribute is still needed.
        self.mongodb = connection.mongodb
        self.check_service = Check(connection)
Exemplo n.º 7
0
    def __init__(self, configuration: Configuration, configuration_key: str,
                 connection: Connection):
        """Bind the operation to its configuration and database connection.

        The module configuration is looked up through
        ``configuration.operations.get_custom_configuration_operation`` using
        ``configuration_key``. The MongoDB handle is taken from ``connection``
        immediately, while ``self.bigquery`` starts as ``None``.

        Raises:
            ConfigurationMissingError: if ``connection`` reports neither a
                bigquery nor a mongodb database.
        """
        has_database = connection.has_bigquery() or connection.has_mongodb()

        if not has_database:
            raise ConfigurationMissingError(
                'Missing a database configuration for this operation')

        operations = configuration.operations

        self.configuration = configuration
        self.module_configuration = \
            operations.get_custom_configuration_operation(configuration_key)
        self.connection = connection
        self.mongodb = connection.mongodb
        self.bigquery = None
        # Pattern for a dollar sign followed by digits, e.g. "$1".
        self.matching_group_regex = re.compile(r'\$(\d+)')
Exemplo n.º 8
0
    def __init__(self, configuration: Configuration):
        """Create the engine for the configured ORM database.

        The engine is built by ``create_engine`` from
        ``configuration.databases.orm.connection_url``. The connection and
        tables stay unset and ``_connected`` is ``False``.

        Raises:
            ConfigurationMissingError: if ``configuration.databases.orm`` is
                not a ``ConfigurationORM`` instance.
        """
        self._engine = None
        self._connection = None
        self._configuration = configuration
        self.tables = None
        self._connected = False

        orm_configuration = configuration.databases.orm

        if type(orm_configuration) is not ConfigurationORM:
            raise ConfigurationMissingError(
                'No orm database connection configured')

        self._engine = create_engine(
            configuration.databases.orm.connection_url)
Exemplo n.º 9
0
    def _regex_matches(content: str, options: dict) -> list:
        if 'expression' in options and type(options['expression']) is str:
            case_sensitive = False

            if 'caseSensitive' in options and type(options['caseSensitive']) is bool:
                case_sensitive = options['caseSensitive']

            regex = re.compile(options['expression'], 0 if case_sensitive else re.IGNORECASE)
        else:
            raise ConfigurationMissingError('Missing expression for regex operation')

        matches = regex.findall(content)
        processed_matches = []

        for match in matches:
            if type(match) is str:
                processed_matches.append(match)
            elif type(match) is tuple:
                processed_matches.append('(' + '),('.join(match) + ')')

        return processed_matches
Exemplo n.º 10
0
    def run(self):
        """Run the GSC matching for stored retries and configured properties.

        Retries stored for the ``operation`` module are loaded from the retry
        collection first. They are followed by one configuration per entry of
        the ``properties`` module setting, unless an equal configuration is
        already queued. Every queued configuration is passed to
        ``_process_property``. A retry that finishes is deleted from the
        collection. When ``_DataNotAvailableYet`` is raised, the configuration
        is stored as a retry unless a matching one already exists.

        Raises:
            ConfigurationMissingError: if a property configuration lacks a
                string ``property``, ``inputTable`` or ``outputTable``, or
                lacks ``inputDataset``/``outputDataset`` while the module
                database is bigquery.
        """
        def is_typed(container, key, expected) -> bool:
            # A setting only counts when present with exactly this type.
            return key in container and type(container[key]) is expected

        def at_midnight(day):
            # Retries store the request date as a datetime at 00:00.
            return datetime.combine(day, datetime.min.time())

        print('Running operation GSC Matching:')

        retry_collection = AggregationGoogleSearchConsole.COLLECTION_NAME_RETRY
        queue = []

        if self.mongodb.has_collection(retry_collection):
            stored_retries = self.mongodb.find(
                retry_collection, {'module': 'operation'}, True)

            for stored_retry in stored_retries:
                del stored_retry['module']
                stored_retry['requestDate'] = \
                    stored_retry['requestDate'].date()
                queue.append(stored_retry)

        settings = self.module_configuration.settings

        if is_typed(settings, 'properties', list):
            for property_configuration in settings['properties']:
                if not is_typed(property_configuration, 'property', str):
                    raise ConfigurationMissingError('property is missing')

                if not is_typed(property_configuration, 'inputTable', str):
                    raise ConfigurationMissingError('input table is missing')

                if not is_typed(property_configuration, 'outputTable', str):
                    raise ConfigurationMissingError('output table is missing')

                database = self.module_configuration.database

                # Datasets are only mandatory for the bigquery database.
                input_dataset = None

                if is_typed(property_configuration, 'inputDataset', str):
                    input_dataset = property_configuration['inputDataset']
                elif 'bigquery' == database:
                    raise ConfigurationMissingError('input dataset is missing')

                output_dataset = None

                if is_typed(property_configuration, 'outputDataset', str):
                    output_dataset = property_configuration['outputDataset']
                elif 'bigquery' == database:
                    raise ConfigurationMissingError(
                        'output dataset is missing')

                exclude_input_fields = \
                    property_configuration['excludeInputFields'] \
                    if is_typed(
                        property_configuration, 'excludeInputFields', list) \
                    else []

                matches = property_configuration['matches'] \
                    if is_typed(property_configuration, 'matches', list) \
                    else []

                request_days_ago = property_configuration['dateDaysAgo'] \
                    if is_typed(property_configuration, 'dateDaysAgo', int) \
                    else 3

                candidate = {
                    'database': database,
                    'property': property_configuration['property'],
                    'requestDate':
                        date.today() - timedelta(days=request_days_ago),
                    'inputTable': property_configuration['inputTable'],
                    'inputDataset': input_dataset,
                    'outputTable': property_configuration['outputTable'],
                    'outputDataset': output_dataset,
                    'excludeInputFields': exclude_input_fields,
                    'matches': matches,
                }

                if not any(queued == candidate for queued in queue):
                    queue.append(candidate)

        for job in queue:
            if 'bigquery' == job['database'] and \
                    type(self.bigquery) is not BigQuery:
                self.bigquery = self.connection.bigquery

            try:
                self._process_property(
                    job['database'],
                    job['property'],
                    job['requestDate'],
                    job['inputTable'],
                    job['outputTable'],
                    job['inputDataset'],
                    job['outputDataset'],
                    job['excludeInputFields'],
                    self._process_matches_configuration(job['matches']))

                # Only configurations loaded from the retry collection have
                # an _id.
                if '_id' in job:
                    self.mongodb.delete_one(retry_collection, job['_id'])
            except _DataNotAvailableYet:
                existing_retry = None

                if self.mongodb.has_collection(retry_collection):
                    existing_retry = self.mongodb.find_one(
                        retry_collection, {
                            'module': 'operation',
                            'property': job['property'],
                            'requestDate': at_midnight(job['requestDate']),
                            'inputTable': job['inputTable'],
                            'outputTable': job['outputTable'],
                        })

                if existing_retry is not None:
                    continue

                self.mongodb.insert_document(
                    retry_collection, {
                        'module': 'operation',
                        'database': job['database'],
                        'property': job['property'],
                        'requestDate': at_midnight(job['requestDate']),
                        'inputTable': job['inputTable'],
                        'inputDataset': job['inputDataset'],
                        'outputTable': job['outputTable'],
                        'outputDataset': job['outputDataset'],
                        'excludeInputFields': job['excludeInputFields'],
                        'matches': job['matches']
                    })
Exemplo n.º 11
0
    def run(self):
        """Import Google Search Console data for retries and properties.

        Retries stored for the ``aggregation`` module are loaded from the
        retry collection first. They are followed by one import definition
        per entry of the ``properties`` module setting, unless an equal
        definition is already queued. Each definition is imported through
        ``import_property`` with an API service built for its credentials.

        A retry that finishes, or whose data already exists, is deleted from
        the retry collection. Data that is not available yet is only
        reported. An API error is printed and the import is stored as a retry
        unless one already exists for the same module, property and date.
        The elapsed time is printed at the end.

        Raises:
            ConfigurationMissingError: if a property configuration lacks a
                string ``property``, or lacks ``tablename`` while the module
                database is bigquery.
        """
        def is_typed(container, key, expected) -> bool:
            # A setting only counts when present with exactly this type.
            return key in container and type(container[key]) is expected

        print('Running aggregation GSC Importer:')
        timer_run = time()
        retry_collection = GoogleSearchConsole.COLLECTION_NAME_RETRY
        import_properties = []

        if self.mongodb.has_collection(retry_collection):
            for retry in self.mongodb.find(
                    retry_collection, {'module': 'aggregation'}, True):
                del retry['module']
                retry['requestDate'] = retry['requestDate'].date()
                import_properties.append(retry)

        settings = self.module_configuration.settings

        if is_typed(settings, 'properties', list):
            for property_configuration in settings['properties']:
                credentials = None
                request_days_ago = 3
                dimensions = GoogleSearchConsole.DEFAULT_DIMENSIONS
                search_types = GoogleSearchConsole.DEFAULT_SEARCHTYPES
                previous_data = []
                aggregation_type = ''

                if is_typed(property_configuration, 'property', str):
                    gsc_property = property_configuration['property']
                else:
                    raise ConfigurationMissingError('property is missing')

                if is_typed(property_configuration, 'credentials', str):
                    credentials = property_configuration['credentials']

                if is_typed(property_configuration, 'dateDaysAgo', int):
                    request_days_ago = property_configuration['dateDaysAgo']

                if is_typed(property_configuration, 'dimensions', list):
                    dimensions = property_configuration['dimensions']

                if is_typed(property_configuration, 'searchTypes', list):
                    search_types = property_configuration['searchTypes']

                if is_typed(property_configuration, 'previousData', list):
                    previous_data = property_configuration['previousData']

                if is_typed(property_configuration, 'aggregationType', str):
                    aggregation_type = property_configuration[
                        'aggregationType']

                request_date = date.today() - timedelta(days=request_days_ago)
                table_name = None
                dataset_name = None

                if 'bigquery' == self.module_configuration.database:
                    if is_typed(property_configuration, 'tablename', str):
                        table_name = property_configuration['tablename']
                    else:
                        raise ConfigurationMissingError(
                            'Missing tablename for gsc import to bigquery')

                    if is_typed(property_configuration, 'dataset', str):
                        dataset_name = property_configuration['dataset']

                    if type(self.bigquery) is not BigQuery:
                        self.bigquery = self.connection.bigquery

                import_property = {
                    'credentials': credentials,
                    'property': gsc_property,
                    'requestDate': request_date,
                    'dimensions': dimensions,
                    'searchTypes': search_types,
                    'previousData': previous_data,
                    'aggregationType': aggregation_type,
                    'database': self.module_configuration.database,
                    'tableName': table_name,
                    'datasetName': dataset_name,
                }

                if import_property not in import_properties:
                    import_properties.append(import_property)

        for import_property in import_properties:
            try:
                credentials = None

                if is_typed(import_property, 'credentials', str):
                    credentials = service_account.Credentials.from_service_account_file(
                        abspath(import_property['credentials']),
                        scopes=[
                            'https://www.googleapis.com/auth/webmasters.readonly'
                        ])

                api_service = build('webmasters',
                                    'v3',
                                    credentials=credentials,
                                    cache_discovery=False)

                self.import_property(api_service, import_property['property'],
                                     import_property['requestDate'],
                                     import_property['dimensions'],
                                     import_property['searchTypes'],
                                     import_property['previousData'],
                                     import_property['aggregationType'],
                                     import_property['database'],
                                     import_property['tableName'],
                                     import_property['datasetName'])

                # Only definitions loaded from the retry collection have an
                # _id.
                if '_id' in import_property:
                    self.mongodb.delete_one(
                        retry_collection, import_property['_id'])
            except _DataAlreadyExistError:
                if '_id' in import_property:
                    self.mongodb.delete_one(
                        retry_collection, import_property['_id'])

                print(' !!! already exists')
            except _DataNotAvailableYet:
                print(' !!! not available yet')
            except (UnknownApiNameOrVersion, HttpError) as api_error:
                print(' !!! ERROR')
                print(api_error)

                retry_date = datetime.combine(
                    import_property['requestDate'], datetime.min.time())
                existing_retry = None

                if self.mongodb.has_collection(retry_collection):
                    # Retries carry a 'module' field and are loaded by it
                    # above. Filtering on it here keeps a retry of another
                    # module for the same property and date from suppressing
                    # this one.
                    existing_retry = self.mongodb.find_one(
                        retry_collection, {
                            'module': 'aggregation',
                            'property': import_property['property'],
                            'requestDate': retry_date,
                        })

                if existing_retry is None:
                    self.mongodb.insert_document(
                        retry_collection, {
                            'module': 'aggregation',
                            'credentials': import_property['credentials'],
                            'property': import_property['property'],
                            'requestDate': retry_date,
                            'dimensions': import_property['dimensions'],
                            'searchTypes': import_property['searchTypes'],
                            'previousData': import_property['previousData'],
                            'aggregationType':
                                import_property['aggregationType'],
                            'database': import_property['database'],
                            'tableName': import_property['tableName'],
                            'datasetName': import_property['datasetName'],
                        })

        print('\ncompleted: {:s}'.format(
            str(timedelta(seconds=int(time() - timer_run)))))
Exemplo n.º 12
0
    def _process_configuration(self, configuration: dict, database: str):
        """Request Sistrix optimizer data for one configuration and store it.

        ``configuration`` must provide ``apiKey`` (str), ``projects`` (list)
        and ``method`` (str; the ``optimizer.`` prefix is added when missing).
        ``parameters`` (dict) and ``dataset`` (str) are optional. For the
        ``'bigquery'`` database a ``table`` (str) is required as well.

        Each project is requested with the selected method. API errors are
        printed and the remaining projects are still processed. The collected
        responses are written to BigQuery when ``database`` is ``'bigquery'``
        and to MongoDB otherwise.

        Raises:
            ConfigurationMissingError: if ``apiKey``, ``projects``,
                ``method`` or (for bigquery) ``table`` is missing or has the
                wrong type.
            ConfigurationInvalidError: if ``method`` is neither the optimizer
                visibility nor the optimizer ranking endpoint.
        """
        parameters = {}
        dataset = None
        table_reference = None

        if 'apiKey' not in configuration or \
                type(configuration['apiKey']) is not str:
            raise ConfigurationMissingError(
                'Missing API Key for configuration')

        api_key = configuration['apiKey']

        if 'projects' not in configuration or \
                type(configuration['projects']) is not list:
            raise ConfigurationMissingError(
                'Missing project for configuration')

        projects = configuration['projects']

        if 'method' not in configuration or \
                type(configuration['method']) is not str:
            raise ConfigurationMissingError('Missing method for configuration')

        method = configuration['method']

        if not method.startswith('optimizer.'):
            method = 'optimizer.' + method

        if SistrixApiClient.ENDPOINT_OPTIMIZER_VISIBILITY == method:
            schema = (
                SchemaField('request_date', SqlTypeNames.DATETIME,
                            'REQUIRED'),
                SchemaField('date', SqlTypeNames.DATETIME, 'REQUIRED'),
                SchemaField('source', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('type', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('value', SqlTypeNames.FLOAT, 'REQUIRED'),
            )
        elif SistrixApiClient.ENDPOINT_OPTIMIZER_RANKING == method:
            schema = (
                SchemaField('request_date', SqlTypeNames.DATETIME,
                            'REQUIRED'),
                SchemaField('keyword', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('position', SqlTypeNames.INTEGER, 'REQUIRED'),
                SchemaField('positionOverflow', SqlTypeNames.BOOL,
                            'REQUIRED'),
                SchemaField('url', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('tags', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('device', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('country', SqlTypeNames.STRING, 'REQUIRED'),
                SchemaField('traffic', SqlTypeNames.INTEGER, 'REQUIRED'),
                SchemaField('searchengine', SqlTypeNames.STRING,
                            'REQUIRED'),
            )
        else:
            raise ConfigurationInvalidError(
                'Invalid method "{}" in configuration'.format(
                    configuration['method']))

        if 'parameters' in configuration and type(
                configuration['parameters']) is dict:
            parameters = configuration['parameters']

        if 'dataset' in configuration and type(
                configuration['dataset']) is str:
            dataset = configuration['dataset']

        if 'bigquery' == database:
            if type(self.bigquery) is not BigQuery:
                self.bigquery = self.connection.bigquery

            if 'table' in configuration and type(
                    configuration['table']) is str:
                table_reference = self.bigquery.table_reference(
                    configuration['table'], dataset)
            else:
                raise ConfigurationMissingError(
                    'You have to set at least a table if you want to use bigquery'
                )

        api_client = SistrixApiClient(api_key)

        responses = []
        request_date = datetime.utcnow().replace(tzinfo=timezone('UTC'))

        # Entries of `parameters` come last and therefore override 'date'.
        request = {
            'date': request_date.astimezone(timezone('Europe/Berlin')),
            **parameters
        }

        for project in projects:
            # The same request dict is reused; only the project changes.
            request['project'] = project

            try:
                if SistrixApiClient.ENDPOINT_OPTIMIZER_VISIBILITY == method:
                    responses.extend(
                        self._process_visibility_response(
                            api_client.request(method, request), request_date))
                elif SistrixApiClient.ENDPOINT_OPTIMIZER_RANKING == method:
                    if 'limit' not in request:
                        request['limit'] = self.DEFAULT_API_RANKING_LIMIT

                    responses.extend(
                        self._process_ranking_response(
                            api_client.request(method, request), request_date))
            except SistrixApiError as error:
                print('API Error: ' + error.message)

        # Use the same `database` argument that decided whether a table
        # reference was created, so the two decisions cannot disagree.
        if 'bigquery' == database:
            self._process_responses_for_bigquery(responses, schema,
                                                 table_reference)
        else:
            self._process_responses_for_mongodb(responses)
Exemplo n.º 13
0
    def _process_request_configuration(self, configuration: dict,
                                       database: str):
        """Request the configured Sistrix domain methods and store the rows.

        Reads the request target (exactly one of ``domain``, ``host``,
        ``paths`` or ``urls``), the api methods and the storage options from
        ``configuration``, requests every method for every target and hands
        the collected rows to the BigQuery or MongoDB writer selected by
        ``database``.

        Returns ``None`` without requesting anything when ``onlyOnWeekday``
        is configured and does not match the current weekday.

        :param configuration: a single request configuration
        :param database: storage backend, ``'bigquery'`` or ``'mongodb'``
        :raises ConfigurationMissingError: no usable method is configured, a
            method lacks its name or field name, or the bigquery table is
            missing
        :raises ConfigurationInvalidError: a method or parameter is not
            allowed, method parameters are not a dictionary, the request
            targets are missing or combined, or ``database`` is unsupported
        """
        api_key = ''
        domain = None
        host = None
        paths = None
        urls = None
        daily = False
        on_weekday = None
        add_parameters_to_result = False
        methods = []
        dataset = None
        table_reference = None
        request_date = datetime.now().date()
        requests = []

        if 'apiKey' in configuration and type(configuration['apiKey']) is str:
            api_key = configuration['apiKey']

        if 'domain' in configuration and type(configuration['domain']) is str:
            domain = configuration['domain']

        if 'host' in configuration and type(configuration['host']) is str:
            host = configuration['host']

        if 'paths' in configuration and type(configuration['paths']) is list:
            paths = configuration['paths']

        if 'urls' in configuration and type(configuration['urls']) is list:
            urls = configuration['urls']

        # Without a weekday restriction the request is treated as a daily one.
        if 'onlyOnWeekday' in configuration and (
                type(configuration['onlyOnWeekday']) is str
                or type(configuration['onlyOnWeekday']) is int):
            on_weekday = configuration['onlyOnWeekday']
        else:
            daily = True

        if 'addParametersToResult' in configuration and type(
                configuration['addParametersToResult']) is bool:
            add_parameters_to_result = configuration['addParametersToResult']

        if 'methods' in configuration and type(
                configuration['methods']) is list:
            for method in configuration['methods']:
                # "or" instead of "and": a missing key has to surface as a
                # configuration error (not a KeyError) and a non-string name
                # must never reach str.startswith below.
                if 'method' not in method or type(
                        method['method']) is not str:
                    raise ConfigurationMissingError('Missing api method')

                # Method names may be configured without the "domain." prefix.
                if not method['method'].startswith('domain.'):
                    method['method'] = 'domain.' + method['method']

                if method['method'] not in SistrixDomain.METHODS_PARAMETERS_ALLOWED:
                    raise ConfigurationInvalidError(
                        'The method "{}" is not allowed'.format(
                            method['method']))

                if 'fieldName' not in method or type(
                        method['fieldName']) is not str:
                    raise ConfigurationMissingError(
                        'Missing a field name in api method')

                if 'parameters' not in method:
                    method['parameters'] = {}
                elif type(method['parameters']) is not dict:
                    raise ConfigurationInvalidError(
                        'Method parameters must be type of dictionary')

                allowed_parameters = SistrixDomain.METHODS_PARAMETERS_ALLOWED[
                    method['method']]

                for parameter in method['parameters']:
                    if parameter not in allowed_parameters:
                        raise ConfigurationInvalidError(
                            'The parameter "{}" for "{}" is not allowed in this module'
                            .format(parameter, method['method']))

                if method['method'] in SistrixDomain.DAILY_PARAMETER_ALLOWED:
                    method['parameters']['daily'] = daily

                methods.append(method)

        if not methods:
            raise ConfigurationMissingError('Missing methods to request')

        if 'dataset' in configuration and type(
                configuration['dataset']) is str:
            dataset = configuration['dataset']

        if 'bigquery' == database:
            if type(self.bigquery) is not BigQuery:
                self.bigquery = self.connection.bigquery

            if 'table' in configuration and type(
                    configuration['table']) is str:
                table_reference = self.bigquery.table_reference(
                    configuration['table'], dataset)
            else:
                raise ConfigurationMissingError(
                    'You have to set at least a table if you want to use bigquery'
                )

        # Exactly one kind of request target may be configured.
        configured_targets = sum(
            target is not None for target in (domain, host, paths, urls))

        if 1 < configured_targets:
            raise ConfigurationInvalidError(
                'You can\'t use domain, host, path or url parallel to each other'
            )

        if 0 == configured_targets:
            raise ConfigurationInvalidError(
                'You need one of these parameters: "domain, host, path, url"')

        if on_weekday is not None:
            # The weekday may be given as abbreviated name, full name or iso
            # weekday number; the name formats may be influenced by the locale.
            now = datetime.now()

            if on_weekday not in ('{:%a}'.format(now), '{:%A}'.format(now),
                                  now.isoweekday()):
                return

        if domain is not None:
            requests.append({'domain': domain})

        if host is not None:
            requests.append({'host': host})

        if paths is not None:
            for path in paths:
                requests.append({'path': path})

        if urls is not None:
            for url in urls:
                requests.append({'url': url})

        responses = []

        sistrix_api_client = SistrixApiClient(api_key)

        for request in requests:
            # Every request dictionary holds exactly one target key.
            for key, value in request.items():
                response_row = {}

                for method in methods:
                    # Skip methods that already have data for the request date.
                    if 'bigquery' == database:
                        if self._bigquery_check_has_existing_data(
                                table_reference, request_date,
                                add_parameters_to_result,
                                method['parameters']):
                            continue
                    else:
                        if self._mongodb_check_has_existing_data(
                                request_date, method['parameters']):
                            continue

                    response_row = self._sistrix_api_requests(
                        sistrix_api_client, method, response_row,
                        add_parameters_to_result, **{key: value})

                    # With parameters in the result a row is stored after
                    # every method, otherwise one row per target (see below).
                    if add_parameters_to_result:
                        responses.append({
                            key: value,
                            'date': request_date,
                            **response_row
                        })

                if not add_parameters_to_result:
                    responses.append({
                        key: value,
                        'date': request_date,
                        **response_row
                    })

        if responses:
            if table_reference is None and 'mongodb' == database:
                self.mongodb.insert_documents(SistrixDomain.COLLECTION_NAME,
                                              responses)
            elif 'bigquery' == database:
                self._process_response_rows_for_bigquery(
                    responses, methods, table_reference)
            else:
                # The exception was only constructed before, never raised.
                raise ConfigurationInvalidError(
                    'Invalid database configuration for this module')
Exemplo n.º 14
0
    def _process_configuration(self, configuration: dict, database: str):
        """Import the configured Google Analytics views for one configuration.

        Builds the Analytics Reporting v4 service (with service account
        credentials when a ``credentials`` file is configured), resolves the
        BigQuery table when ``database`` is ``'bigquery'``, reads the report
        options and calls ``_import_view`` for every entry in ``views``.

        :param configuration: a single module configuration
        :param database: storage backend name
        :raises ConfigurationMissingError: ``database`` is ``'bigquery'`` and
            no ``tablename`` string is configured
        """
        credentials = None
        credentials_file = configuration.get('credentials')

        if type(credentials_file) is str:
            credentials = service_account.Credentials.from_service_account_file(
                abspath(credentials_file),
                scopes=['https://www.googleapis.com/auth/analytics.readonly'])

        # Only errors of the discovery client are logged.
        logging.getLogger('googleapiclient.discovery').setLevel(logging.ERROR)
        self.api_service = build('analyticsreporting',
                                 'v4',
                                 credentials=credentials,
                                 cache_discovery=False)

        table_reference = None

        if 'bigquery' == database:
            if type(self.bigquery) is not BigQuery:
                self.bigquery = self.connection.bigquery

            table_name = configuration.get('tablename')

            if type(table_name) is not str:
                raise ConfigurationMissingError(
                    'You have to set at least a table if you want to use bigquery'
                )

            dataset = configuration.get('dataset')

            if type(dataset) is not str:
                dataset = None

            table_reference = self.bigquery.table_reference(
                table_name, dataset)

        dimensions = configuration.get('dimensions')

        if type(dimensions) is not list:
            dimensions = None

        metrics = configuration.get('metrics')

        if type(metrics) is not list:
            metrics = None

        # Fall back to the module default when no integer offset is given.
        days_ago = configuration.get('dateDaysAgo')

        if type(days_ago) is not int:
            days_ago = GoogleAnalytics.DEFAULT_DAYS_AGO

        request_date = date.today() - timedelta(days=days_ago)

        # The segment id is always passed on as a string.
        segment = configuration.get('segmentId')
        segment_id = str(segment) if type(segment) in (str, int,
                                                       float) else None

        dimension_filter_clauses = configuration.get('dimensionFilterClauses')

        if type(dimension_filter_clauses) is not dict:
            dimension_filter_clauses = {}

        metric_filter_clauses = configuration.get('metricFilterClauses')

        if type(metric_filter_clauses) is not dict:
            metric_filter_clauses = {}

        views = configuration.get('views')

        if type(views) is not list:
            return

        for view in views:
            try:
                self._import_view(int(view), dimensions, metrics, segment_id,
                                  dimension_filter_clauses,
                                  metric_filter_clauses, request_date,
                                  database, table_reference)

                print(' - OK')
                # Pause between two view imports.
                sleep(10)
            except _DataAlreadyExistError:
                print(' - EXISTS')
Exemplo n.º 15
0
    def _process_configurations(
            self,
            configurations: list,
            clusters: dict,
            database: str,
            table_reference: TableReference
    ):
        """Run every xpath configuration and store the extracted elements.

        Each configuration needs a ``query`` and a ``name`` plus either a
        single ``url`` or a ``cluster``. A cluster is given inline as a
        dictionary (cluster name -> urls), as the name of an entry in
        ``clusters`` or as ``<cluster><separator><subcluster>`` to select one
        subcluster of such an entry. For every url the html is fetched, the
        xpath query is applied and the optional operation is run on the
        result; all rows are written at the end to BigQuery or MongoDB
        depending on ``database``.

        :param configurations: list of xpath configurations
        :param clusters: named cluster definitions shared by configurations
        :param database: ``'bigquery'`` selects BigQuery, anything else the
            MongoDB writer
        :param table_reference: BigQuery table passed on to the writer
        :raises ConfigurationMissingError: query, name, url/cluster or the
            referenced cluster definition is missing
        :raises ConfigurationInvalidError: unsupported operation or invalid
            url
        """
        data = []

        for configuration in configurations:
            operation = None
            operation_options = None

            if 'query' in configuration and type(configuration['query']) is str:
                query = configuration['query']
            else:
                raise ConfigurationMissingError('Missing xpath query for configuration')

            if 'operation' in configuration and type(configuration['operation']) is str:
                operation = configuration['operation'].lower()

                if operation not in self.SUPPORTED_OPERATIONS:
                    raise ConfigurationInvalidError('Invalid operation for xpath configuration')

            if 'operationOptions' in configuration and type(configuration['operationOptions']) is dict:
                operation_options = configuration['operationOptions']

            if 'name' in configuration and type(configuration['name']) is str:
                name = configuration['name']
            else:
                raise ConfigurationMissingError('Missing xpath name for configuration')

            if 'url' in configuration and type(configuration['url']) is str:
                if not Validator.validate_url(configuration['url']):
                    raise ConfigurationInvalidError('Invalid url in xpath configuartion')

                data.append({
                    'url': configuration['url'],
                    'query': query,
                    'name': name,
                    'cluster': None,
                    'date': datetime.utcnow(),
                    'elements': self._run_operation_on_elements(
                        self._xpath_query_on_html(
                            self._get_html_from_url(configuration['url']),
                            query
                        ),
                        operation,
                        operation_options
                    )
                })

            elif 'cluster' in configuration:
                clusters_configuration = None

                if type(configuration['cluster']) is dict:
                    clusters_configuration = configuration['cluster']
                elif type(configuration['cluster']) is str:
                    if configuration['cluster'] in clusters:
                        clusters_configuration = clusters[configuration['cluster']]
                    else:
                        # Split once and check the result instead of unpacking
                        # blindly: an unknown name without (or with several)
                        # separators must end in the configuration error
                        # below, not in a ValueError from tuple unpacking.
                        cluster_path = configuration['cluster'].split(
                            sep=Xpath.DEFAULT_MATCH_SEPERATOR,
                            maxsplit=1
                        )

                        if 2 == len(cluster_path):
                            cluster, subcluster = cluster_path

                            if cluster in clusters and subcluster in clusters[cluster]:
                                clusters_configuration = {subcluster: clusters[cluster][subcluster]}

                if type(clusters_configuration) is not dict:
                    raise ConfigurationMissingError('Missing cluster configuration')

                for cluster, urls in clusters_configuration.items():
                    for url in urls:
                        if type(url) is not str:
                            raise ConfigurationInvalidError('Invalid url')
                        elif not Validator.validate_url(url):
                            raise ConfigurationInvalidError('Invalid url')

                        data.append({
                            'url': url,
                            'query': query,
                            'name': name,
                            'cluster': cluster,
                            'date': datetime.utcnow(),
                            'elements': self._run_operation_on_elements(
                                self._xpath_query_on_html(
                                    self._get_html_from_url(url),
                                    query
                                ),
                                operation,
                                operation_options
                            )
                        })
            else:
                raise ConfigurationMissingError('Missing url parameter for xpath configuration')

        if 'bigquery' == database:
            self._process_responses_for_bigquery(data, table_reference)
        else:
            self._process_responses_for_mongodb(data)