# Exemplo n.º 1 (0)
    def test_add_twitter_resource_processor_nonentity(self, mock_api,
                                                      mock_auth):
        '''Test twitter processor handles non-entities properly (neither a user
        nor hashtag).

        The processor is expected to raise ValueError with a descriptive
        message when the entity is not an @account, #hashtag or url: search.
        '''

        # input arguments used by our mock `ingest`
        datapackage = {
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': []
        }
        params = {
            'entity': 'non-entity',
            'project_id': 'my-project'
        }

        # Path to the processor we want to test
        processor_dir = \
            os.path.dirname(datapackage_pipelines_measure.processors.__file__)
        processor_path = os.path.join(processor_dir, 'add_twitter_resource.py')

        # Triggering the processor with our mock `ingest` must raise before
        # anything is returned to `spew`.
        error_msg = 'Entity, "non-entity", must be an @account, #hashtag, ' \
                    'or url:url-search'
        with self.assertRaises(ValueError) as cm:
            mock_processor_test(processor_path, (params, datapackage, []))

        self.assertEqual(str(cm.exception), error_msg)
    def test_add_facebook_resource_processor_page_no_data(self, mock_api):
        '''Facebook processor must raise when the api returns no data.'''

        mock_api.return_value = my_mock_api_no_response

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}
        params = {'project_id': 'my-project', 'entity': 'MyPage'}

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_facebook_resource.py')

        # Running the processor should surface a ValueError from the api.
        with self.assertRaises(ValueError) as cm:
            mock_processor_test(processor_path, (params, dp, []))

        self.assertEqual(str(cm.exception),
                         'Facebook request returned no data.')
    def test_add_facebook_resource_processor_nopagetoken(self):
        '''A page with no configured access token raises KeyError.'''

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}
        params = {'project_id': 'my-project', 'entity': 'NoPage'}

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_facebook_resource.py')

        # Running the processor should fail with the missing-token error.
        with self.assertRaises(KeyError) as cm:
            mock_processor_test(processor_path, (params, dp, []))

        error_msg = '\'No Facebook Page Access Token found for page ' \
            '"NoPage" in settings\''
        self.assertEqual(str(cm.exception), error_msg)
    def test_add_ckan_resource_processor_api_key(self, mock_request):
        '''The ckan-api-key param is sent as the Authorization header.'''
        mock_request.get('https://demo.ckan.org/api/3/action/resource_show',
                         json=MOCK_CKAN_RESPONSE)

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}
        params = {
            'ckan-host': 'https://demo.ckan.org',
            'resource-id': 'd51c9bd4-8256-4289-bdd7-962f8572efb0',
            'ckan-api-key': 'my-api-key'
        }

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_ckan.processors.__file__),
            'add_ckan_resource.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(processor_path, (params, dp, []))

        # The first request must carry our api key in an Authorization header.
        first_request = mock_request.request_history[0]
        assert first_request.headers['Authorization'] == 'my-api-key'
# Exemplo n.º 5 (0)
    def test_add_rubygems_resource_not_json(self, mock_request):
        '''A non-json rubygems response raises JSONDecodeError lazily.'''
        # Mocked API response: plain text where json is expected.
        mock_request.get('https://rubygems.org/api/v1/gems/mygem404.json',
                         text="This is not json.")

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}  # empty on the way in
        params = {'gem_id': 'mygem404'}

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_rubygems_resource.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(processor_path,
                                           (params, dp, iter([])))

        # The error only surfaces once the resource iterator is consumed.
        with self.assertRaises(simplejson.scanner.JSONDecodeError):
            list(spew_args[1])
# Exemplo n.º 6 (0)
    def test_add_rubygems_resource_bad_status(self, mock_request):
        '''A non-2xx rubygems response raises with the body as message.'''
        error_msg = 'Hi, there was a problem with your request.'
        # Mocked API response: 401 with a plain-text body.
        mock_request.get('https://rubygems.org/api/v1/gems/mygem401.json',
                         text=error_msg,
                         status_code=401)

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}  # empty on the way in
        params = {'gem_id': 'mygem401'}

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_rubygems_resource.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(processor_path,
                                           (params, dp, iter([])))

        # The error only surfaces once the resource iterator is consumed.
        with self.assertRaises(Exception) as cm:
            list(spew_args[1])
        self.assertEqual(str(cm.exception), error_msg)
# Exemplo n.º 7 (0)
    def test_add_discourse_resource_not_json(self, m):
        '''A 200 response that is not json raises ValueError lazily.'''

        # Mocked API response: 200 status but an unparseable body.
        m.get('https://discourse.example.com/admin/users/list/active.json',
              text="bad response",
              status_code=200)

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}  # empty on the way in
        params = {'domain': 'discourse.example.com'}

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_discourse_resource.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(processor_path,
                                           (params, dp, iter([])))

        # Consuming the resource iterator surfaces the ValueError.
        error_msg = "Expected JSON in response from: https://discourse.example.com/admin/users/list/active.json?api_key=myfakediscoursetoken&page=1"  # noqa
        with self.assertRaises(ValueError) as cm:
            list(spew_args[1])
        self.assertEqual(str(cm.exception), error_msg)
# Exemplo n.º 8 (0)
    def test_add_github_resource_processor_badstatus(self, mock_request):
        '''A non-2xx github response raises RuntimeError.'''

        # Mocked github response: rate-limited (403).
        mock_request.get(
            'https://api.github.com/repos/org/my_github_repo',
            json={
                "message": "API rate limit exceeded for user.",
                "documentation_url":
                    "https://developer.github.com/v3/#rate-limiting"
            },
            status_code=403)

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}
        params = {'repo': 'org/my_github_repo', 'name': 'hello'}

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_github_resource.py')

        # Running the processor must fail fast on the bad status.
        with self.assertRaises(RuntimeError):
            mock_processor_test(processor_path, (params, dp, []))
# Exemplo n.º 9 (0)
    def test_add_github_resource_processor_notjson(self, mock_request):
        '''A github response that is not json raises JSONDecodeError.'''

        # Mocked github response: plain text where json is expected.
        mock_request.get('https://api.github.com/repos/org/my_github_repo',
                         text="Hi")

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}
        params = {'repo': 'org/my_github_repo', 'name': 'hello'}

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_github_resource.py')

        # Running the processor must fail on the unparseable body.
        with self.assertRaises(simplejson.scanner.JSONDecodeError):
            mock_processor_test(processor_path, (params, dp, []))
    def test_validate_processor_valid_resource(self):
        '''A valid resource produces a passing goodtables report, both on disk
        and in the datapackage's `reports` property.'''

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': [{
                'name': 'my-resource',
                'schema': {
                    'fields': [
                        {'name': 'id', 'type': 'integer'},
                        {'name': 'name', 'type': 'string'},
                    ]
                }
            }]
        }

        report_dir = str(self.get_base_path())

        params = {
            'reports_path': report_dir,
            'datapackage_reports_path': 'reports',
        }

        def make_rows():
            yield {'id': 1, 'name': 'english'}
            yield {'id': 2, 'name': 'german'}

        # Locate the processor under test.
        processor_dir = os.path.dirname(
            datapackage_pipelines_goodtables.processors.__file__)
        processor_path = os.path.join(processor_dir, 'validate.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(
            processor_path, (params, dp, iter([make_rows()])))

        spew_dp, spew_res_iter = spew_args[0], spew_args[1]

        # Validation is triggered by iterating the rows of the resource.
        contents = list(spew_res_iter)
        list(contents[0])

        # The datapackage gains exactly one goodtables report entry.
        reports = spew_dp['reports']
        assert len(reports) == 1
        assert reports[0]['resource'] == 'my-resource'
        assert reports[0]['reportType'] == 'goodtables'
        assert reports[0]['path'] == 'reports/my-resource.json'

        # And the report written to disk marks the resource as valid.
        with io.open('{}/my-resource.json'.format(report_dir), 'r') as f:
            report_json = json.loads(f.read())
            assert report_json['valid'] is True
    def test_validate_processor_no_resources_with_params(self):
        '''With no resources there is nothing to validate and no reports.'''

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}
        params = {'datapackage_reports_path': 'reports', 'fail_on_error': True}

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(
                datapackage_pipelines_goodtables.processors.__file__),
            'validate.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(processor_path, (params, dp, []))

        spew_dp = spew_args[0]

        # No resources went in, none come out, and no reports were produced.
        assert len(spew_dp['resources']) == 0
        assert len(spew_dp['reports']) == 0
    def test_add_ckan_resource_processor_misc_error(self, mock_request):
        '''A ckan error payload makes the processor raise.'''
        mock_request.get('https://demo.ckan.org/api/3/action/resource_show',
                         json=MOCK_CKAN_ERROR)

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}
        params = {
            'ckan-host': 'https://demo.ckan.org',
            'resource-id': 'd51c9bd4-8256-4289-bdd7-962f8572efb0'
        }

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_ckan.processors.__file__),
            'add_ckan_resource.py')

        # Running the processor must surface the error returned by ckan.
        with self.assertRaises(Exception):
            mock_processor_test(processor_path, (params, dp, []))
    def test_achanges_acl_handles_non_existing_keys(self):
        '''Changing the ACL on a path with no keys must be a no-op, not an
        error.'''
        # Should be in setup but requires mock
        s3 = boto3.client('s3', endpoint_url=os.environ['S3_ENDPOINT_URL'])
        bucket = 'my.private.bucket'
        try:
            s3.create_bucket(ACL='public-read', Bucket=bucket)
        except Exception:
            # Bucket may already exist from a previous test; that's fine.
            # (Was a bare `except:`, which would also swallow SystemExit
            # and KeyboardInterrupt.)
            pass

        params = {
            'bucket': bucket,
            'path': 'my/non-existing/datasets',
            'acl': 'private'
        }

        # Path to the processor we want to test
        processor_dir = os.path.dirname(
            datapackage_pipelines_aws.processors.__file__)
        processor_path = os.path.join(processor_dir, 'change_acl.py')
        spew_args, _ = mock_processor_test(processor_path, (params, {
            'name': 'test',
            'resources': []
        }, [[]]))
        dp = spew_args[0]
        # Just make sure processor executed without errors (must not do anything)
        self.assertEqual(dp['name'], 'test')
# Exemplo n.º 14 (0)
    def test_add_github_resource_processor(self, mock_request):
        '''Mapped github fields end up in the resource schema and row.'''
        # Mocked github response.
        mock_request.get(requests_mock.ANY, json={
            'name': 'my-repository',
            'subscribers_count': 4,
            'stargazers_count': 1
        })

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}
        params = {
            'name': 'hello',
            'repo': 'my_github_repo',
            'map_fields': {
                'repository': 'name',
                'watchers': 'subscribers_count',
                'stars': 'stargazers_count'
            }
        }

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_github_resource.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(processor_path, (params, dp, []))
        spew_dp, spew_res_iter = spew_args[0], spew_args[1]

        # Datapackage asserts: one resource with the mapped schema fields.
        resources = spew_dp['resources']
        assert len(resources) == 1
        assert resources[0]['name'] == 'hello'
        schema_fields = [f['name'] for f in resources[0]['schema']['fields']]
        assert schema_fields == [
            'repository', 'watchers', 'stars', 'source', 'date'
        ]

        # Resource iterator asserts: a single row with the mapped values.
        contents = list(spew_res_iter)
        assert len(contents) == 1
        assert list(contents[0]) == [{
            'repository': 'my-repository',
            'watchers': 4,
            'stars': 1,
            'source': 'github',
            'date': datetime.date.today()
        }]
    def test_validate_processor_invalid_resource_fail_on_error_no_report(self):
        '''Fail on error but don't write report.'''
        # Arguments our mocked `ingest` supplies to the processor.
        dp = {
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': [{
                'name': 'resource-fail-no-write',
                'schema': {
                    'fields': [
                        {'name': 'id', 'type': 'string'},
                        {'name': 'name', 'type': 'string'},
                    ]
                }
            }]
        }
        report_dir = str(self.get_base_path())

        params = {
            'reports_path': report_dir,
            'fail_on_error': True,
            'write_report': False
        }

        def make_rows():
            # A non-string id makes goodtables flag the row.
            yield {'id': 1, 'name': 'english'}

        # Locate the processor under test.
        processor_dir = os.path.dirname(
            datapackage_pipelines_goodtables.processors.__file__)
        processor_path = os.path.join(processor_dir, 'validate.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(
            processor_path, (params, dp, iter([make_rows()])))

        contents = list(spew_args[1])

        # Iterating the rows triggers validation, which must fail hard.
        error_msg = 'Datapackage resource \'resource-fail-no-write\' ' \
                    'failed Goodtables validation.'
        with self.assertRaises(RuntimeError) as cm:
            list(contents[0])
        self.assertEqual(str(cm.exception), error_msg)

        # And with write_report=False no report file may exist.
        assert not os.path.isfile(
            '{}/resource-fail-no-write.json'.format(report_dir))
# Exemplo n.º 16 (0)
    def test_add_rubygems_resource_processor(self, mock_request):
        '''No latest in database. Get today's data.'''
        # Mocked rubygems response.
        mock_request.get('https://rubygems.org/api/v1/gems/mygem.json',
                         json={
                             'name': 'mygem',
                             'downloads': 271,
                             'version': '0.3.1',
                             'version_downloads': 170
                         })

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}
        params = {'gem_id': 'mygem'}

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_rubygems_resource.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(processor_path, (params, dp, []))
        spew_dp, spew_res_iter = spew_args[0], spew_args[1]

        # Datapackage asserts: one resource named after the gem.
        resources = spew_dp['resources']
        assert len(resources) == 1
        assert resources[0]['name'] == 'mygem'
        schema_fields = [f['name'] for f in resources[0]['schema']['fields']]
        assert schema_fields == [
            'source', 'date', 'package', 'downloads', 'total_downloads'
        ]

        # Resource iterator asserts: a single row with today's totals.
        contents = list(spew_res_iter)
        assert len(contents) == 1
        assert list(contents[0]) == [{
            'package': 'mygem',
            'source': 'rubygems',
            'total_downloads': 271,
            'date': datetime.date.today()
        }]
# Exemplo n.º 17 (0)
    def test_add_twitter_resource_processor_account(self, mock_api, mock_auth,
                                                    mock_cursor):
        '''Test twitter processor handles user account (@myuser) properties.'''

        # Mocked twitter api plumbing: auth, api object and the cursors the
        # processor iterates for search and timeline results.
        mock_auth.return_value = 'authed'
        mock_api.return_value = my_mock_api
        mock_cursor.return_value.items.side_effect = [
            get_cursor_items_iter(my_mock_api.search()),
            get_cursor_items_iter(my_mock_api.user_timeline())
        ]

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}
        params = {'project_id': 'my-project', 'entity': '@myuser'}

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_twitter_resource.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(processor_path, (params, dp, []))

        contents = list(spew_args[1])

        # A single row in the first resource.
        assert len(contents[0]) == 1
        row = list(contents[0])[0]

        # `followers` (and its date) are updated from the api ...
        assert row['date'] == \
            datetime.date.today() - datetime.timedelta(days=1)
        # ... while the rest come from today's stored result.
        assert row['mentions'] == 2
        assert row['interactions'] == 15
        assert row['followers'] == 5
# Exemplo n.º 18 (0)
    def test_add_ga_resource_processor_no_latest(self):
        '''No latest in db, so populate from GA request.'''

        # input arguments used by our mock `ingest`
        datapackage = {
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': []  # nothing here
        }
        params = {
            'domain': {
                'url': 'sub.example.com',
                'viewid': '123456'
            }
        }

        # Path to the processor we want to test
        processor_dir = \
            os.path.dirname(datapackage_pipelines_measure.processors.__file__)
        processor_path = os.path.join(processor_dir, 'add_ga_resource.py')

        # Trigger the processor with our mock `ingest` and capture what it
        # returns to `spew`.
        spew_args, _ = mock_processor_test(processor_path,
                                           (params, datapackage, iter([])))

        spew_res_iter = spew_args[1]

        # one resource
        resources = list(spew_res_iter)
        assert len(resources) == 1

        # rows in resource (`resources` is already a list, index it directly)
        rows = resources[0]
        assert len(rows) == 4
        # first row asserts
        assert rows[0] == {
            'date': dateutil.parser.parse('2017-05-15').date(),
            'page_path': '/',
            'visitors': 1,
            'unique_visitors': 1,
            'avg_time_spent': 2,
            'domain': 'sub.example.com',
            'source': 'ga'
        }
        # last row asserts (negative index is the idiomatic "last element")
        assert rows[-1]['visitors'] == 55
        assert rows[-1]['unique_visitors'] == 89
        assert rows[-1]['avg_time_spent'] == 144
        assert rows[-1]['date'] == dateutil.parser.parse('2017-05-18').date()
    def test_validate_processor_valid_resource_no_report(self):
        '''Don't write report if `write_report` is False.'''

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': [{
                'name': 'valid-no-report',
                'schema': {
                    'fields': [
                        {'name': 'id', 'type': 'integer'},
                        {'name': 'name', 'type': 'string'},
                    ]
                }
            }]
        }

        report_dir = str(self.get_base_path())

        params = {'write_report': False, 'reports_path': report_dir}

        def make_rows():
            yield {'id': 1, 'name': 'english'}
            yield {'id': 2, 'name': 'german'}

        # Locate the processor under test.
        processor_dir = os.path.dirname(
            datapackage_pipelines_goodtables.processors.__file__)
        processor_path = os.path.join(processor_dir, 'validate.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(
            processor_path, (params, dp, iter([make_rows()])))

        spew_dp, spew_res_iter = spew_args[0], spew_args[1]

        # Iterating the rows triggers validation.
        contents = list(spew_res_iter)
        list(contents[0])

        # No `reports` property may be added to the datapackage ...
        assert 'reports' not in spew_dp
        # ... and no report file may be written.
        assert not os.path.isfile('{}/valid-no-report.json'.format(report_dir))
    def test_puts_datapackage_on_s3(self):
        '''Processor uploads datapackage.json and the resource csv to S3 with
        the configured content type.'''
        # Should be in setup but requires mock
        s3 = boto3.resource('s3', endpoint_url=os.environ['S3_ENDPOINT_URL'])
        bucket = s3.Bucket(self.bucket)

        # A list subclass so we can hang a `.spec` attribute off the resource.
        class TempList(list):
            pass

        res = TempList([{
            'Date': datetime.datetime(2001, 2, 3),
            'Name': 'Name'
        }])
        res.spec = self.resources[0]
        res_iter = [res]

        spew_args, _ = mock_processor_test(
            self.processor_path, (self.params, self.datapackage, res_iter))

        spew_res_iter = spew_args[1]
        # We need to actually read the rows to execute the iterator(s)
        rows = [list(res) for res in spew_res_iter]

        keys = [key.key for key in bucket.objects.all()]

        dp_path = 'my/test/path/me/my-datapackage/latest/datapackage.json'
        csv_path = 'my/test/path/me/my-datapackage/latest/data/test.csv'
        assert dp_path in keys
        assert csv_path in keys

        # Check datapackage.json content (parse the body once, not per-assert)
        dpjson = s3.Object(self.bucket, dp_path).get()
        content = json.loads(dpjson['Body'].read().decode("utf-8"))
        self.assertEqual(content['owner'], 'me')
        self.assertEqual(content['name'], 'my-datapackage')
        self.assertEqual(dpjson['ContentType'], self.params['content_type'])

        # Check csv content
        obj = s3.Object(self.bucket, csv_path).get()
        content = obj['Body'].read().decode("utf-8")
        expected_csv = 'Date,Name\r\n2001-02-03,Name\r\n'
        self.assertEqual(content, expected_csv)
        self.assertEqual(obj['ContentType'], self.params['content_type'])
# Exemplo n.º 21 (0)
    def test_add_facebook_resource_processor_page(self, mock_api):
        '''Test facebook processor handles page properties.'''

        mock_api.return_value = my_mock_api

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}
        params = {'project_id': 'my-project', 'entity': 'MyPage'}

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_facebook_resource.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(processor_path, (params, dp, []))

        contents = list(spew_args[1])

        # Exactly one resource with exactly one row.
        assert len(contents) == 1
        assert len(contents[0]) == 1
        row = list(contents[0])[0]
        log.debug(row)
        assert row['followers'] == 16689
        assert row['mentions'] == 13
        assert row['interactions'] == 5
        assert row['impressions'] == 2
        # The row is dated yesterday.
        assert row['date'] == \
            datetime.date.today() - datetime.timedelta(days=1)
    def test_add_ckan_resource_processor(self, mock_request):
        '''A ckan resource is added to the datapackage without streaming.'''
        mock_request.get('https://demo.ckan.org/api/3/action/resource_show',
                         json=MOCK_CKAN_RESPONSE)

        # Arguments our mocked `ingest` supplies to the processor.
        dp = {'name': 'my-datapackage',
              'project': 'my-project',
              'resources': []}
        params = {
            'ckan-host': 'https://demo.ckan.org',
            'resource-id': 'd51c9bd4-8256-4289-bdd7-962f8572efb0'
        }

        # Locate the processor under test.
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_ckan.processors.__file__),
            'add_ckan_resource.py')

        # Run the processor via the mocked `ingest`.
        spew_args, _ = mock_processor_test(processor_path, (params, dp, []))
        spew_dp, spew_res_iter = spew_args[0], spew_args[1]

        # Datapackage asserts: one schema-less csv resource, streamed-from
        # the url reported by ckan.
        resources = spew_dp['resources']
        assert len(resources) == 1
        assert resources[0]['name'] == 'january-2012'
        assert resources[0]['format'] == 'csv'
        assert resources[0]['dpp:streamedFrom'] == \
            MOCK_CKAN_RESPONSE['result']['url']
        assert 'schema' not in resources[0]

        # Resource iterator asserts: nothing is streamed.
        assert len(list(spew_res_iter)) == 0
# Exemplo n.º 23
    def test_add_ga_resource_processor_no_row_returned(self, mock_discovery):
        '''No rows returned from GA response: the processor should still
        spew a single resource, containing no rows.'''

        # A GA report whose 'data' has no 'rows' key at all.
        ga_response = {'reports': [{'data': {}}]}

        mock_discovery \
            .build.return_value \
            .reports.return_value \
            .batchGet.return_value \
            .execute.return_value = ga_response

        # input arguments used by our mock `ingest`
        datapackage = {
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': []
        }
        params = {'domain': {'url': 'sub.example.com', 'viewid': '123456'}}

        # Path to the processor we want to test
        processor_dir = \
            os.path.dirname(datapackage_pipelines_measure.processors.__file__)
        processor_path = os.path.join(processor_dir, 'add_ga_resource.py')

        # Trigger the processor with our mock `ingest` and capture what it
        # would return to `spew`.
        spew_args, _ = \
            mock_processor_test(processor_path,
                                (params, datapackage, iter([])))

        spew_res_iter = spew_args[1]

        # one resource
        resources = list(spew_res_iter)
        assert len(resources) == 1

        # ...and that resource is empty.  (Index the resource directly
        # instead of the original `list(resources)[0]`, which copied the
        # already-materialized outer list for no reason.)
        rows = list(resources[0])
        assert len(rows) == 0
# Exemplo n.º 24
    def test_add_mailchimp_resource_bad_status(self, m):
        '''A non-2xx mailchimp response surfaces its `detail` message as an
        exception when the spewed resource iterator is consumed.'''
        error_msg = 'Hi, there was a problem with your request.'
        # Mock API responses
        m.get('https://dc1.api.mailchimp.com/3.0/lists/123456',
              json={'detail': error_msg}, status_code=401)

        # input arguments used by our mock `ingest`
        datapackage = {
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': []  # nothing here
        }
        params = {
            'list_id': '123456'
        }

        # Path to the processor we want to test
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_mailchimp_resource.py')

        # Trigger the processor with our mock `ingest` and capture what it
        # would return to `spew`.
        spew_args, _ = mock_processor_test(processor_path,
                                           (params, datapackage, iter([])))

        # Consuming the resource iterator is what raises the API error.
        with self.assertRaises(Exception) as cm:
            list(spew_args[1])

        self.assertEqual(str(cm.exception), error_msg)
    def test_index(self):
        '''Rows pushed through the processor are indexed into the 'dummy'
        Elasticsearch index, with datetimes serialised to ISO strings.'''
        # Should be in setup but requires mock
        class TempList(list):
            pass

        resource = TempList([{
            'Date': datetime.datetime(2001, 2, 3),
            'Name': 'Name'
        }])
        resource.spec = self.resources[0]

        spew_args = mock_processor_test(
            self.processor_path,
            (self.params, self.datapackage, [resource]))

        # We need to actually read the rows to execute the iterator(s)
        for res in spew_args[0][1]:
            list(res)

        Elasticsearch().indices.flush()
        hits = Elasticsearch().search(index='dummy')
        indexed = [hit['_source'] for hit in hits['hits']['hits']]

        assert indexed == [{'Date': '2001-02-03T00:00:00', 'Name': 'Name'}]
# Exemplo n.º 26
    def test_add_rubygems_resource_processor_latest_yesterday(
            self, mock_request):
        '''Latest was yesterday. Get today's data, and add `downloads`.'''
        # mock the rubygems response
        mock_request.get(
            'https://rubygems.org/api/v1/gems/mygem.json',
            json={
                'name': 'mygem',
                'downloads': 271,
                'version': '0.3.1',
                'version_downloads': 170
            })

        # input arguments used by our mock `ingest`
        datapackage = {
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': [{
                'name': 'latest-project-entries',
                'schema': {
                    'fields': [
                        {'name': 'source', 'type': 'string'},
                        {'name': 'date', 'type': 'date'},
                        {'name': 'package', 'type': 'string'},
                        {'name': 'downloads', 'type': 'int'},
                        {'name': 'total_downloads', 'type': 'int'}
                    ]
                }
            }]
        }
        params = {'gem_id': 'mygem'}

        # The most recent stored entry is from yesterday.
        def latest_entries_res():
            yield {
                'date': datetime.date.today() - datetime.timedelta(days=1),
                'downloads': 7,
                'total_downloads': 265,
                'package': 'mygem',
                'source': 'rubygems'
            }

        # Path to the processor we want to test
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_rubygems_resource.py')

        # Trigger the processor with our mock `ingest` and capture what it
        # would return to `spew`.
        spew_args, _ = mock_processor_test(
            processor_path,
            (params, datapackage, iter([latest_entries_res()])))

        spew_dp = spew_args[0]
        spew_res_iter = spew_args[1]

        # Datapackage assertions: the passthrough resource and the new gem
        # resource are both present, with field order unchanged.
        dp_resources = spew_dp['resources']
        assert len(dp_resources) == 2
        assert dp_resources[0]['name'] == 'latest-project-entries'
        assert dp_resources[1]['name'] == 'mygem'
        field_names = \
            [field['name'] for field in dp_resources[0]['schema']['fields']]
        assert field_names == [
            'source', 'date', 'package', 'downloads', 'total_downloads'
        ]

        # Resource-iterator assertions: today's row carries the delta since
        # yesterday (271 - 265 = 6) and the new running total.
        spewed = list(spew_res_iter)
        assert len(spewed) == 2
        assert list(spewed[1]) == [{
            'package': 'mygem',
            'source': 'rubygems',
            'downloads': 6,
            'total_downloads': 271,
            'date': datetime.date.today()
        }]
# Exemplo n.º 27
    def test_add_twitter_resource_processor_url(self, mock_api,
                                                mock_auth, mock_cursor):
        '''Test twitter processor handles url entities (url:<term>).'''
        # mock the twitter api response
        mock_auth.return_value = 'authed'
        mock_api.return_value = my_mock_api
        yesterday = \
            datetime.datetime.now() - datetime.timedelta(days=1)
        # Three matching statuses from yesterday; the expected row below has
        # mentions == 3 and interactions == 20, matching these mock values.
        mock_cursor.return_value.items.side_effect = [
            get_cursor_items_iter([
                Status('okfnlabs', 1, 5, yesterday),
                Status('anonymous', 3, 0, yesterday),
                Status('anonymous', 3, 8, yesterday)
            ])
        ]

        # input arguments used by our mock `ingest`
        datapackage = {
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': []
        }
        params = {
            'entity': 'url:example.com',
            'project_id': 'my-project'
        }

        # Path to the processor we want to test
        processor_path = os.path.join(
            os.path.dirname(datapackage_pipelines_measure.processors.__file__),
            'add_twitter_resource.py')

        # Trigger the processor with our mock `ingest` and capture what it
        # would return to `spew`.
        spew_args, _ = \
            mock_processor_test(processor_path,
                                (params, datapackage, []))

        spew_dp = spew_args[0]
        spew_res_iter = spew_args[1]

        # Datapackage assertions: one resource, slugified from the entity.
        dp_resources = spew_dp['resources']
        assert len(dp_resources) == 1
        assert dp_resources[0]['name'] == 'url-example-com'
        field_names = \
            [field['name'] for field in dp_resources[0]['schema']['fields']]
        assert field_names == ['entity', 'entity_type',
                               'source', 'date', 'mentions',
                               'interactions', 'followers']

        # Resource-iterator assertions: a single row dated yesterday, with
        # no follower count for a url entity.
        spewed = list(spew_res_iter)
        assert len(spewed[0]) == 1

        first_row = spewed[0][0]
        assert first_row == {
            'entity': 'url:example.com',
            'entity_type': 'url',
            'source': 'twitter',
            'followers': None,
            'mentions': 3,
            'interactions': 20,
            'date': datetime.date.today() - datetime.timedelta(days=1)
        }
# Exemplo n.º 28
    def test_add_pypi_resource_processor_no_latest(self, mock_discovery):
        '''No latest in db, so populate from big query request.

        With no existing resources, the processor spews a single resource
        containing one row per mocked BigQuery result row.
        '''

        # Mocked BigQuery result: each row's 'f' holds (package, date,
        # downloads) as BigQuery cell values.
        bq_response = {
            'rows': [
                {'f': [{'v': 'my_package'}, {'v': '2017-05-14'}, {'v': '6'}]},
                {'f': [{'v': 'my_package'}, {'v': '2017-05-15'}, {'v': '12'}]},
                {'f': [{'v': 'my_package'}, {'v': '2017-05-16'}, {'v': '24'}]},
            ]
        }

        mock_discovery \
            .build.return_value \
            .jobs.return_value \
            .query.return_value \
            .execute.return_value = bq_response

        # input arguments used by our mock `ingest`
        datapackage = {
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': []  # nothing here
        }
        params = {
            'name': 'hello',
            'package': 'my_package',
            'project_id': 'my-project'
        }

        # Path to the processor we want to test
        processor_dir = \
            os.path.dirname(datapackage_pipelines_measure.processors.__file__)
        processor_path = os.path.join(processor_dir, 'add_pypi_resource.py')

        # Trigger the processor with our mock `ingest` and capture what it
        # would return to `spew`.
        spew_args, _ = \
            mock_processor_test(processor_path,
                                (params, datapackage, iter([])))

        spew_res_iter = spew_args[1]

        # one resource
        resources = list(spew_res_iter)
        assert len(resources) == 1

        # rows in resource (index the resource directly rather than the
        # original redundant `list(resources)[0]` copy of the outer list)
        rows = list(resources[0])
        assert len(rows) == 3
        # first row asserts
        assert rows[0] == {
            'date': dateutil.parser.parse('2017-05-14').date(),
            'downloads': 6,
            'package': 'my_package',
            'source': 'pypi'
        }
        # last row asserts (negative indexing instead of rows[len(rows)-1])
        assert rows[-1]['downloads'] == 24
        assert rows[-1]['date'] == dateutil.parser.parse('2017-05-16').date()
# Exemplo n.º 29
    def test_add_pypi_resource_processor_latest_week_old(self, mock_discovery):
        '''Latest in db is a week old, so fetch new data.

        The existing `latest-project-entries` resource is passed through
        unchanged and a second resource is appended with one row per mocked
        BigQuery result row.
        '''

        # Mocked BigQuery result: each row's 'f' holds (package, date,
        # downloads) as BigQuery cell values.
        bq_response = {
            'rows': [
                {'f': [{'v': 'my_package'}, {'v': '2017-05-14'}, {'v': '6'}]},
                {'f': [{'v': 'my_package'}, {'v': '2017-05-15'}, {'v': '12'}]},
                {'f': [{'v': 'my_package'}, {'v': '2017-05-16'}, {'v': '24'}]},
            ]
        }

        mock_discovery \
            .build.return_value \
            .jobs.return_value \
            .query.return_value \
            .execute.return_value = bq_response

        # input arguments used by our mock `ingest`
        datapackage = {
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': [{
                'name': 'latest-project-entries',
                'schema': {
                    'fields': [
                        {'name': 'date', 'type': 'date'},
                        {'name': 'downloads', 'type': 'int'},
                        {'name': 'package', 'type': 'string'},
                        {'name': 'source', 'type': 'string'},
                    ]
                }
            }]
        }
        params = {
            'name': 'hello',
            'package': 'my_package',
            'project_id': 'my-project'
        }

        # The stored "latest" entry predates the mocked BigQuery rows.
        def latest_entries_res():
            yield {
                'date': dateutil.parser.parse('2017-05-13').date(),
                'downloads': 3,
                'package': 'my_package',
                'source': 'pypi'
            }

        # Path to the processor we want to test
        processor_dir = \
            os.path.dirname(datapackage_pipelines_measure.processors.__file__)
        processor_path = os.path.join(processor_dir, 'add_pypi_resource.py')

        # Trigger the processor with our mock `ingest` and capture what it
        # would return to `spew`.
        spew_args, _ = \
            mock_processor_test(processor_path,
                                (params, datapackage,
                                 iter([latest_entries_res()])))

        spew_res_iter = spew_args[1]

        # two resources
        resources = list(spew_res_iter)
        assert len(resources) == 2

        # first resource will look like latest_entries_res
        assert list(resources[0])[0] == next(latest_entries_res())

        # rows in the second (newly added) resource (index the resource
        # directly rather than the original redundant `list(resources)[1]`)
        rows = list(resources[1])
        assert len(rows) == 3
        # first row asserts
        assert rows[0] == {
            'date': dateutil.parser.parse('2017-05-14').date(),
            'downloads': 6,
            'package': 'my_package',
            'source': 'pypi'
        }
        # last row asserts (negative indexing instead of rows[len(rows)-1])
        assert rows[-1]['downloads'] == 24
        assert rows[-1]['date'] == dateutil.parser.parse('2017-05-16').date()
# Exemplo n.º 30
    def test_add_npm_resource_processor_empty_latest(self, mock_request):
        '''latest-project-entries is present, but empty.

        The mocked npm registry reports the package was created `day_range`
        days ago, so the processor backfills one downloads row per elapsed
        day, oldest first.
        '''

        day_range = 5
        now = datetime.datetime.now()
        # package created five days ago
        created = now - datetime.timedelta(days=day_range)
        created = created.strftime("%Y-%m-%d")
        mock_registry = {'time': {'created': created}}
        # One downloads-API response per day, oldest day first; older days
        # report higher download counts (downloads == days-ago) so ordering
        # is observable in the spewed rows.
        mock_api_responses = []
        for day in reversed(range(1, day_range + 1)):
            start = now - datetime.timedelta(days=day)
            start = start.strftime("%Y-%m-%d")
            mock_api_responses.append({
                'json': {
                    'downloads': day,
                    'start': start,
                    'end': start,
                    'package': 'my_package'
                },
                'status_code': 200
            })
        mock_request.get('https://registry.npmjs.org/my_package',
                         json=mock_registry)
        matcher = re.compile('api.npmjs.org/downloads/point/')
        mock_request.get(matcher, mock_api_responses)

        # input arguments used by our mock `ingest`
        datapackage = {
            'name': 'my-datapackage',
            'project': 'my-project',
            'resources': [{
                'name': 'latest-project-entries',
                'schema': {
                    'fields': [
                        {'name': 'date', 'type': 'date'},
                        {'name': 'downloads', 'type': 'int'},
                        {'name': 'package', 'type': 'string'},
                        {'name': 'source', 'type': 'string'},
                    ]
                }
            }]
        }
        params = {
            'name': 'hello',
            'package': 'my_package',
            'project_id': 'my-project'
        }

        # Path to the processor we want to test
        processor_dir = \
            os.path.dirname(datapackage_pipelines_measure.processors.__file__)
        processor_path = os.path.join(processor_dir, 'add_npm_resource.py')

        # Trigger the processor with our mock `ingest` and capture what it
        # would return to `spew`.
        spew_args, _ = \
            mock_processor_test(processor_path,
                                (params, datapackage, iter([{}])))

        spew_res_iter = spew_args[1]

        # two resources
        resources = list(spew_res_iter)
        assert len(resources) == 2

        # No rows in first resource
        assert len(list(resources[0])) == 0

        # rows in second resource (index the resource directly rather than
        # the original redundant `list(resources)[1]` copy of the outer list)
        rows = list(resources[1])
        assert len(rows) == 5
        # oldest backfilled day first: 5 days ago, 5 downloads
        assert rows[0] == {
            'date': datetime.date.today() - datetime.timedelta(days=5),
            'downloads': 5,
            'package': 'my_package',
            'source': 'npm'
        }
        # newest day last (negative indexing instead of rows[len(rows)-1])
        assert rows[-1]['downloads'] == 1
        assert rows[-1]['date'] == \
            datetime.date.today() - datetime.timedelta(days=1)