def handle(self, *args, **options):
    """Print the scraped transcript text for each event named in ``args``.

    Each ``arg`` is one of:
      * a numeric ``Event`` primary key,
      * a full URL whose first path segment is the event slug, or
      * a bare event slug.
    Raises ``CommandError`` when no arguments are given.
    """
    if not args:
        # No events given: self.args is the command's usage string.
        raise CommandError(self.args)
    for arg in args:
        if arg.isdigit():
            event = Event.objects.get(pk=arg)
        else:
            if '://' in arg:
                # Full URL: the slug is the first path segment.
                # NOTE(review): assumes the URL path has at least one
                # segment after '/' — an empty path would raise IndexError.
                slug = urlparse(arg).path.split('/')[1]
            else:
                slug = arg
            event = Event.objects.get(slug=slug)
        # Python 2 print statement (file predates Python 3).
        print scrape_urls(get_urls(event.additional_links))['text']
def test_get_content_intranet(self, rget):
    """Intranet pages are fetched with the basic-auth credentials
    configured for their domain in SCRAPE_CREDENTIALS."""
    def fake_get(url, **options):
        # the scraper must send the credentials configured for this domain
        assert options['auth'] == ('foo', 'bar')
        return Response(SAMPLE_INTRANET_HTML)
    rget.side_effect = fake_get

    url = 'https://intranet.mozilla.org/path'
    credentials = {('foo', 'bar'): ['intranet.mozilla.org']}
    with self.settings(SCRAPE_CREDENTIALS=credentials):
        content, status = scraper.get_content_intranet(url)
        eq_(status, 200)
        eq_(content, 'H2 Title\nTest Content')
        # or use the scrape_url()
        result = scraper.scrape_urls([url])
        eq_(result['text'], 'H2 Title\nTest Content')
        expected = {'worked': True, 'status': 200, 'url': url}
        eq_(result['results'][0], expected)

    # without credentials the scraper refuses with an explanatory status
    with self.settings(SCRAPE_CREDENTIALS={}):
        content, status = scraper.get_content_intranet(url)
        eq_(status, 'No credentials set up for intranet.mozilla.org')
        eq_(content, None)
def test_get_content_readability(self, rget):
    """The Readability parser API is queried when an API key is configured."""
    def fake_get(url):
        # the configured API key must be part of the request URL
        assert 'abc123' in url
        return Response({'content': '<p>Test content</p>'})
    rget.side_effect = fake_get

    url = 'http://doesnotexist/path'
    with self.settings(READABILITY_PARSER_KEY='abc123'):
        content, status = scraper.get_content_readability(url)
        eq_(content, 'Test content')
        eq_(status, 200)
        # or use the scrape_url()
        result = scraper.scrape_urls([url])
        eq_(result['text'], 'Test content')
        expected = {'worked': True, 'status': 200, 'url': url}
        eq_(result['results'][0], expected)

    # no key configured -> no scraping, explanatory status instead
    with self.settings(READABILITY_PARSER_KEY=None):
        content, status = scraper.get_content_readability(url)
        eq_(content, None)
        eq_(status, 'No READABILITY_PARSER_KEY setting set up')
def test_get_content_readability(self, rget):
    """Readability content extraction works end-to-end when a key is set."""
    def mocked_get(url):
        # the API key must appear in the requested URL
        assert 'abc123' in url
        payload = {'content': '<p>Test content</p>'}
        return Response(payload)
    rget.side_effect = mocked_get

    url = 'http://doesnotexist/path'
    with self.settings(READABILITY_PARSER_KEY='abc123'):
        content, status = scraper.get_content_readability(url)
        eq_(content, 'Test content')
        eq_(status, 200)
        # or use the scrape_url()
        result = scraper.scrape_urls([url])
        eq_(result['text'], 'Test content')
        eq_(result['results'][0], {
            'worked': True,
            'status': 200,
            'url': url,
        })

    # without a key the function declines with an explanatory status
    with self.settings(READABILITY_PARSER_KEY=None):
        content, status = scraper.get_content_readability(url)
        eq_(content, None)
        eq_(status, 'No READABILITY_PARSER_KEY setting set up')
def test_get_content_etherpad(self, rget):
    """Etherpad pads are fetched via the pad's text-export URL."""
    def fake_get(url, **options):
        expected_url = (
            'https://etherpad.mozilla.org/ep/pad/export/foo-bar/latest?'
            'format=txt'
        )
        eq_(url, expected_url)
        return Response('Content here')
    rget.side_effect = fake_get

    url = 'http://etherpad.mozilla.org/foo-bar'
    content, status = scraper.get_content_etherpad(url)
    eq_(status, 200)
    eq_(content, 'Content here')
    # or use the scrape_url()
    result = scraper.scrape_urls([url])
    eq_(result['text'], 'Content here')
    eq_(result['results'][0], {'worked': True, 'status': 200, 'url': url})
def test_get_content_readability(self, mocked_parser_client):
    """The readability parser client extracts article content when a
    READABILITY_PARSER_KEY is configured."""
    fake_parser = mock.Mock()

    def fake_get_article_content(url):
        return _Parsed('<p>Test content</p>')

    fake_parser.get_article_content = fake_get_article_content
    mocked_parser_client.return_value = fake_parser

    url = 'http://doesnotexist/path'
    with self.settings(READABILITY_PARSER_KEY='abc123'):
        content, status = scraper.get_content_readability(url)
        eq_(content, 'Test content')
        eq_(status, 200)
        # or use the scrape_url()
        result = scraper.scrape_urls([url])
        eq_(result['text'], 'Test content')
        expected = {'worked': True, 'status': 200, 'url': url}
        eq_(result['results'][0], expected)

    # no key configured -> scraping is skipped with an explanatory status
    with self.settings(READABILITY_PARSER_KEY=None):
        content, status = scraper.get_content_readability(url)
        eq_(content, None)
        eq_(status, 'No READABILITY_PARSER_KEY setting set up')
def event_transcript(request, id):
    """Edit an event's transcript, optionally pre-filling the form by
    scraping the URLs passed in ``?urls=``."""
    event = get_object_or_404(Event, id=id)
    context = {}
    from airmozilla.manage.scraper import get_urls, scrape_urls
    scrapeable_urls = list(get_urls(event.additional_links))

    if request.method == 'POST':
        form = forms.EventTranscriptForm(instance=event, data=request.POST)
        if form.is_valid():
            form.save()
            messages.success(request, 'Event transcript saved.')
            return redirect('manage:event_edit', event.pk)
        # invalid form falls through and is re-rendered with its errors
    else:
        initial = {}
        requested_urls = request.GET.getlist('urls')
        if requested_urls:
            response = scrape_urls(requested_urls)
            if response['text']:
                initial['transcript'] = response['text']
            # collect the URLs that failed so the user can be warned
            failures = [
                '%s: %s' % (result['url'], result['status'])
                for result in response['results']
                if not result['worked']
            ]
            if failures:
                failures.insert(
                    0, 'Some things could not be scraped correctly'
                )
                messages.error(request, '\n'.join(failures))
        form = forms.EventTranscriptForm(instance=event, initial=initial)

    amara_videos = AmaraVideo.objects.filter(event=event)
    context['event'] = event
    context['amara_videos'] = amara_videos
    context['form'] = form
    context['scrapeable_urls'] = scrapeable_urls
    return render(request, 'manage/event_transcript.html', context)
def test_get_content_etherpad(self, rget):
    """Etherpad pads are downloaded through the text-export endpoint."""
    def fake_get(url, **options):
        eq_(
            url,
            'https://etherpad.mozilla.org/ep/pad/export/foo-bar/latest?'
            'format=txt'
        )
        return Response('Content here')
    rget.side_effect = fake_get

    pad_url = 'http://etherpad.mozilla.org/foo-bar'
    content, status = scraper.get_content_etherpad(pad_url)
    eq_(status, 200)
    eq_(content, 'Content here')
    # or use the scrape_url()
    result = scraper.scrape_urls([pad_url])
    eq_(result['text'], 'Content here')
    expected = {
        'worked': True,
        'status': 200,
        'url': pad_url,
    }
    eq_(result['results'][0], expected)