def testWebTextExtractor(self):
    """Check get_web_text on a known-good URL and on a malformed one.

    NOTE(review): the first call targets a live site, so this test
    presumably needs network access and the external text browser used by
    get_web_text -- confirm before running it in an isolated CI.
    """
    text = get_web_text('http://google.com')
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    self.assertIn('google', text)

    # Bad url should raise an exception
    with self.assertRaises(subprocess.CalledProcessError):
        get_web_text('weeeeeeeeeeeeeeeeeeeeeee')
def web_content_extraction(sample_id, url=None, *args, **kwargs):
    """
    Links/lynx required. Generates html output from those browsers.

    Extracts the text of a sample's web page, stores it on the Sample row
    and emits an event describing the outcome.

    :param sample_id: id of the Sample whose content should be extracted.
    :param url: URL to fetch; when None, the URL stored on the Sample is
        used instead.
    :param args: accepted but unused by this function.
    :param kwargs: accepted but unused by this function.
    :returns: False when the URL is rejected by ``is_proper_url`` or when
        the extraction subprocess fails.

    On success ``EventSampleContentDone`` is sent; on a subprocess failure
    ``EventSampleContentFail`` is sent with the process return code.
    """
    sample = None
    if url is None:
        # Keep the fetched instance so it is not queried a second time below.
        sample = Sample.objects.get(id=sample_id)
        url = sample.url

    if not is_proper_url(url):
        return False

    if sample is None:
        # An explicit url was supplied; the sample is still needed for the
        # event payloads (its own url and job id).
        sample = Sample.objects.get(id=sample_id)

    try:
        text = get_web_text(url)
        Sample.objects.filter(id=sample_id).update(text=text)
        send_event(
            "EventSampleContentDone",
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
        )
    except subprocess.CalledProcessError as err:
        # Something wrong has happened to links. Couldn't find documentation
        # on error codes - assume bad stuff has happened that retrying won't
        # fix.
        send_event(
            'EventSampleContentFail',
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
            error_code=err.returncode,
        )
        return False