예제 #1
0
    def testWebTextExtractor(self):
        """Check that get_web_text returns page text and rejects a bad URL."""
        # NOTE(review): this requests http://google.com, so it presumably
        # needs network access to pass -- confirm before running in CI.
        text = get_web_text('http://google.com')
        # assertIn shows both the needle and the haystack on failure, whereas
        # assertTrue(x in y) only reports "False is not true".
        self.assertIn('google', text)

        # Bad url should raise an exception
        with self.assertRaises(subprocess.CalledProcessError):
            get_web_text('weeeeeeeeeeeeeeeeeeeeeee')
예제 #2
0
    def testWebTextExtractor(self):
        """Verify get_web_text output for a real URL and its failure mode."""
        # NOTE(review): the first call targets http://google.com and so
        # presumably depends on network availability -- confirm.
        page_text = get_web_text('http://google.com')
        # Use assertIn so a failure message includes the text that was
        # actually returned instead of a bare "False is not true".
        self.assertIn('google', page_text)

        # Bad url should raise an exception
        with self.assertRaises(subprocess.CalledProcessError):
            get_web_text('weeeeeeeeeeeeeeeeeeeeeee')
예제 #3
0
def web_content_extraction(sample_id, url=None, *args, **kwargs):
    """Fetch the text of a sample's web page and store it on the sample.

    Links/lynx required (per the original note); the fetching itself is
    delegated to ``get_web_text``.

    Args:
        sample_id: id of the ``Sample`` row whose ``text`` is updated.
        url: address to fetch. When ``None``, the sample's own ``url`` is
            used instead.
        *args: accepted for call compatibility and ignored.
        **kwargs: accepted for call compatibility and ignored.

    Returns:
        ``False`` when ``is_proper_url`` rejects the URL or when
        ``get_web_text`` raises ``subprocess.CalledProcessError``.
        ``None`` (implicitly) after the text has been stored.

    Side effects:
        Sends ``EventSampleContentDone`` after a successful update, or
        ``EventSampleContentFail`` (with the process return code) on failure.
        No event is sent when the URL is rejected.
    """
    # Fetch the sample at most once: reuse the row loaded for the URL lookup
    # instead of querying for it a second time.
    sample = None
    if url is None:
        sample = Sample.objects.get(id=sample_id)
        url = sample.url

    if not is_proper_url(url):
        return False

    if sample is None:
        sample = Sample.objects.get(id=sample_id)

    # Only the external fetch is guarded; the handler exists for failures of
    # the underlying browser process, not for the database update or events.
    try:
        text = get_web_text(url)
    except subprocess.CalledProcessError as e:
        # Something wrong has happened to links. Couldn't find documentation on
        # error codes - assume bad stuff has happened that retrying won't fix.
        send_event(
            'EventSampleContentFail',
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
            error_code=e.returncode
        )
        return False

    Sample.objects.filter(id=sample_id).update(text=text)
    # The event carries the sample's stored URL, which may differ from an
    # explicitly passed ``url`` argument.
    send_event(
        "EventSampleContentDone",
        sample_id=sample_id,
        sample_url=sample.url,
        job_id=sample.job_id,
    )