Example #1
 def test_full_url(self):
     print('')
     gen = webpage2html.generate('./another_dir/test_full_url.html',
                                 comment=False,
                                 full_url=True)
     assert 'href="another_dir/questions/110240"' in gen or 'href="./another_dir/questions/110240"' in gen, gen
     assert 'href="another_dir/static/img/favicon.ico"' in gen or 'href="./another_dir/static/img/favicon.ico"' in gen, gen
Example #2
 def test_web_font(self):
     print('')
     gen = webpage2html.generate('./webfont.html',
                                 comment=False,
                                 full_url=True)
     # FIXME: do not cover all web fonts with hash postfix
     assert 'application/x-font-ttf' in gen, gen
Example #3
def physicsaviary_download(laboratory_id):
    laboratories, identifiers = get_laboratories()
    lab_data = identifiers.get(laboratory_id)
    if not lab_data:
        return "Not found", 404

    link = lab_data['link']
    generated = webpage2html.generate(index=link, keep_script=True, verbose=False)
    return generated.encode()
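A minimal standalone sketch of the same call, with a placeholder URL that is not part of the example above: generate() returns the finished page as a single text string with external resources inlined, which the caller can encode or write out as needed.

import webpage2html

# Placeholder URL; keep_script=True keeps <script> tags and verbose=False
# silences progress output, matching the flags used in the example above.
page = webpage2html.generate(index='https://example.com/lab', keep_script=True, verbose=False)
payload = page.encode()  # bytes, e.g. for an HTTP response body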
Example #4
def saveHTML(url, img=True, prefix=''):
    if img:
        text = generate(url)
    else:
        response = requests.get(url)
        text = response.text
    doc = Document(text)
    content = fixHeader(doc.summary())

    filename = os.path.join(prefix, doc.title() + '.html')
    with open(filename, 'w') as f:
        f.write(content)
    return filename
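A usage sketch for the saveHTML helper above, assuming the surrounding module already imports requests, os, readability's Document, and webpage2html's generate, and defines the project-local fixHeader; the URL and directory are placeholders, and the prefix directory must exist beforehand since the helper only joins it into the output path.

saved_path = saveHTML('https://example.com/article', img=True, prefix='articles')
print('Saved readable copy to', saved_path)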
Example #5
 def local_test(self, index):
     print('')
     gen = webpage2html.generate(index, comment=False).encode('utf8').strip()
     ans = open(index[:-5] + '_single.html', 'rb').read().strip()
     gl = len(gen)
     al = len(ans)
     begin = 0
     while begin < gl and begin < al and ans[begin] == gen[begin]:
         begin += 1
     end = -1
     while end + gl > 0 and end + al > 0 and ans[end] == gen[end]:
         end -= 1
     self.assertEqual(gen, ans, 'Test Fail for %s, begin = %d, end = %d, ans len = %d, gen len = %d, ans = %s\ngen = %s\n' % (index, begin, end, al, gl, repr(ans[begin: end]), repr(gen[begin: end])))
Example #6
def do_classification_job(classification_job_id):
    """Given a classification job object, download the
    page in the background and classify the content."""
    classification_job = ClassificationJob.objects.get(
        id=classification_job_id)
    url = classification_job.classified_page.url
    html_content = classification_job.classified_page.content  # load the content, may be None
    classifier = pickle.loads(
        classification_job.classifier_used.serialized)  # load classifier

    try:
        if html_content is None:
            # download page first if not in DB
            logger.info('Downloading webpage')
            html_content = webpage2html.generate(url)

            # update the downloaded content
            # we at least want to keep the HTML
            with transaction.atomic():
                # save the content
                logger.info('Webpage downloaded')
                classification_job.classified_page.content = html_content
                classification_job.classified_page.save()
        else:
            logger.info('Webpage in DB. Skipping download')

        # try to classify the html content
        extractor = HTMLExtractor(classifier)  # get the extractor
        paths = extractor.extract_from_html(html_content)

        # create a list to bulk create later
        result_list = [
            ClassificationResult(job=classification_job, xpath=path)
            for path in paths
        ]

        with transaction.atomic():
            # save the classification result
            ClassificationResult.objects.bulk_create(result_list)

            # and specify success if it reaches this point
            # we want to either set it all as a success or none
            classification_job.is_failed = False
            classification_job.date_ended = timezone.now()
            classification_job.save()
    except Exception:
        # end the job as a failure
        classification_job.is_failed = True
        classification_job.date_ended = timezone.now()
        classification_job.save()
Example #7
 def test_none(self):
     print('')
     self.assertEqual(webpage2html.generate('non-existing-file.html', comment=False, verbose=False), '')
Example #8
 def test_web_font(self):
     print('')
     gen = webpage2html.generate('./webfont.html', comment=False, full_url=True)
     # FIXME: do not cover all web fonts with hash postfix
     assert 'application/x-font-ttf' in gen, gen
Example #9
 def test_full_url(self):
     print('')
     gen = webpage2html.generate('./another_dir/test_full_url.html', comment=False, full_url=True)
     assert 'href="another_dir/questions/110240"' in gen, gen
     assert 'href="another_dir/static/img/favicon.ico"' in gen, gen
Example #10
 def test_no_script(self):
     print('')
     gen = webpage2html.generate('./test_no_script.html', comment=False, keep_script=False)
     assert '<script' not in gen, gen
Example #11
 def test_0ops(self):
     print('')
     gen = webpage2html.generate('./hacklu-ctf-2013-exp400-wannable-0ops.html', comment=False)
     assert '<style data-href="./hacklu-ctf-2013-exp400-wannable-0ops_files/screen.css" type="text/css">html,body,div,span,applet,object,iframe,h1,h2,h3,h4,h5,h6,p,blockquote,pre,a,abbr,acronym,address,big,cite,code,del,dfn,em,img,ins,kbd,q,s,samp,small,strike,strong,sub,sup,tt' in gen
Example #12
 def test_no_script(self):
     print('')
     gen = webpage2html.generate('./test_no_script.html',
                                 comment=False,
                                 keep_script=False)
     assert '<script' not in gen, gen
Example #13
 def test_requests_page(self):
     print('')
     gen = webpage2html.generate('./test_requests_page.html', comment=False, full_url=True)
     assert '<div id="searchbox" role="search" style="display: none">' in gen
Example #14
 def test_text_css(self):
     print('')
     gen = webpage2html.generate('./text_css.html', comment=False, full_url=True)
     assert '<style data-href="./text_css/style.css" rel="stylesheet" type="text/css">@import url(data:text/css;base64,Cmh' in gen
Example #15
import pickle
from learnhtml.extractor import HTMLExtractor
import webpage2html
from lxml import etree

with open('models/model-recipe.pkl', 'rb') as f:
    model = pickle.load(f)

extractor = HTMLExtractor(model)

url = 'https://trinesmatblogg.no/recipe/kremet-kyllingform-med-paprika'
html = webpage2html.generate(
    url, verbose=False
)  # webpage2html downloads all dependencies of the page as well
paths = extractor.extract_from_html(html)

print("extracted paths", paths)

root = etree.HTML(html.encode('utf-8'))

extracted_html = []
for path in paths:
    elements = root.xpath(path)
    for element in elements:
        extracted_html.append(
            etree.tostring(element, encoding='unicode', pretty_print=True))

with open('test.html', 'w') as f:
    for item in extracted_html:
        f.write("%s\n" % item)
Example #16
 def test_pre_formatting(self):
     print('')
     gen = webpage2html.generate('./test_pre_formatting.html',
                                 comment=False)
     assert '<pre><code>$ git clone https://github.com/chaitin/sqlchop</code></pre>' in gen
Example #17
 def test_0ops(self):
     print('')
     gen = webpage2html.generate(
         './hacklu-ctf-2013-exp400-wannable-0ops.html', comment=False)
     assert '<style data-href="./hacklu-ctf-2013-exp400-wannable-0ops_files/screen.css" type="text/css">html,body,div,span,applet,object,iframe,h1,h2,h3,h4,h5,h6,p,blockquote,pre,a,abbr,acronym,address,big,cite,code,del,dfn,em,img,ins,kbd,q,s,samp,small,strike,strong,sub,sup,tt' in gen
Example #18
 def test_link_media(self):
     print('')
     gen = webpage2html.generate('./test_css_screen.html',
                                 comment=False,
                                 full_url=True)
     assert 'media="screen"' in gen
Example #19
 def test_none(self):
     print('')
     self.assertEqual(
         webpage2html.generate('non-existing-file.html',
                               comment=False,
                               verbose=False), '')
Example #20
 def test_pre_formatting(self):
     print('')
     gen = webpage2html.generate('./test_pre_formatting.html', comment=False)
     assert '<pre><code>$ git clone https://github.com/chaitin/sqlchop</code></pre>' in gen