def test_full_url(self):
    """Inlined page keeps its relative links under another_dir (with or without a leading './')."""
    # print('') works on both Python 2 and 3, unlike the bare print statement.
    print('')
    gen = webpage2html.generate('./another_dir/test_full_url.html', comment=False, full_url=True)
    assert 'href="another_dir/questions/110240"' in gen or 'href="./another_dir/questions/110240"' in gen, gen
    assert 'href="another_dir/static/img/favicon.ico"' in gen or 'href="./another_dir/static/img/favicon.ico"' in gen, gen
def test_web_font(self):
    """Web fonts referenced by webfont.html are inlined as x-font-ttf data."""
    # print('') works on both Python 2 and 3, unlike the bare print statement.
    print('')
    gen = webpage2html.generate('./webfont.html', comment=False, full_url=True)
    # FIXME: do not cover all web fonts with hash postfix
    assert 'application/x-font-ttf' in gen, gen
def physicsaviary_download(laboratory_id):
    """Look up a laboratory by id and return its page inlined as a single HTML blob.

    Returns the tuple ("Not found", 404) when the id is unknown.
    """
    # Only the identifier index is needed here; the laboratories list is unused.
    _, identifiers = get_laboratories()
    lab_data = identifiers.get(laboratory_id)
    if not lab_data:
        return "Not found", 404
    link = lab_data['link']
    # keep_script=True so interactive lab pages stay functional.
    generated = webpage2html.generate(index=link, keep_script=True, verbose=False)
    return generated.encode()
def saveHTML(url, img=True, prefix=''):
    """Fetch *url*, extract the readable article and write it to an HTML file.

    When *img* is true the page is downloaded with its assets inlined via
    generate(); otherwise only the raw HTML is fetched. The output file is
    named after the document title and placed under *prefix*.

    Returns the path of the written file.
    """
    if img:
        raw_html = generate(url)
    else:
        raw_html = requests.get(url).text
    document = Document(raw_html)
    article = fixHeader(document.summary())
    out_path = os.path.join(prefix, document.title() + '.html')
    with open(out_path, 'w') as out_file:
        out_file.write(article)
    return out_path
def local_test(self, index):
    """Generate a single-file page from *index* and compare it byte-for-byte
    against the pre-built ``<name>_single.html`` answer file.

    On mismatch, report the first and last differing offsets plus the
    differing slices so the failure is easy to localize.
    """
    print('')
    gen = webpage2html.generate(index, comment=False).encode('utf8').strip()
    # Use a context manager so the answer file handle is not leaked.
    with open(index[:-5] + '_single.html', 'rb') as answer_file:
        ans = answer_file.read().strip()
    gl = len(gen)
    al = len(ans)
    # Scan forward for the first differing byte ...
    begin = 0
    while begin < gl and begin < al and ans[begin] == gen[begin]:
        begin += 1
    # ... and backward for the last differing byte.
    end = -1
    while end + gl > 0 and end + al > 0 and ans[end] == gen[end]:
        end -= 1
    self.assertEqual(gen, ans, 'Test Fail for %s, begin = %d, end = %d, ans len = %d, gen len = %d, ans = %s\ngen = %s\n' % (
        index, begin, end, al, gl, repr(ans[begin: end]), repr(gen[begin: end])))
def do_classification_job(classification_job_id):
    """Given a classification job object, download the page in the background
    and classify the content.

    On any failure the job is marked failed and ended; the error is logged
    instead of being silently swallowed.
    """
    classification_job = ClassificationJob.objects.get(
        id=classification_job_id)
    url = classification_job.classified_page.url
    # Cached page content; may be None if the page was never downloaded.
    html_content = classification_job.classified_page.content
    # load classifier
    classifier = pickle.loads(
        classification_job.classifier_used.serialized)
    try:
        if html_content is None:
            # download page first if not in DB
            logger.info('Downloading webpage')
            html_content = webpage2html.generate(url)
            # We at least want to keep the HTML even if classification
            # fails later, so persist it right away.
            with transaction.atomic():
                logger.info('Webpage downloaded')
                classification_job.classified_page.content = html_content
                classification_job.classified_page.save()
        else:
            logger.info('Webpage in DB. Skipping download')
        # Try to classify the html content.
        extractor = HTMLExtractor(classifier)
        paths = extractor.extract_from_html(html_content)
        result_list = [
            ClassificationResult(job=classification_job, xpath=path)
            for path in paths
        ]
        # Save the results and the success flag together: either it is
        # all recorded as a success or none of it is.
        with transaction.atomic():
            ClassificationResult.objects.bulk_create(result_list)
            classification_job.is_failed = False
            classification_job.date_ended = timezone.now()
            classification_job.save()
    except Exception:
        # Record why the job failed instead of discarding the exception.
        logger.exception('Classification job %s failed', classification_job_id)
        classification_job.is_failed = True
        classification_job.date_ended = timezone.now()
        classification_job.save()
def test_none(self):
    """A non-existing input file yields the empty string."""
    # print('') works on both Python 2 and 3, unlike the bare print statement.
    print('')
    self.assertEqual(webpage2html.generate('non-existing-file.html', comment=False, verbose=False), '')
def test_full_url(self):
    """Relative links in the generated page stay rooted under another_dir."""
    # print('') works on both Python 2 and 3, unlike the bare print statement.
    print('')
    gen = webpage2html.generate('./another_dir/test_full_url.html', comment=False, full_url=True)
    assert 'href="another_dir/questions/110240"' in gen, gen
    assert 'href="another_dir/static/img/favicon.ico"' in gen, gen
def test_no_script(self):
    """With keep_script=False no <script> tag survives in the output."""
    # print('') works on both Python 2 and 3, unlike the bare print statement.
    print('')
    gen = webpage2html.generate('./test_no_script.html', comment=False, keep_script=False)
    assert '<script' not in gen, gen
def test_0ops(self):
    """The external stylesheet is inlined as a <style data-href=...> block."""
    # print('') works on both Python 2 and 3, unlike the bare print statement.
    print('')
    gen = webpage2html.generate('./hacklu-ctf-2013-exp400-wannable-0ops.html', comment=False)
    # Include gen as the assertion message, matching the other tests.
    assert '<style data-href="./hacklu-ctf-2013-exp400-wannable-0ops_files/screen.css" type="text/css">html,body,div,span,applet,object,iframe,h1,h2,h3,h4,h5,h6,p,blockquote,pre,a,abbr,acronym,address,big,cite,code,del,dfn,em,img,ins,kbd,q,s,samp,small,strike,strong,sub,sup,tt' in gen, gen
def test_requests_page(self):
    """The generated page keeps the hidden searchbox div intact."""
    print('')
    gen = webpage2html.generate('./test_requests_page.html', comment=False, full_url=True)
    # Include gen as the assertion message, matching the other tests.
    assert '<div id="searchbox" role="search" style="display: none">' in gen, gen
def test_text_css(self):
    """The linked stylesheet is inlined with its @import converted to a data URI."""
    print('')
    gen = webpage2html.generate('./text_css.html', comment=False, full_url=True)
    # Include gen as the assertion message, matching the other tests.
    assert '<style data-href="./text_css/style.css" rel="stylesheet" type="text/css">@import url(data:text/css;base64,Cmh' in gen, gen
import pickle

from learnhtml.extractor import HTMLExtractor
import webpage2html
from lxml import etree

# Load the pre-trained recipe-extraction model.
with open('models/model-recipe.pkl', 'rb') as f:
    model = pickle.load(f)
extractor = HTMLExtractor(model)

url = 'https://trinesmatblogg.no/recipe/kremet-kyllingform-med-paprika'
# webpage2html downloads all dependencies of the page as well
html = webpage2html.generate(url, verbose=False)

paths = extractor.extract_from_html(html)
print("extracted paths", paths)

root = etree.HTML(html.encode('utf-8'))
extracted_html = []
for path in paths:
    elements = root.xpath(path)
    for element in elements:
        extracted_html.append(
            etree.tostring(element, encoding='unicode', pretty_print=True))

# Write with an explicit encoding: the extracted page content is not
# guaranteed to be representable in the platform default codec.
with open('test.html', 'w', encoding='utf-8') as f:
    for item in extracted_html:
        f.write("%s\n" % item)
def test_pre_formatting(self):
    """Whitespace inside <pre><code> blocks is preserved verbatim."""
    # print('') works on both Python 2 and 3, unlike the bare print statement.
    print('')
    gen = webpage2html.generate('./test_pre_formatting.html', comment=False)
    # Include gen as the assertion message, matching the other tests.
    assert '<pre><code>$ git clone https://github.com/chaitin/sqlchop</code></pre>' in gen, gen
def test_0ops(self):
    """The external stylesheet is inlined as a <style data-href=...> block."""
    # print('') works on both Python 2 and 3, unlike the bare print statement.
    print('')
    gen = webpage2html.generate(
        './hacklu-ctf-2013-exp400-wannable-0ops.html', comment=False)
    # Include gen as the assertion message, matching the other tests.
    assert '<style data-href="./hacklu-ctf-2013-exp400-wannable-0ops_files/screen.css" type="text/css">html,body,div,span,applet,object,iframe,h1,h2,h3,h4,h5,h6,p,blockquote,pre,a,abbr,acronym,address,big,cite,code,del,dfn,em,img,ins,kbd,q,s,samp,small,strike,strong,sub,sup,tt' in gen, gen
def test_link_media(self):
    """The media attribute of <link> stylesheets survives inlining."""
    # print('') works on both Python 2 and 3, unlike the bare print statement.
    print('')
    gen = webpage2html.generate('./test_css_screen.html', comment=False, full_url=True)
    # Include gen as the assertion message, matching the other tests.
    assert 'media="screen"' in gen, gen
def test_none(self):
    """A non-existing input file yields the empty string."""
    # print('') works on both Python 2 and 3, unlike the bare print statement.
    print('')
    self.assertEqual(
        webpage2html.generate('non-existing-file.html', comment=False, verbose=False), '')