def toHeader(source, dest, fileName):
    """Pack an HTML page into a gzipped blob and emit it as a C header.

    The page at *source* is inlined into a single self-contained HTML string,
    gzip-compressed to ``dest/fileName``, then re-emitted as ``<fileName>.h``
    containing a ``<name>_len`` macro and a PROGMEM byte array — suitable for
    embedding in microcontroller firmware.
    """
    dest_file = os.path.join(dest, fileName)
    # Inline all external resources (CSS/JS/images) into one HTML string.
    packed_html = htmlark.convert_page(source, ignore_errors=True)
    # Write the gzipped page, then read the compressed bytes back.
    with gzip.open(dest_file, "wt") as f:
        f.write(packed_html)
    # BUG FIX: the original did open(destFile, 'rb').read() and leaked the handle.
    with open(dest_file, 'rb') as gz:
        zipped = gz.read()
    header_file = dest_file + ".h"
    safe_name = fileName.replace('.', '_')  # '.' is not valid in a C identifier
    with open(header_file, "wt") as f:
        f.write('#define ' + safe_name + '_len ' + str(len(zipped)) + '\n')
        f.write('const uint8_t ' + safe_name + '[] PROGMEM = {')
        for counter, b in enumerate(zipped):
            if counter % 20 == 0:  # wrap the initializer every 20 bytes
                f.write("\n")
            f.write(hex(b))
            if counter < len(zipped) - 1:
                f.write(",")
        f.write('\n};')
def create_profiles_visualization(profiles):
    """Render *profiles* as a single self-contained HTML visualization.

    Serializes the profiles into the JS data file the template loads, inlines
    every external resource with htmlark, and writes the result to the path
    derived from the module-level ``args``.
    """
    import htmlark

    # Emit the data the template's scripts consume: `var profile = <json>;`
    with open('static/ip_profiles/js/profile.js', 'w') as fp:
        fp.write('var profile =')
        json.dump(profiles, fp, default=dumper)
        fp.write(';')

    # Inline CSS/JS/images so the output is one standalone file.
    packed_html = htmlark.convert_page('static/only_profiles.html',
                                       ignore_errors=True)
    out_path = args.output + args.visualize_profile
    with open(out_path, 'wb') as fp:
        fp.write(packed_html.encode("utf-8", "replace"))
    print('Visualization is saved in ' + args.output + ' ' + args.visualize_profile)
def create_archive_page(url):
    """Archive *url* as one self-contained HTML file and return its path.

    The page is stored under ``BASE_DIR/pages`` with a timestamp-based name;
    OpenGraph tags are injected first, then all external resources inlined.
    """
    # NOTE(review): `arrow.now().timestamp` is an attribute in arrow<1.0 and a
    # bound method in arrow>=1.0 — confirm the pinned version.
    file_name = "{}.html".format(arrow.now().timestamp)
    save_file = os.path.join(settings.BASE_DIR, 'pages', file_name)
    add_og_tags_to_page_at(url, save_file)
    packed_html = htmlark.convert_page(save_file, ignore_errors=True)
    with open(save_file, 'w') as archive:
        archive.write(packed_html)
    return save_file
def create_comparisonsjs(df_results_dict):
    """Render profile comparison results as a self-contained HTML page.

    Dumps *df_results_dict* into the JS data file the comparisons template
    loads, inlines the template with htmlark, and writes the packed page to
    the output path taken from the module-level ``args``.
    """
    import htmlark

    # Data file read by the template: `var comparisons = <json>;`
    with open('static/comparisons/js/comparisons.js', 'w') as fp:
        fp.write('var comparisons =')
        json.dump(df_results_dict, fp, default=dumper)
        fp.write(';')

    # Bundle everything into one standalone HTML file.
    packed_html = htmlark.convert_page('static/comparisons.html',
                                       ignore_errors=True)
    out_path = args.output + args.visualize_comparisons
    with open(out_path, 'wb') as fp:
        fp.write(packed_html.encode("utf-8", "replace"))
    print('Profile comparisons visualization is saved in ' + args.output + ' '
          + args.visualize_comparisons)
def singlesitestatic():
    """Download ``args.url`` as static HTML and write a sandboxed copy.

    Inlines all resources, disables clicks on links/inputs/buttons, strips
    scripts, iframes and favicon links, optionally rewrites link targets to
    ``args.links``, and saves the result to ``contained_static.html``.
    Failures are reported but never raised (best-effort behavior preserved).
    """
    try:
        urllib.request.urlretrieve(args.url, "static_source.html")
        sleep(5)
        inlinedhtml = htmlark.convert_page("static_source.html",
                                           ignore_errors=True)
        # collapse all runs of whitespace
        inlinedhtml = ' '.join(inlinedhtml.split())
        soup = BeautifulSoup(inlinedhtml, features="html.parser")
        # Make links/inputs/buttons unclickable.
        # BUG FIX: the button loop previously indexed soup('input') again.
        for tag_name in ('a', 'input', 'button'):
            for el in soup(tag_name):
                el["onclick"] = "return false;"
        # Strip script and iframe tags.
        # BUG FIX: extract the iterated tag, not soup.script/soup.iframe,
        # which always removes only the first match per iteration.
        for script in soup("script"):
            script.extract()
        for iframe in soup("iframe"):
            iframe.extract()
        # Drop favicon requests (.ico).
        for rel_value in ("shortcut icon", "icon"):
            for link in soup.findAll("link", attrs={"rel": rel_value}):
                link.extract()
        # Optionally point every link at the replacement URL.
        if args.links:
            for a in soup.findAll('a'):
                a['href'] = str(args.links)
                print(str(args.links))
        # output sandboxed html
        with open("contained_static.html", "w") as file:
            file.write(str(soup))
        print("static html of {} successfully containerized".format(args.url))
    except Exception:  # was a bare except: keep best-effort, don't hide SystemExit
        print("containerization of static html from {} failed".format(
            args.url))
import htmlark

# Smoke test: inline a live BBC article into a single self-contained file.
packed_html = htmlark.convert_page(
    'https://www.bbc.co.uk/news/world-africa-51063149', ignore_errors=True)
# BUG FIX: use a context manager so the handle is closed even if the write fails
# (was a raw open()/close() pair).
with open('htmlark_test.html', 'w') as f:
    f.write(packed_html)
def csvparsejs(target):
    """Containerize the JS-rendered HTML of every URL in DataFrame *target*.

    Columns are (name, url, replacement-link). Each URL is rendered with
    headless Chrome, inlined with htmlark, sandboxed (clicks disabled,
    scripts/iframes/favicons stripped, links optionally rewritten) and saved
    as ``<name>_contained_js.html``. Failed rows are skipped, not fatal.

    Returns 0 when the sweep finishes.
    """
    cols = target.columns.values
    for idx, line in enumerate(target[cols[1]]):
        if line != None:
            browser = None
            try:
                print("containerizing {}".format(line))
                print("containerizing js rendered html")
                webdriver_path = ''  #Replace with chrome webdriver path
                chrome_options = Options()
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--window-size=1920x1080')
                browser = webdriver.Chrome(executable_path=webdriver_path,
                                           options=chrome_options)
                browser.get(line)
                pagesource = browser.page_source.encode('utf-8')
                sleep(10)
                source_name = "{}_source_js.html".format(
                    str(target[cols[0]][idx]))
                with open(source_name, "w", encoding="utf-8") as html:
                    html.write(str(pagesource))
                sleep(5)
                inlinedhtml = htmlark.convert_page(source_name,
                                                   ignore_errors=True)
                # collapse whitespace runs
                inlinedhtml = ' '.join(inlinedhtml.split())
                soup = BeautifulSoup(inlinedhtml, features="html.parser")
                # Neutralise clicks.
                # BUG FIX: the button loop previously indexed soup('input').
                for tag_name in ('a', 'input', 'button'):
                    for el in soup(tag_name):
                        el["onclick"] = "return false;"
                # Strip scripts/iframes (extract the iterated tag, not the
                # first match via soup.script/soup.iframe).
                for script in soup("script"):
                    script.extract()
                for iframe in soup("iframe"):
                    iframe.extract()
                # drop favicon links
                for rel_value in ("shortcut icon", "icon"):
                    for link in soup.findAll("link", attrs={"rel": rel_value}):
                        link.extract()
                # replace link targets when a replacement column is supplied
                if target[cols[2]][idx] != None:
                    for a in soup.findAll('a'):
                        a['href'] = str(target[cols[2]][idx])
                        print(str(target[cols[2]][idx]))
                # output sandboxed html
                with open("{}_contained_js.html".format(
                        str(target[cols[0]][idx])), "w") as file:
                    file.write(str(soup))
                print(
                    "js rendered html of {} successfully containerized".format(
                        line))
            except Exception:  # was a bare except: keep per-row best-effort
                print(
                    "{} containerization process failed, continuing to next webpage"
                    .format(line))
                continue
            finally:
                # BUG FIX: the Chrome process was leaked on every iteration.
                if browser is not None:
                    browser.quit()
    return 0
def singlesitejs():
    """Containerize the JS-rendered HTML of ``args.url``.

    Renders the page with headless Chrome, inlines resources with htmlark,
    sandboxes the markup (clicks disabled, scripts/iframes/favicons removed,
    links optionally rewritten to ``args.links``) and saves it as
    ``contained_js.html``. Failures are reported but never raised.
    """
    print("containerizing {}".format(args.url))
    browser = None
    try:
        print("containerizing js rendered html")
        # collect html
        webdriver_path = ''  #Replace with chrome webdriver path
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--window-size=1920x1080')
        browser = webdriver.Chrome(executable_path=webdriver_path,
                                   options=chrome_options)
        browser.get(args.url)
        pagesource = browser.page_source.encode('utf-8')
        sleep(10)
        with open("js_source.html", "w", encoding="utf-8") as html:
            html.write(str(pagesource))
        sleep(5)
        inlinedhtml = htmlark.convert_page("js_source.html",
                                           ignore_errors=True)
        # collapse whitespace runs
        inlinedhtml = ' '.join(inlinedhtml.split())
        soup = BeautifulSoup(inlinedhtml, features="html.parser")
        # Make links/inputs/buttons unclickable.
        # BUG FIX: the button loop previously indexed soup('input') again.
        for tag_name in ('a', 'input', 'button'):
            for el in soup(tag_name):
                el["onclick"] = "return false;"
        # Strip scripts/iframes.
        # BUG FIX: extract the iterated tag, not soup.script/soup.iframe.
        for script in soup("script"):
            script.extract()
        for iframe in soup("iframe"):
            iframe.extract()
        # remove .ico requests
        for rel_value in ("shortcut icon", "icon"):
            for link in soup.findAll("link", attrs={"rel": rel_value}):
                link.extract()
        # replace links
        if args.links:
            for a in soup.findAll('a'):
                a['href'] = str(args.links)
                print(str(args.links))
        # output sandboxed html
        with open("contained_js.html", "w") as file:
            file.write(str(soup))
        print("js rendered html of {} successfully containerized".format(
            args.url))
    except Exception:  # was a bare except: keep best-effort behavior
        print("containerization of js rendered html from {} failed".format(
            args.url))
    finally:
        # BUG FIX: the headless Chrome process was never shut down.
        if browser is not None:
            browser.quit()
def csvparsestatic(target2):
    """Containerize the static HTML of every URL in DataFrame *target2*.

    Columns are (name, url, replacement-link). Each URL is downloaded,
    inlined with htmlark, sandboxed (clicks disabled, scripts/iframes and
    favicons stripped, links optionally rewritten) and saved as
    ``<name>_contained_static.html``. Failed rows are skipped, not fatal.
    """
    cols = target2.columns.values
    for idx2, line2 in enumerate(target2[cols[1]]):
        if line2 != None:
            try:
                urllib.request.urlretrieve(
                    line2,
                    "{}_source.html".format(str(target2[cols[0]][idx2])))
                sleep(5)
                inlinedhtml = htmlark.convert_page(
                    "{}_source.html".format(str(target2[cols[0]][idx2])),
                    ignore_errors=True)
                # collapse whitespace runs
                inlinedhtml = ' '.join(inlinedhtml.split())
                soup = BeautifulSoup(inlinedhtml, features="html.parser")
                # Make links/inputs/buttons unclickable.
                # BUG FIX: the button loop previously indexed soup('input').
                for tag_name in ('a', 'input', 'button'):
                    for el in soup(tag_name):
                        el["onclick"] = "return false;"
                # Strip scripts/iframes (extract the iterated tag, not the
                # first match via soup.script/soup.iframe).
                for script in soup("script"):
                    script.extract()
                for iframe in soup("iframe"):
                    iframe.extract()
                # remove .ico requests
                for rel_value in ("shortcut icon", "icon"):
                    for link in soup.findAll("link", attrs={"rel": rel_value}):
                        link.extract()
                # replace links when a replacement column is supplied
                if target2[cols[2]][idx2] != None:
                    for a in soup.findAll('a'):
                        a['href'] = str(target2[cols[2]][idx2])
                        print(str(target2[cols[2]][idx2]))
                # output sandboxed html
                with open("{}_contained_static.html".format(
                        str(target2[cols[0]][idx2])), "w") as file:
                    file.write(str(soup))
                print(
                    "{} successfully containerized static html".format(line2))
            except Exception:  # was a bare except: keep per-row best-effort
                print(
                    "{} containerization process failed, continuing to next webpage"
                    .format(line2))
                continue
soup.head.append(tag) return str(soup) def add_og_tags_to_page_at(url, save_file="og_webpage.html"): _, html = htmlark._get_resource(url) add_og_tage_to_page(html, save_file) def add_og_tage_to_page(html, save_file): html = add_all_og_tags(html) with open(save_file, 'w') as sf: sf.write(html) sf.close() if __name__ == "__main__": import arrow save_file = "og_webpage-{}.html".format(arrow.now().timestamp) add_og_tags_to_page_at('https://www.bbc.co.uk/news/world-africa-51063149', save_file) packed_html = htmlark.convert_page(save_file, ignore_errors=True) with open(save_file, 'w') as sf: sf.write(packed_html)