def toHeader(source, dest, fileName):
    """Pack an HTML page into a gzipped blob and emit it as a C header.

    The page at *source* is inlined into a single self-contained HTML string,
    gzip-compressed to ``dest/fileName``, then re-emitted as ``<fileName>.h``
    containing a ``<name>_len`` macro and a PROGMEM byte array — suitable for
    embedding in microcontroller firmware.
    """
    dest_file = os.path.join(dest, fileName)
    # Inline all external resources (CSS/JS/images) into one HTML string.
    packed_html = htmlark.convert_page(source, ignore_errors=True)
    # Write the gzipped page, then read the compressed bytes back.
    with gzip.open(dest_file, "wt") as f:
        f.write(packed_html)
    # BUG FIX: the original did open(destFile, 'rb').read() and leaked the handle.
    with open(dest_file, 'rb') as gz:
        zipped = gz.read()
    header_file = dest_file + ".h"
    safe_name = fileName.replace('.', '_')  # '.' is not valid in a C identifier
    with open(header_file, "wt") as f:
        f.write('#define ' + safe_name + '_len ' + str(len(zipped)) + '\n')
        f.write('const uint8_t ' + safe_name + '[] PROGMEM = {')
        for counter, b in enumerate(zipped):
            if counter % 20 == 0:  # wrap the initializer every 20 bytes
                f.write("\n")
            f.write(hex(b))
            if counter < len(zipped) - 1:
                f.write(",")
        f.write('\n};')
def create_profiles_visualization(profiles):
    """Render *profiles* as a single self-contained HTML visualization.

    Serializes the profiles into the JS data file the template loads, inlines
    every external resource with htmlark, and writes the result to the path
    derived from the module-level ``args``.
    """
    import htmlark

    # Emit the data the template's scripts consume: `var profile = <json>;`
    with open('static/ip_profiles/js/profile.js', 'w') as fp:
        fp.write('var profile =')
        json.dump(profiles, fp, default=dumper)
        fp.write(';')

    # Inline CSS/JS/images so the output is one standalone file.
    packed_html = htmlark.convert_page('static/only_profiles.html',
                                       ignore_errors=True)
    out_path = args.output + args.visualize_profile
    with open(out_path, 'wb') as fp:
        fp.write(packed_html.encode("utf-8", "replace"))
    print('Visualization is saved in ' + args.output + ' ' + args.visualize_profile)
def create_archive_page(url):
    """Archive *url* as one self-contained HTML file and return its path.

    The page is stored under ``BASE_DIR/pages`` with a timestamp-based name;
    OpenGraph tags are injected first, then all external resources inlined.
    """
    # NOTE(review): `arrow.now().timestamp` is an attribute in arrow<1.0 and a
    # bound method in arrow>=1.0 — confirm the pinned version.
    file_name = "{}.html".format(arrow.now().timestamp)
    save_file = os.path.join(settings.BASE_DIR, 'pages', file_name)
    add_og_tags_to_page_at(url, save_file)
    packed_html = htmlark.convert_page(save_file, ignore_errors=True)
    with open(save_file, 'w') as archive:
        archive.write(packed_html)
    return save_file
def create_comparisonsjs(df_results_dict):
    """Render profile comparison results as a self-contained HTML page.

    Dumps *df_results_dict* into the JS data file the comparisons template
    loads, inlines the template with htmlark, and writes the packed page to
    the output path taken from the module-level ``args``.
    """
    import htmlark

    # Data file read by the template: `var comparisons = <json>;`
    with open('static/comparisons/js/comparisons.js', 'w') as fp:
        fp.write('var comparisons =')
        json.dump(df_results_dict, fp, default=dumper)
        fp.write(';')

    # Bundle everything into one standalone HTML file.
    packed_html = htmlark.convert_page('static/comparisons.html',
                                       ignore_errors=True)
    out_path = args.output + args.visualize_comparisons
    with open(out_path, 'wb') as fp:
        fp.write(packed_html.encode("utf-8", "replace"))
    print('Profile comparisons visualization is saved in ' + args.output + ' '
          + args.visualize_comparisons)
def singlesitestatic():
    """Download ``args.url`` as static HTML and write a sandboxed copy.

    Inlines all resources, disables clicks on links/inputs/buttons, strips
    scripts, iframes and favicon links, optionally rewrites link targets to
    ``args.links``, and saves the result to ``contained_static.html``.
    Failures are reported but never raised (best-effort behavior preserved).
    """
    try:
        urllib.request.urlretrieve(args.url, "static_source.html")
        sleep(5)
        inlinedhtml = htmlark.convert_page("static_source.html",
                                           ignore_errors=True)
        # collapse all runs of whitespace
        inlinedhtml = ' '.join(inlinedhtml.split())
        soup = BeautifulSoup(inlinedhtml, features="html.parser")
        # Make links/inputs/buttons unclickable.
        # BUG FIX: the button loop previously indexed soup('input') again.
        for tag_name in ('a', 'input', 'button'):
            for el in soup(tag_name):
                el["onclick"] = "return false;"
        # Strip script and iframe tags.
        # BUG FIX: extract the iterated tag, not soup.script/soup.iframe,
        # which always removes only the first match per iteration.
        for script in soup("script"):
            script.extract()
        for iframe in soup("iframe"):
            iframe.extract()
        # Drop favicon requests (.ico).
        for rel_value in ("shortcut icon", "icon"):
            for link in soup.findAll("link", attrs={"rel": rel_value}):
                link.extract()
        # Optionally point every link at the replacement URL.
        if args.links:
            for a in soup.findAll('a'):
                a['href'] = str(args.links)
                print(str(args.links))
        # output sandboxed html
        with open("contained_static.html", "w") as file:
            file.write(str(soup))
        print("static html of {} successfully containerized".format(args.url))
    except Exception:  # was a bare except: keep best-effort, don't hide SystemExit
        print("containerization of static html from {} failed".format(
            args.url))
import htmlark

# Smoke test: inline a live BBC article into a single self-contained file.
packed_html = htmlark.convert_page(
    'https://www.bbc.co.uk/news/world-africa-51063149', ignore_errors=True)
# BUG FIX: use a context manager so the handle is closed even if the write fails
# (was a raw open()/close() pair).
with open('htmlark_test.html', 'w') as f:
    f.write(packed_html)
def csvparsejs(target):
    """Containerize the JS-rendered HTML of every URL in DataFrame *target*.

    Columns are (name, url, replacement-link). Each URL is rendered with
    headless Chrome, inlined with htmlark, sandboxed (clicks disabled,
    scripts/iframes/favicons stripped, links optionally rewritten) and saved
    as ``<name>_contained_js.html``. Failed rows are skipped, not fatal.

    Returns 0 when the sweep finishes.
    """
    cols = target.columns.values
    for idx, line in enumerate(target[cols[1]]):
        if line != None:
            browser = None
            try:
                print("containerizing {}".format(line))
                print("containerizing js rendered html")
                webdriver_path = ''  #Replace with chrome webdriver path
                chrome_options = Options()
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--window-size=1920x1080')
                browser = webdriver.Chrome(executable_path=webdriver_path,
                                           options=chrome_options)
                browser.get(line)
                pagesource = browser.page_source.encode('utf-8')
                sleep(10)
                source_name = "{}_source_js.html".format(
                    str(target[cols[0]][idx]))
                with open(source_name, "w", encoding="utf-8") as html:
                    html.write(str(pagesource))
                sleep(5)
                inlinedhtml = htmlark.convert_page(source_name,
                                                   ignore_errors=True)
                # collapse whitespace runs
                inlinedhtml = ' '.join(inlinedhtml.split())
                soup = BeautifulSoup(inlinedhtml, features="html.parser")
                # Neutralise clicks.
                # BUG FIX: the button loop previously indexed soup('input').
                for tag_name in ('a', 'input', 'button'):
                    for el in soup(tag_name):
                        el["onclick"] = "return false;"
                # Strip scripts/iframes (extract the iterated tag, not the
                # first match via soup.script/soup.iframe).
                for script in soup("script"):
                    script.extract()
                for iframe in soup("iframe"):
                    iframe.extract()
                # drop favicon links
                for rel_value in ("shortcut icon", "icon"):
                    for link in soup.findAll("link", attrs={"rel": rel_value}):
                        link.extract()
                # replace link targets when a replacement column is supplied
                if target[cols[2]][idx] != None:
                    for a in soup.findAll('a'):
                        a['href'] = str(target[cols[2]][idx])
                        print(str(target[cols[2]][idx]))
                # output sandboxed html
                with open("{}_contained_js.html".format(
                        str(target[cols[0]][idx])), "w") as file:
                    file.write(str(soup))
                print(
                    "js rendered html of {} successfully containerized".format(
                        line))
            except Exception:  # was a bare except: keep per-row best-effort
                print(
                    "{} containerization process failed, continuing to next webpage"
                    .format(line))
                continue
            finally:
                # BUG FIX: the Chrome process was leaked on every iteration.
                if browser is not None:
                    browser.quit()
    return 0
def singlesitejs():
    """Containerize the JS-rendered HTML of ``args.url``.

    Renders the page with headless Chrome, inlines resources with htmlark,
    sandboxes the markup (clicks disabled, scripts/iframes/favicons removed,
    links optionally rewritten to ``args.links``) and saves it as
    ``contained_js.html``. Failures are reported but never raised.
    """
    print("containerizing {}".format(args.url))
    browser = None
    try:
        print("containerizing js rendered html")
        # collect html
        webdriver_path = ''  #Replace with chrome webdriver path
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--window-size=1920x1080')
        browser = webdriver.Chrome(executable_path=webdriver_path,
                                   options=chrome_options)
        browser.get(args.url)
        pagesource = browser.page_source.encode('utf-8')
        sleep(10)
        with open("js_source.html", "w", encoding="utf-8") as html:
            html.write(str(pagesource))
        sleep(5)
        inlinedhtml = htmlark.convert_page("js_source.html",
                                           ignore_errors=True)
        # collapse whitespace runs
        inlinedhtml = ' '.join(inlinedhtml.split())
        soup = BeautifulSoup(inlinedhtml, features="html.parser")
        # Make links/inputs/buttons unclickable.
        # BUG FIX: the button loop previously indexed soup('input') again.
        for tag_name in ('a', 'input', 'button'):
            for el in soup(tag_name):
                el["onclick"] = "return false;"
        # Strip scripts/iframes.
        # BUG FIX: extract the iterated tag, not soup.script/soup.iframe.
        for script in soup("script"):
            script.extract()
        for iframe in soup("iframe"):
            iframe.extract()
        # remove .ico requests
        for rel_value in ("shortcut icon", "icon"):
            for link in soup.findAll("link", attrs={"rel": rel_value}):
                link.extract()
        # replace links
        if args.links:
            for a in soup.findAll('a'):
                a['href'] = str(args.links)
                print(str(args.links))
        # output sandboxed html
        with open("contained_js.html", "w") as file:
            file.write(str(soup))
        print("js rendered html of {} successfully containerized".format(
            args.url))
    except Exception:  # was a bare except: keep best-effort behavior
        print("containerization of js rendered html from {} failed".format(
            args.url))
    finally:
        # BUG FIX: the headless Chrome process was never shut down.
        if browser is not None:
            browser.quit()
def csvparsestatic(target2):
    """Containerize the static HTML of every URL in DataFrame *target2*.

    Columns are (name, url, replacement-link). Each URL is downloaded,
    inlined with htmlark, sandboxed (clicks disabled, scripts/iframes and
    favicons stripped, links optionally rewritten) and saved as
    ``<name>_contained_static.html``. Failed rows are skipped, not fatal.
    """
    cols = target2.columns.values
    for idx2, line2 in enumerate(target2[cols[1]]):
        if line2 != None:
            try:
                urllib.request.urlretrieve(
                    line2,
                    "{}_source.html".format(str(target2[cols[0]][idx2])))
                sleep(5)
                inlinedhtml = htmlark.convert_page(
                    "{}_source.html".format(str(target2[cols[0]][idx2])),
                    ignore_errors=True)
                # collapse whitespace runs
                inlinedhtml = ' '.join(inlinedhtml.split())
                soup = BeautifulSoup(inlinedhtml, features="html.parser")
                # Make links/inputs/buttons unclickable.
                # BUG FIX: the button loop previously indexed soup('input').
                for tag_name in ('a', 'input', 'button'):
                    for el in soup(tag_name):
                        el["onclick"] = "return false;"
                # Strip scripts/iframes (extract the iterated tag, not the
                # first match via soup.script/soup.iframe).
                for script in soup("script"):
                    script.extract()
                for iframe in soup("iframe"):
                    iframe.extract()
                # remove .ico requests
                for rel_value in ("shortcut icon", "icon"):
                    for link in soup.findAll("link", attrs={"rel": rel_value}):
                        link.extract()
                # replace links when a replacement column is supplied
                if target2[cols[2]][idx2] != None:
                    for a in soup.findAll('a'):
                        a['href'] = str(target2[cols[2]][idx2])
                        print(str(target2[cols[2]][idx2]))
                # output sandboxed html
                with open("{}_contained_static.html".format(
                        str(target2[cols[0]][idx2])), "w") as file:
                    file.write(str(soup))
                print(
                    "{} successfully containerized static html".format(line2))
            except Exception:  # was a bare except: keep per-row best-effort
                print(
                    "{} containerization process failed, continuing to next webpage"
                    .format(line2))
                continue
soup.head.append(tag) return str(soup) def add_og_tags_to_page_at(url, save_file="og_webpage.html"): _, html = htmlark._get_resource(url) add_og_tage_to_page(html, save_file) def add_og_tage_to_page(html, save_file): html = add_all_og_tags(html) with open(save_file, 'w') as sf: sf.write(html) sf.close() if __name__ == "__main__": import arrow save_file = "og_webpage-{}.html".format(arrow.now().timestamp) add_og_tags_to_page_at('https://www.bbc.co.uk/news/world-africa-51063149', save_file) packed_html = htmlark.convert_page(save_file, ignore_errors=True) with open(save_file, 'w') as sf: sf.write(packed_html)