Example #1
def process_html(content: bytes, req_context: RequestContext) -> str:
    soup = BeautifulSoup(content, 'html.parser')
    if not soup.head:
        soup.insert(0, soup.new_tag('head'))

    normalize_url_module = soup.new_tag('script')
    normalize_url_module.string = normalizeUrlScript
    soup.head.insert(0, normalize_url_module)

    jsRequestHandler = soup.new_tag('script')
    jsRequestHandler.string = jsRequestHandlerScript
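    # Inserting at index 0 again places this handler script ahead of the
    # URL-normalizing script at the top of <head>.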
    soup.head.insert(0, jsRequestHandler)

    for searchObj in linkAttr:
        for tag in soup.findAll(**searchObj):
            for attr in searchObj:
                tag[attr] = spoof_url(tag[attr], req_context)
            if tag.name == 'img':
                if tag.get('srcset'):
                    del tag['srcset']

    for style in soup.findAll('style'):
        if style.string is None:
            style.string = ''
        style.string = process_css(style.string, req_context)

    return str(soup)
Example #2
    def render(self):
        document = [self._get_global_style(), self._get_header()]

        keywords = defaultdict(lambda: defaultdict(str))
        for k, group in self.template_groups.items():
            for item in group:
                keywords.update(item.for_template())

        for k, group in self.template_groups.items():
            conclusion = []

            for item in group:
                document.append(item.template.body.format(**keywords))
                conclusion.append(item.template.conclusion)

            conclusion = ' '.join(conclusion)
            if BeautifulSoup(conclusion, 'html.parser').text:
                conclusion = BeautifulSoup(conclusion, 'html.parser')
                for p in conclusion.find_all('p'):
                    p.name = 'span'
                conclusion.insert(
                    0, BeautifulSoup(options.CONCLUSION, 'html.parser'))
                document.append(str(conclusion))

        document.append(self._get_footer())

        return ''.join(document)
Example #3
    def get_svg(xml,sketchID,version):

        root = ET.fromstring(xml)
        result_soup = BeautifulSoup()
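        # result_soup starts out empty and acts as a container; each insert
        # position below is the number of top-level <g> groups created so far.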
        for kobject in root.findall('.//KObject'):
            objectID = kobject.attrib['id']
            parent = kobject.find('parent')
            parentID = parent.attrib['id']
            stroke = kobject.find('strokeData')
            if stroke is not None:
                path = ksketchsvg.get_polyline(stroke)
                color = ksketchsvg.convert_color(stroke.attrib['color'])
                thickness = stroke.attrib['thickness']
                tag = ksketchsvg.createTag(objectID, path, color, thickness, kobject.attrib['centroid'])
                if parentID == "0":
                    result_soup.insert(len(result_soup.find_all('g', recursive=False)), tag)
                else:
                    grp = result_soup.find('g', {'id': parentID})
                    if grp:
                        grp.insert(len(grp.find_all('g', recursive=False)), tag)
            else:
                tag = ksketchsvg.createGroup(objectID)
                if parentID == "0":
                    result_soup.insert(len(result_soup.find_all('g', recursive=False)), tag)
                else:
                    grp = result_soup.find('g', {'id': parentID})
                    if grp:
                        grp.insert(len(grp.find_all('g', recursive=False)), tag)
        soup = BeautifulSoup()
        g_tag = Tag(soup, name='g')
        g_tag['id'] = "0"
        g_tag.insert(0, result_soup)
        SVGCache.addSVGData(sketchID,version,g_tag.prettify())
        return g_tag.prettify()
Example #4
def strip_html(path, i, label_xid=True):
    """Strip the HTML: get rid of scripts and interactions"""
    print '[{}] Reading {} ...'.format(i, path)
    with codecs.open(path, 'r', 'utf8') as fin:  # built-in open() would treat 'utf8' as a buffering argument
        # TODO: Handle encodings
        soup = BeautifulSoup(fin.read(), 'html5lib')
    # Add doctype if missing
    if not has_doctype(soup):
        soup.insert(0, Doctype('html'))
    # Remove dangerous tags
    for x in soup('script'):
        x.extract()
    for x in soup('noscript'):
        x.extract()
    for x in soup('link'):
        if x.get('as') == 'script':
            x.extract()
    for x in soup('iframe'):
        x['src'] = ''
    # Fix styles
    for x in soup('style'):
        x.string = H.unescape(u"".join(unicode(y) for y in x.contents))
    # Label all tags
    i = 1
    for x in soup.body(True):
        for attr in list(x.attrs):
            if attr.startswith('on') or attr == 'srcset':
                del x[attr]
        if label_xid:
            x['data-xid'] = i
            i += 1
    # Return
    return soup.prettify()
Example #5
def build_tags_pages(articles):
    all_tags = {}
    for article in articles:
        for tag in article.tags:
            all_tags.setdefault(tag.lower(), ([], []))
            all_tags[tag.lower()][0].append(tag)
            all_tags[tag.lower()][1].append(article)

    for tag, (representations, articles) in all_tags.items():
        if len(set(representations)) > 1:
            print("WARNING: There are multiple representations for tag {}: {}".
                  format(tag, ", ".join(representations)))

        build_tag_page(representations[0], articles)

    lower_tags = list(all_tags.keys())
    lower_tags.sort(key=lambda x: (len(all_tags[x][1]), x))

    tags = BeautifulSoup('', 'html.parser')
    for i, tag in enumerate(lower_tags):
        rep = all_tags[tag][0][0]
        count = len(all_tags[tag][1])
        soup = BeautifulSoup(
            f'<li><a href="/tag/{tag}.html">{rep} ({count})</a></li>',
            'html.parser')
        tags.insert(i, soup)

    write(TAGS_PAGE.format(tags=tags.prettify()), "Tags | Layog's blog",
          f'{SRC_DIR}/tags.html')
Example #6
    def response(self, flow: http.HTTPFlow):
        response = flow.response
        if CONTENT_TYPE in response.headers:
            if any(
                    map(lambda t: t in response.headers[CONTENT_TYPE],
                        RELEVANT_CONTENT_TYPES)):
                # Response is a web page; proceed.
                insertedScripts: List[str] = []
                soup = BeautifulSoup(response.content,
                                     HTML_PARSER,
                                     from_encoding=inferEncoding(response))
                requestURL = flow.request.pretty_url  # should work in transparent mode too, unless the Host header is spoofed
                isApplicable: Callable[[Userscript],
                                       bool] = userscript.applicableChecker(
                                           requestURL)
                for script in self.userscripts:
                    if isApplicable(script):
                        useInline = ctx.options.inline or script.downloadURL is None
                        if useInline and len(script.unsafeSequences) > 0:
                            logError(unsafeSequencesMessage(script))
                            continue
                        logInfo(
                            f"""Injecting {script.name}{"" if script.version is None else " " + VERSION_PREFIX + script.version} into {requestURL} ({"inline" if useInline else "linked"}) ..."""
                        )
                        result = inject(
                            script, soup,
                            Options(
                                inline=ctx.options.inline,
                                verbose=ctx.options.verbose,
                            ))
                        if type(result) is BeautifulSoup:
                            soup = result
                            insertedScripts.append(script.name + (
                                "" if script.version is None else " " +
                                stringifyVersion(script.version)))
                        else:
                            logError(
                                "Injection failed due to the following error:")
                            logError(str(result))

                index_DTD: Optional[int] = indexOfDTD(soup)
                # Insert information comment:
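                # It goes right after the doctype if one is present, otherwise
                # at the very top of the document.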
                if ctx.options.verbose:
                    soup.insert(
                        0 if index_DTD is None else 1 + index_DTD,
                        Comment(INFO_COMMENT_PREFIX +
                                ("No matching userscripts for this URL."
                                 if insertedScripts ==
                                 [] else "These scripts were inserted:\n" +
                                 bulletList(insertedScripts)) + "\n"))
                # Prevent BS/html.parser from emitting `<!DOCTYPE doctype html>` or similar if "DOCTYPE" is not all uppercase in source HTML:
                if index_DTD is not None and REGEX_DOCTYPE.match(
                        soup.contents[index_DTD]):
                    # There is a DTD and it is invalid, so replace it.
                    soup.contents[index_DTD] = Doctype(
                        re.sub(REGEX_DOCTYPE, "", soup.contents[index_DTD]))
                # Serialize and encode:
                response.content = str(soup).encode(
                    fromOptional(soup.original_encoding, CHARSET_DEFAULT),
                    "replace")
Example #7
    def prep_html(self, base_url, target):
        '''
        Replace variables in the email HTML with proper values and insert the tracking image URL if needed.
        '''
        # TODO: remove placeholder IP
        base_url = 'http://10.1.2.180:8080/'
        html = self.html
        html = html.replace(b'{{ fname }}', str.encode(target.first_name))
        html = html.replace(b'{{ lname }}', str.encode(target.last_name))
        html = html.replace(
            b'{{ name }}',
            str.encode('%s %s' % (target.first_name, target.last_name)))
        html = html.replace(b'{{ url }}',
                            str.encode('%s' % target.result.tracker))
        html = html.replace(b'{{ id }}', str.encode(target.result.tracker))

        soup = BeautifulSoup(html, features='lxml')
        base = soup.new_tag('base', href=base_url)
        #soup.find('head').insert_before(base)
        soup.insert(1, base)

        if self.track:
            tracker = soup.new_tag('img',
                                   alt='',
                                   src='%s/pixel.png' %
                                   (target.result.tracker))
            soup.find('body').insert_after(tracker)
        html = str(soup).encode()

        return html
Example #8
def load_task_lists():
    # initiate database connection
    db_pars_path = '/home/scube_backend/.keys/mongodb_pars.yaml'  # note: hardcoded !
    db_pars = yaml.load(open(db_pars_path))
    db, db_client = utils.connect_mongoDB_server(db_pars)

    # query scflex control database
    ans = db_client['Scflex_control']['task_list_monitoring'].find(
        {"role": "task_list_info"})
    ans = list(ans)
    if len(ans) == 0: return ""  # nothing to list

    # get task list name
    task_list_names = [doc['db_name'] + '/' + doc['coll_name'] for doc in ans]

    # make the html
    soup = BeautifulSoup("", 'lxml')
    for name in task_list_names:
        new_tag = soup.new_tag('option')
        new_tag.attrs['value'] = name
        label = ' - '.join(name.split('/'))
        new_tag.insert(0, label)
        soup.insert(0, new_tag)
    # end for

    return str(soup)
Example #9
def convert_gif_to_webm(link, file, ext, post_id):
    """
    Convert GIFs to webm
    """
    clip = VideoFileClip(file)
    w, h = clip.size
    webm = BeautifulSoup("", "html5lib").new_tag("video")
    webm['autoplay'] = ""
    webm['loop'] = ""
    webm['controls'] = ""
    webm['style'] = "max-width: " + str(w) + "px;"
    source = BeautifulSoup("", "html5lib").new_tag("source")
    if ext == "webm":
        source['src'] = "/media" + link.group()
    else:
        file_out = uri_to_iri(
            "/root/myblog/myblog/blog/static/media/{}/{}/{}/{}-{}.webm".format(
                link.group("year"), link.group("month"), link.group("day"),
                link.group("file"), str(post_id)))
        link_out = uri_to_iri('/media/{}/{}/{}/{}-{}.webm'.format(
            link.group("year"), link.group("month"), link.group("day"),
            link.group("file"), str(post_id)))

        clip = VideoFileClip(file)
        video = CompositeVideoClip([clip])
        video.write_videofile(file_out,
                              codec='libvpx',
                              audio=False,
                              preset='superslow')

        source['src'] = link_out
    source['type'] = "video/webm"
    webm.insert(0, source)

    return webm
Example #10
def set_doctype(soup: bs4.BeautifulSoup, version: str) -> None:
    if version not in DOCTYPES:
        raise ValueError('unsupported version: %s' % version)
    new_doctype = bs4.Doctype.for_name_and_ids(*DOCTYPES[version])
    for item in soup.contents:
        if isinstance(item, bs4.Doctype):
            item.replaceWith('')
    soup.insert(0, new_doctype)
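A minimal usage sketch for the helper above, assuming DOCTYPES maps version keys to the (name, publicId, systemId) tuples that bs4.Doctype.for_name_and_ids expects; the 'html4-strict' entry below is purely illustrative:

import bs4

# Hypothetical entry; the real DOCTYPES mapping is defined elsewhere in the source.
DOCTYPES = {
    'html4-strict': ('HTML', '-//W3C//DTD HTML 4.01//EN',
                     'http://www.w3.org/TR/html4/strict.dtd'),
}

soup = bs4.BeautifulSoup('<p>hello</p>', 'html.parser')
set_doctype(soup, 'html4-strict')
# soup now begins with the HTML 4.01 Strict doctype, followed by <p>hello</p>.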
Example #11
    def response(self, flow: http.HTTPFlow):
        response = flow.response
        if CONTENT_TYPE in response.headers:
            if any(
                    map(lambda t: t in response.headers[CONTENT_TYPE],
                        RELEVANT_CONTENT_TYPES)):
                # Response is a web page; proceed.
                insertedScripts: List[str] = []
                soup = BeautifulSoup(response.content,
                                     HTML_PARSER,
                                     from_encoding=inferEncoding(response))
                requestURL = flow.request.pretty_url  # should work in transparent mode too, unless the Host header is spoofed
                if requestContainsQueryParam(
                        option(T.option_query_param_to_disable), flow.request):
                    logInfo(
                        f"""Not injecting any userscripts into {requestURL} because it contains a `{option(T.option_query_param_to_disable)}` query parameter."""
                    )
                    return
                isApplicable: Callable[[Userscript],
                                       bool] = userscript.applicableChecker(
                                           requestURL)
                for script in self.userscripts:
                    if isApplicable(script):
                        useInline = option(
                            T.option_inline) or script.downloadURL is None
                        if useInline and len(script.unsafeSequences) > 0:
                            logError(unsafeSequencesMessage(script))
                            continue
                        logInfo(
                            f"""Injecting {script.name}{"" if script.version is None else " " + VERSION_PREFIX + script.version} into {requestURL} ({"inline" if useInline else "linked"}) ..."""
                        )
                        result = inject(
                            script, soup,
                            Options(inline=option(T.option_inline), ))
                        if type(result) is BeautifulSoup:
                            soup = result
                            insertedScripts.append(script.name + (
                                "" if script.version is None else " " +
                                T.stringifyVersion(script.version)))
                        else:
                            logError(
                                "Injection failed due to the following error:")
                            logError(str(result))

                index_DTD: Optional[int] = indexOfDTD(soup)
                # Insert information comment:
                if option(T.option_list_injected):
                    soup.insert(
                        0 if index_DTD is None else 1 + index_DTD,
                        Comment(HTML_INFO_COMMENT_PREFIX +
                                ("No matching userscripts for this URL."
                                 if insertedScripts ==
                                 [] else "These scripts were inserted:\n" +
                                 bulletList(insertedScripts)) + "\n"))
                # Serialize and encode:
                response.content = str(soup).encode(
                    fromOptional(soup.original_encoding, CHARSET_DEFAULT),
                    "replace")
Example #12
    def download_all_images(self):
        # Pass each value returned by get_image_url_list() (the src values) to the download() method

        for url in self.get_image_url_list():
            self.download(url)

        # Open and parse the saved page; passing the path string directly to
        # BeautifulSoup would parse the filename, not the file contents.
        with open('data/{}/{}.html'.format(self.webtoon_id, self.no)) as f:
            soup = BeautifulSoup(f, 'lxml')
        tag1 = Tag(name="html")
        soup.insert(0, tag1)
Example #13
    def clean_text(self):
        text = self.cleaned_data["text"]

        soup = BeautifulSoup(text, "html.parser")
        if not isinstance(soup.contents[0], Doctype):
            doctype = Doctype("html")
            soup.insert(0, doctype)

        return str(soup)
Example #14
	def addTagReferences(self, dirResult, fname, tagTypeCorpus, typeCorpus, refsAfterSVM=[]): #get "listRef" to check deleted notes
		"""
		Add ignored tags from initial file
		Check the SVM classification result of reference to give <nonbibl> tag at the final construction
		Call File::buildReferences for the modification and punctuation management then print the result
		
		Parameters
		----------
		dirResult : string
			directory for output files
		fname : string
			output filename
		tagTypeCorpus :
		typeCorpus : int, {1, 2, 3}
			type of corpus
			1 : corpus 1, 2 : corpus 2...
		refsAfterSVM : list
		"""
		tmp_str = ""
		references = []
		fileRes = dirResult+fname
		for line in open (fileRes, 'r', encoding='utf8') :
			tmp_str = tmp_str + ' ' + line
		
		soup = BeautifulSoup (tmp_str)
		s = soup.findAll ("bibl")
		
		cpt = 0 #total reference count
		for fichier in self.fichiers: # Original data
			nbRefFile = fichier.nbReference(typeCorpus)
			references[:] = []
			cptRef = 0 # reference count in the file
			for ref in s:
				if cptRef < nbRefFile:
					if len(refsAfterSVM) > 0 and refsAfterSVM[cpt].train == -1 : #if the note (now tagged as <bibl>) is classified non-bibl
							for tag in (s[cpt]).findAll(True) :
								tag.replaceWith(tag.renderContents())
							s2 = BeautifulSoup() #prepare tag sets <bibl><nonbibl></nonbibl></bibl>
							tag1 = s2.new_tag("bibl")
							tag2 = s2.new_tag("nonbibl")
							s2.insert(0, tag1)
							tag1.insert(0, tag2)
							tag2.insert(0, s[cpt].renderContents()) #put the unwrapped contents in the middle of above tag sets
							references.append(s2.find("bibl")) #make s2 have found bibl
					else :
						references.append(s[cpt])
				else:
					break
				cptRef += 1
				cpt += 1
			
			# Build references in the original files and save them at the root of dirResult
			dirResultRoot = os.path.abspath(os.path.join(dirResult, os.path.pardir))+'/'
			fichier.buildReferences(references, tagTypeCorpus, dirResultRoot) #new result printing
			
		return
Example #15
 def append_sender_to_message(message_plain: str, message_html: str, sender: str) -> Tuple[str, str]:
     message_plain = f"{sender}: {message_plain}"
     message_soup = BeautifulSoup(message_html, "html.parser")
     sender_name_soup = BeautifulSoup(f"<b>{sender}</b>: ", "html.parser")
     first_tag = message_soup.find()
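     # If the message already starts with a <p>, put the bold sender inside
     # it; otherwise prepend the sender fragment to the whole message.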
     if first_tag.name == "p":
         first_tag.insert(0, sender_name_soup)
     else:
         message_soup.insert(0, sender_name_soup)
     return message_plain, str(message_soup)
Example #16
def _inline_script(script_tag: PageElement, script_file: Path) -> bool:
    """ replacement callable to replace scripts for inline_data """

    script_content = NavigableString(script_file.read_text())
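    # NavigableString lets the raw file contents be inserted as a text node;
    # the throwaway BeautifulSoup below only serves as a tag factory for new_tag().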

    new_script_tag = BeautifulSoup(features="html.parser").new_tag("script")
    new_script_tag.insert(0, script_content)
    new_script_tag["type"] = "text/javascript"

    script_tag.replaceWith(new_script_tag)
Example #17
def _inline_css(style_tag: PageElement, style_file: Path) -> bool:
    """ replacement callable to replace stylesheets for inline_data """

    style_content = NavigableString(style_file.read_text())

    new_style_tag = BeautifulSoup(features="html.parser").new_tag("style")
    new_style_tag.insert(0, style_content)
    new_style_tag["type"] = "text/css"

    style_tag.replaceWith(new_style_tag)
Example #18
 def append_sender_to_message(message_plain: str, message_html: str, sender: str) -> Tuple[str, str]:
     message_plain = "{}: {}".format(sender, message_plain)
     message_soup = BeautifulSoup(message_html, "html.parser")
     sender_name_soup = BeautifulSoup("<b>{}</b>: ".format(sender), "html.parser")
     first_tag = message_soup.find()
     if first_tag.name == "p":
         first_tag.insert(0, sender_name_soup)
     else:
         message_soup.insert(0, sender_name_soup)
     return message_plain, str(message_soup)
Example #19
def add_mathjax(ast: BeautifulSoup) -> BeautifulSoup:
    src_1 = "https://polyfill.io/v3/polyfill.min.js?features=es6"
    tag_1 = ast.new_tag('script', src=src_1)
    src_2 = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"
    # <script id="MathJax-script" async src=></script>
    tag_2 = ast.new_tag('script', src=src_2, id='MathJax-script')
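    # Setting the attribute value to None makes bs4 serialize it as a bare
    # boolean attribute: <script ... async>.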
    tag_2.attrs['async'] = None

    ast.insert(0, tag_2)
    ast.insert(0, tag_1)
    return ast
Example #20
def get_html_listing_soup(
    in_folder: Union[Path, str],
    page_title: Optional[str] = None,
    out_file: Optional[Union[Path, str]] = None,
) -> BeautifulSoup:

    in_folder = Path(in_folder)

    soup = BeautifulSoup("", "html5lib")
    cast(Tag, soup.find("html"))["lang"] = "en"

    soup.insert(0, Doctype("html"))

    if page_title is None:
        page_title = in_folder.stem

    head = cast(Tag, soup.find("head"))
    title = soup.new_tag("title")
    title.string = page_title
    head.append(title)

    body = cast(Tag, soup.find("body"))
    ul: Tag = soup.new_tag("ul")
    body.append(ul)

    now_sec = int(time.time())
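    # now_sec is appended to every generated link as a cache-busting query string.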
    inlined_suffix_regex = re.compile(r"_inlined$")

    li: Tag
    for demo_full_path in sorted(in_folder.glob("**/*.html")):
        if demo_full_path.is_dir() or demo_full_path.name == "index.html":
            continue

        li = soup.new_tag("li")
        ul.append(li)

        demo_relative_path = urllib.parse.quote(str(
            demo_full_path.relative_to(in_folder)),
                                                safe="/")
        a = soup.new_tag(
            "a",
            href=(f"./{demo_relative_path}?t={now_sec}"),
        )

        demo_name = inlined_suffix_regex.sub("", demo_full_path.stem)
        a.string = demo_name
        li.append(a)

    if out_file is None:
        out_file = in_folder / "index.html"

    _ = Path(out_file).write_text(str(soup))

    return soup
Example #21
def txt_link_downloader(html_link):

    soup = BeautifulSoup(html_link, 'html.parser')
    list_df = []
    batch = soup.find_all('td')
    counter = 0
    for index, i in enumerate(xrange(0, len(batch), 6)):
        list_df.append(map(lambda x: x.get_text(), batch[i:i + 6]))

        url_end = BeautifulSoup(batch[i + 2].encode('utf-8'),
                                'html.parser').find('a').get('href')
        url = 'http://www.the-numbers.com' + url_end
        list_df[index].append(url)

        response = urllib2.urlopen(url)
        main_doc = response.read()
        soup = BeautifulSoup(main_doc, 'html.parser')

        mpaaRating = []
        for tr in soup.findAll('tr'):
            for td in tr.findAll('td'):
                mpaaRating.append(td.get_text())
        mpaaRating = [unidecode.unidecode(x).strip() for x in mpaaRating]

        list_of_variables = [
            'Genre:', 'Running Time:', 'MPAA Rating:', 'Production Companies:',
            'Domestic Releases:', 'Domestic DVD Sales',
            'Domestic Blu-ray Sales', 'Total Domestic Video Sales',
            'Rotten Tomatoes'
        ]

        second_page = solver(list_of_variables, mpaaRating)
        list_df[index].extend(second_page)

        response = urllib2.urlopen(url)
        main_doc = response.read()
        soup = BeautifulSoup(main_doc, 'html.parser')
        soup = soup.find(text=re.compile(
            'Weekend Box Office Performance')).parent.parent.find(
                'div', attrs={"id": "box_office_chart"})
        try:
            soup = soup.get_text()
            soup = unicodedata.normalize('NFKD',
                                         soup).encode('utf-8').split()[4:35]
            soup.insert(3, 'None')
            list_df[index].extend(soup)
        except:
            pass

        counter += 1
        #sets upper limit, max is 5230 as of 10/9/2016
        if counter == 2000:
            return DataFrame(list_df)
Example #22
    def prep_html(self, base_url, target, result, url):
        '''
        Replace variables in the email HTML with proper values and insert the tracking image URL if needed.
        '''
        # get result for this target in this campaign
        #result = next((x for x in target.results if x.campaign_id == campaign_id), None)
        #result = Result.query.filter_by(campaign_id=int(campaign_id), person_id=target.id).first()
        # get if campaign is using SSL
        ssl = result.campaign.ssl
        # get port the worker will host on
        port = result.campaign.port
        # get the domain name the campaign is using
        domain = result.campaign.domain.domain

        payload_url_path = result.campaign.payload_url

        # determine if base URLs are using HTTP/HTTPS and include port number in URLs for non-standard ports
        if ssl:
            if port != 443:
                base_url = f'https://{domain}:{port}'
                payload_url = f'https://{domain}:{port}{payload_url_path}?id={result.tracker}'
            else:
                base_url = f'https://{domain}'
                payload_url = f'https://{domain}{payload_url_path}?id={result.tracker}'
        else:
            if port != 80:
                base_url = f'http://{domain}:{port}'
                payload_url = f'http://{domain}:{port}{payload_url_path}?id={result.tracker}'
            else:
                base_url = f'http://{domain}'
                payload_url = f'http://{domain}{payload_url_path}?id={result.tracker}'
        
        if url[0] != '/': url = '/' + url

        html = self.html
        if target.first_name: html = html.replace(b'{{ fname }}', str.encode(target.first_name))
        if target.last_name: html = html.replace(b'{{ lname }}', str.encode(target.last_name))
        if target.first_name and target.last_name: html = html.replace(b'{{ name }}', str.encode('%s %s' % (target.first_name, target.last_name)))
        html = html.replace(b'{{ email }}', str.encode(target.email))
        html = html.replace(b'{{ url }}', str.encode('%s%s?id=%s' % (base_url, url, result.tracker)))
        html = html.replace(b'{{ id }}', str.encode(result.tracker))
        html = html.replace(b'{{ payload_url }}', str.encode(payload_url))

        soup = BeautifulSoup(html, features='lxml')
        base = soup.new_tag('base', href=base_url)
        soup.insert(1, base)

        if self.track:
            tracker = soup.new_tag('img', alt='', src=f'{base_url}/default/{result.tracker}/logo.png')
            soup.find('body').insert_after(tracker)
        html = str(soup).encode()

        return html
Example #23
def adder(fpath):
	with open(fpath) as fp:

		soup = BeautifulSoup(fp, "html.parser")
		new_child = r"{% load static %}"
		soup.insert(0, new_child)
		img = soup.find_all("img") 
		link = soup.find_all("link") # for CSS use only
		script = soup.find_all("script")

		regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
		
		
		try:
			for i in link:
				url = re.findall(regex, i.attrs["href"])
				if url == []:
					temp = i.attrs["href"]
					temp2 = r"{% static '" + temp + r"' %}"
					i.attrs["href"] = temp2
		except:
			pass

		try:
			for i in img:
				url = re.findall(regex, i.attrs["src"])
				if url == []:
					temp = i.attrs["src"]
					temp2 = r"{% static '" + temp + r"' %}"
					i.attrs["src"] = temp2
		except:
			pass

		try:
			for i in script:
				url = re.findall(regex, i.attrs["src"])
				if url == []:
					temp = i.attrs["src"]
					temp2 = r"{% static '" + temp + r"' %}"
					i.attrs["src"] = temp2
					# <script data-cfasync="false" src="../../cdn-cgi/scripts/5c5dd728/cloudflare-static/email-decode.min.js"></script>
		except:
			pass
	# fp is closed automatically when the with-block exits.

	import io
	with io.open(fpath, "w", encoding="utf-8") as f:
		f.write(str(soup))
	return "Added {% static %} tags in " + fpath + ' ...Done'
Example #24
def clean_summary(instance):
    if type(instance) == Article:
        summary = instance.summary
        summary = BeautifulSoup(instance.summary, 'html.parser')
        images = summary.findAll('img')
        if (len(images) > maximum_images):
            for image in images[maximum_images:]:
                image.extract()
        if len(images) < 1 and minimum_one:  #try to find one
            content = BeautifulSoup(instance.content, 'html.parser')
            first_image = content.find('img')
            if first_image:
                summary.insert(0, first_image)
        instance._summary = text_type(summary)
Example #26
    def clean_text(self):
        text = self.cleaned_data["text"]

        soup = BeautifulSoup(text, "html.parser")
        if not isinstance(soup.contents[0], Doctype):
            doctype = Doctype("html")
            soup.insert(0, doctype)

        imgid = 0
        for img in soup.findAll("img"):
            img["id"] = "img%s" % imgid
            imgid += 1

        return str(soup)
Example #27
def split_chars(data):
    soup = BeautifulSoup(data, 'html.parser')
    paths=[]
    svgs=[]
    for i in range(6):
        a=soup.find('path').extract()
        if a.get('fill') != 'none':
            paths.append(a)
    for path in paths:
        outer_tag = BeautifulSoup(str(soup), 'html.parser').find('svg').extract()
        outer_tag.insert(1,path)
        svgs.append(str(outer_tag))
    
    return svgs
Example #28
def build_card(article):
    tags = BeautifulSoup('', 'html.parser')
    for i, tag in enumerate(article.tags):
        soup = BeautifulSoup(f'<li><a href="#">{tag}</a></li>', 'html.parser')
        tags.insert(i, soup)

    card = ARTICLE_CARD.format(
        title=article.title,
        tags=tags.prettify(),
        date=article.date.strftime('%Y-%m-%d'),
        readable_date=article.date.strftime('%B %d, %Y'),
        summary=article.summary.prettify(),
        article_link=article.path)
    return BeautifulSoup(card, 'html.parser')
Example #29
def get_content_to_file( url, htmlfilename ):
    with requests.Session( ) as s:
        s.headers =  { 'Content-Type' : 'application/json',
                       'x-api-key' : _apiKey }
        response = s.get( 'https://mercury.postlight.com/parser',
                          params = { 'url' : url })
        if response.status_code != 200:
            return 'Error, no data from %s' % url
        data = response.json( )
        content = data['content']
        title = data['title']
        date_publish_string = data['date_published']
        excerpt = data['excerpt']
        url = data['url']
        html = BeautifulSoup( content, 'lxml' )
       # html = BeautifulSoup( content, 'html5lib' )
        #
        ## now all png objects to inline
        if not os.path.exists(title):
            os.mkdir(title)
            
        for img in html.find_all('img'):
            imgURL = img['src']
            split = urlparse.urlsplit(imgURL)
            filename = "./%s/"%(title) + split.path.split("/")[-1]
            urllib.urlretrieve(imgURL, filename)
            img['src'] = filename
            # if imgURL.lower( ).endswith('.png'):                
            #     img_64 = "data:image/png;base64," + base64.b64encode( urllib.urlopen( imgURL ).read( ) )
            # elif imgURL.lower( ).endswith( '.jpg' ):
            #     img_64 = "data:image/jpg;base64," + base64.b64encode( urllib.urlopen( imgURL ).read( ) )
            # else:
            #     img_64 = None
            # #
            # if img_64 is not None:
            #     img['src'] = img_64
        if not os.listdir(title) :
            os.rmdir(title)

        htag = html.new_tag( 'head' )
        mtag = html.new_tag( 'meta' )
        mtag['charset'] = 'utf-8'
        htag.append( mtag )
        html.insert(0, htag )
        if not htmlfilename :
            htmlfilename = title + '.html'
            
        with codecs.open( htmlfilename, 'w', 'utf-8') as openfile:
            openfile.write('%s\n' % html.prettify( ) )
Example #30
def get_content_to_file(url, htmlfilename):
    with requests.Session() as s:
        s.headers = {'Content-Type': 'application/json', 'x-api-key': _apiKey}
        response = s.get('https://mercury.postlight.com/parser',
                         params={'url': url})
        if response.status_code != 200:
            return 'Error, no data from %s' % url
        data = response.json()
        content = data['content']
        title = data['title']
        date_publish_string = data['date_published']
        excerpt = data['excerpt']
        url = data['url']
        #        html = BeautifulSoup( content, 'lxml' )
        html = BeautifulSoup(content, 'html5lib')
        #
        ## now all png objects to inline
        if not os.path.exists(title):
            os.mkdir(title)

        for img in html.find_all('img'):
            imgURL = img['src']
            split = urlparse.urlsplit(imgURL)
            filename = "./%s/" % (title) + split.path.split("/")[-1]
            urllib.urlretrieve(imgURL, filename)
            img['src'] = filename
            # if imgURL.lower( ).endswith('.png'):
            #     img_64 = "data:image/png;base64," + base64.b64encode( urllib.urlopen( imgURL ).read( ) )
            # elif imgURL.lower( ).endswith( '.jpg' ):
            #     img_64 = "data:image/jpg;base64," + base64.b64encode( urllib.urlopen( imgURL ).read( ) )
            # else:
            #     img_64 = None
            # #
            # if img_64 is not None:
            #     img['src'] = img_64
        if not os.listdir(title):
            os.rmdir(title)

        htag = html.new_tag('head')
        mtag = html.new_tag('meta')
        mtag['charset'] = 'utf-8'
        htag.append(mtag)
        html.insert(0, htag)
        if not htmlfilename:
            htmlfilename = title + '.html'

        with codecs.open(htmlfilename, 'w', 'utf-8') as openfile:
            openfile.write('%s\n' % html.prettify())
Example #31
def create_xml():
    """ Create XML. Return string (soup.prettify()). """
    soup = BeautifulSoup()

    tags = [{
        "tag": "root",
        "count": 1,
        "parent": None
    }, {
        "tag": "var",
        "count": 2,
        "parent": "root",
        "attr": {
            "name": ["id", "level"],
            "value": ["rand_str", "rand_int"]
        }
    }, {
        "tag": "objects",
        "count": 1,
        "parent": "root"
    }, {
        "tag": "object",
        "count": int(random.uniform(1, 11)),
        "parent": "objects",
        "attr": {
            "name": "rand_str"
        }
    }]

    for tag_dict in tags:
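        # Find this spec's parent in the soup built so far and append `count`
        # copies of the tag, filling in attributes; if no parent tag exists,
        # the new tag is inserted at the top of the soup.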
        parent = soup.find(tag_dict["parent"])
        for tag_number in range(tag_dict["count"]):
            new_tag = soup.new_tag(tag_dict["tag"])
            attr_dict = tag_dict.get("attr")
            if attr_dict is not None:
                for attr_key in attr_dict:
                    if isinstance(attr_dict[attr_key], list):
                        param = attr_dict[attr_key][tag_number]
                        new_tag[attr_key] = return_value(param)
                    else:
                        new_tag[attr_key] = return_value(attr_dict[attr_key])

            if parent is None:
                soup.insert(0, new_tag)
            else:
                parent.insert(len(parent), new_tag)

    return soup.prettify()
Example #32
def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
    head = soup.new_tag("head")
    soup.insert(0, head)

    simplecss_link: Tag = soup.new_tag("link")
    # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
    simplecss_link["rel"] = "stylesheet"
    simplecss_link["href"] = "https://cdn.simplecss.org/simple.css"
    head.append(simplecss_link)

    # Basic style tags for compat
    style: Tag = soup.new_tag("style")
    style.append(_STYLE_TAG_CONTENT)
    head.append(style)

    return soup
Example #33
    def save(self, commit=True):
        m = super(MailWithAttachmentForm, self).save(commit=False)

        soup = BeautifulSoup(m.text, "html.parser")
        if not isinstance(soup.contents[0], Doctype):
            doctype = Doctype("html")
            soup.insert(0, doctype)

        m.text = str(soup)

        m.template_type = "2"

        if commit:
            m.save()

        return m
Example #34
def processSaveReport():
    print('Processing and saving report')
    cwd = os.getcwd()
    folder = 'templates'
    filename = 'svreport.html'
    svreportLocation = cwd + os.sep + folder + os.sep + filename

    try:
        soup = BeautifulSoup(open(svreportLocation), features="html.parser")
        print(f'Successfully parsed report at location: {svreportLocation}')
    except:
        print(f"Couldn't parse report at location: {svreportLocation}")

    navUpdateString = """
        //Sets nav-link to active for this page.
        $(document).ready(function () {
          $('#datagraphs').addClass('active');
        });
    """
    insertTopString = "{% extends 'base.html' %}\n{% block content %}"
    insertBottomString = "{% endblock %}"

    #Remove doctype:
    for item in soup.contents:
        if isinstance(item, Doctype):
            item.extract()
    #end remove doctype

    #Remove <html></html> tags
    for match in soup.findAll('html'):
        match.unwrap()

    #Remove <link> tags
    soup.find('link').extract()

    soup.insert(0, insertTopString)
    soup.insert(len(soup) + 1, insertBottomString)

    #script insert
    scriptTag = soup.new_tag('script')
    scriptTag.append(navUpdateString)
    #soup.body.insert(len(soup.body.contents), navUpdateString)
    head = soup.find('head')
    head.insert(1, scriptTag)

    #print(soup.prettify())
    writeFile(svreportLocation, soup)
Example #35
    def _create_new_soup_with_div_container(self, doctype_element, head_element, title_string):
        container_div = "<div class='container'></div>"
        body_soup = BeautifulSoup(str(container_div), "html5lib")
        body_soup = self._insert_google_adsense(body_soup)

        # Insert document type
        body_soup.insert(0, doctype_element)

        # Insert head element
        title = body_soup.new_tag('title')
        title.string = title_string
        head_clone = self._create_clone(head_element)
        head_clone.append(title)
        new_first_page_head = body_soup.head
        new_first_page_head.replace_with(head_clone)

        return body_soup
Example #36
def clean_summary(instance):
    if type(instance) == Article:
        summary = BeautifulSoup(instance.summary, 'html.parser')

        if clean_target_tag:
            # remove specific type of tags with class
            remove_targets = summary.findAll(clean_target_tag, attrs={"class": clean_target_class})
            for target in remove_targets:
                # print(target)
                target.extract()

        images = summary.findAll('img')
        if (len(images) > maximum_images):
            for image in images[maximum_images:]:
                image.extract()
        if len(images) < 1 and minimum_one: #try to find one
            content = BeautifulSoup(instance.content, 'html.parser')
            first_image = content.find('img')
            if first_image:
                summary.insert(0, first_image)
        instance._summary = text_type(summary)
Example #37
def prettyprint_email_as_html(email_json):

    soup = BeautifulSoup()
    html = soup.new_tag('html')
    head = soup.new_tag('head')
    title = soup.new_tag('title')
    html.append(head)
    head.append(title)
    title.append(email_json["id"])

    body = soup.new_tag('body')
    html.append(body)
    soup.insert(0, html)

    body.append("ID: {0}".format(email_json["id"]))
    body.append(soup.new_tag('br'))

    body.append("From: {0}".format(email_json["senders_line"][0]))
    body.append(soup.new_tag('br'))
    body.append("To: {0}".format(email_json["tos_line"][0]))
    body.append(soup.new_tag('br'))
    if email_json["ccs_line"]:
        body.append("Cc: {0}".format(email_json["ccs_line"][0]))
        body.append(soup.new_tag('br'))

    body.append("Sent: {0}".format(email_json["datetime"]))
    body.append(soup.new_tag('br'))

    body.append("Subject: {0}".format(email_json["subject"]))
    body.append(soup.new_tag('br'))

    pre = soup.new_tag('pre')
    pre.append(email_json["body"])
    body.append(pre)

    return soup.prettify()
Example #38
def build_dumb_bonita_error_body(exception='',code='',message=''):
    # Add your own Bonita java Exception in this dict to make your call shorter
    # So you can call with exception='UserNotFoundException'
    # rather than exception = 'org.ow2.bonita.facade.exception.UserNotFoundException'
    java_exception_dict = {'UserNotFoundException':'org.ow2.bonita.facade.exception.UserNotFoundException',
                           'ProcessNotFoundException':'org.ow2.bonita.facade.exception.ProcessNotFoundException',
                           'GroupNotFoundException':'org.ow2.bonita.facade.exception.GroupNotFoundException',
                           'RoleNotFoundException':'org.ow2.bonita.facade.exception.RoleNotFoundException'}
    exception_text = java_exception_dict.get(exception,exception)

    # Build XML body
    soup = BeautifulSoup('','xml')
    tag_exception = soup.new_tag(exception_text)
    tag_code = soup.new_tag('errorCode')
    tag_message = soup.new_tag('detailMessage')

    tag_code.string = code
    tag_message.string = message

    soup.insert(0,tag_exception)
    tag_exception.insert(0,tag_code)
    tag_exception.insert(1,tag_message)

    return unicode(soup)
Example #39
soup =  BeautifulSoup(features='html5lib')

# create tags
tag1 = soup.new_tag("person")
tag2 = soup.new_tag("name")
tag3 = soup.new_tag("location")

# add attributes
tag2['first'] = 'John'
tag2['last'] = 'Smith'
tag3['country'] = 'uk'

# add text
text = NavigableString("John Gary Smith")

# build soup
soup.insert(0, tag1)
tag1.insert(0, tag2)
tag1.insert(1, tag3)
tag2.insert(0, text)

print(soup)
print("----------------")
print(soup.prettify())


Example #40
def update_html_for_static(book, html_content, epub=False):
    soup = BeautifulSoup(html_content, 'lxml-html')

    # remove encoding as we're saving to UTF8 anyway
    encoding_specified = False
    for meta in soup.findAll('meta'):
        if 'charset' in meta.attrs:
            encoding_specified = True
            # logger.debug("found <meta> tag with charset `{}`"
            #              .format(meta.attrs.get('charset')))
            del(meta.attrs['charset'])
        elif 'content' in meta.attrs \
                and 'charset=' in meta.attrs.get('content'):
            try:
                ctype, ccharset = meta.attrs.get('content').split(';', 1)
            except:
                continue
            else:
                encoding_specified = True
            # logger.debug("found <meta> tag with content;charset `{}`"
            #              .format(meta.attrs.get('content')))
            meta.attrs['content'] = ctype
    if encoding_specified:
        # logger.debug("charset was found and removed")
        pass

    # update all <img> links from images/xxx.xxx to {id}_xxx.xxx
    if not epub:
        for img in soup.findAll('img'):
            if 'src' in img.attrs:
                img.attrs['src'] = img.attrs['src'].replace(
                    'images/', '{id}_'.format(id=book.id))

    # update all <a> links to internal HTML pages
    # should only apply to relative URLs to HTML files.
    # examples on #16816, #22889, #30021
    def replacablement_link(book, url):
        try:
            urlp, anchor = url.rsplit('#', 1)
        except ValueError:
            urlp = url
            anchor = None
        if '/' in urlp:
            return None

        if len(urlp.strip()):
            nurl = "{id}_{url}".format(id=book.id, url=urlp)
        else:
            nurl = ""

        if anchor is not None:
            return "#".join([nurl, anchor])

        return nurl

    if not epub:
        for link in soup.findAll('a'):
            new_link = replacablement_link(
                book=book, url=link.attrs.get('href', ''))
            if new_link is not None:
                link.attrs['href'] = new_link

    # Add the title
    if not epub:
        soup.title.string = book.title

    patterns = [
        ("*** START OF THE PROJECT GUTENBERG EBOOK",
         "*** END OF THE PROJECT GUTENBERG EBOOK"),

        ("***START OF THE PROJECT GUTENBERG EBOOK",
         "***END OF THE PROJECT GUTENBERG EBOOK"),

        ("<><><><><><><><><><><><><><><><><><><><><><><><><><><><>"
         "<><><><><><>",
         "<><><><><><><><><><><><><><><><><><><><><><><><><><><><>"
         "<><><><><><>"),

        # ePub only
        ("*** START OF THIS PROJECT GUTENBERG EBOOK",
         "*** START: FULL LICENSE ***"),
        ("*END THE SMALL PRINT! FOR PUBLIC DOMAIN ETEXT",
         "——————————————————————————-"),

        ("*** START OF THIS PROJECT GUTENBERG EBOOK",
         "*** END OF THIS PROJECT GUTENBERG EBOOK"),

        ("***START OF THE PROJECT GUTENBERG",
         "***END OF THE PROJECT GUTENBERG EBOOK"),

        ("COPYRIGHT PROTECTED ETEXTS*END*",
         "==========================================================="),

        ("Nous remercions la Bibliothèque Nationale de France qui a mis à",
         "The Project Gutenberg Etext of"),
        ("Nous remercions la Bibliothèque Nationale de France qui a mis à",
         "End of The Project Gutenberg EBook"),

        ("=========================================================="
         "===============",
         "——————————————————————————-"),

        ("Project Gutenberg Etext", "End of Project Gutenberg Etext"),

        ("Text encoding is iso-8859-1", "Fin de Project Gutenberg Etext"),

        ("—————————————————-", "Encode an ISO 8859/1 "
         "Etext into LaTeX or HTML"),
    ]

    body = soup.find('body')
    try:
        is_encapsulated_in_div = sum(
            [1 for e in body.children
             if not isinstance(e, bs4.NavigableString)]) == 1
    except:
        is_encapsulated_in_div = False

    if is_encapsulated_in_div and not epub:
        DEBUG_COUNT.append((book.id, book.title))

    if not is_encapsulated_in_div:
        for start_of_text, end_of_text in patterns:
            if start_of_text not in body.text and end_of_text not in body.text:
                continue

            if start_of_text in body.text and end_of_text in body.text:
                remove = True
                for child in body.children:
                    if isinstance(child, bs4.NavigableString):
                        continue
                    if end_of_text in getattr(child, 'text', ''):
                        remove = True
                    if start_of_text in getattr(child, 'text', ''):
                        child.decompose()
                        remove = False
                    if remove:
                        child.decompose()
                break

            elif start_of_text in body.text:
                # logger.debug("FOUND START: {}".format(start_of_text))
                remove = True
                for child in body.children:
                    if isinstance(child, bs4.NavigableString):
                        continue
                    if start_of_text in getattr(child, 'text', ''):
                        child.decompose()
                        remove = False
                    if remove:
                        child.decompose()
                break
            elif end_of_text in body.text:
                # logger.debug("FOUND END: {}".format(end_of_text))
                remove = False
                for child in body.children:
                    if isinstance(child, bs4.NavigableString):
                        continue
                    if end_of_text in getattr(child, 'text', ''):
                        remove = True
                    if remove:
                        child.decompose()
                break

    # build infobox
    if not epub:
        infobox = jinja_env.get_template('book_infobox.html')
        infobox_html = infobox.render({'book': book})
        info_soup = BeautifulSoup(infobox_html, 'lxml-html')
        body.insert(0, info_soup.find('div'))

    # if there is no charset, set it to utf8
    if not epub:
        meta = BeautifulSoup('<meta http-equiv="Content-Type" '
                             'content="text/html; charset=UTF-8" />',
                             'lxml-html')
        head = soup.find('head')
        html = soup.find('html')
        if head:
            head.insert(0, meta.head.contents[0])
        elif html:
            html.insert(0, meta.head)
        else:
            soup.insert(0, meta.head)

        return html

    return soup
Example #41
browser.open(base + "/news")

link = browser.get_link(text="UK")
browser.open(base + link['href'])
soup = browser.parsed

# pick out anchors that are tagged with the story class
# tags = soup.findAll("a", "story")
tags = soup.findAll("a")
newSoup = BeautifulSoup(features="html5lib")

for tag in tags:
    # add base url if it is missing from href
    if tag['href'][0] == "/": tag['href'] = base + tag['href']
    # add tag to new soup followed by a <br>  
    newSoup.insert(0, tag)
    br = soup.new_tag("br")
    newSoup.insert(0, br)

# convert soup into a string
data = str(newSoup)

# save scraped html to a file
try:
    f = open("out.html", "w",  encoding="UTF-8")
    f.write(data)
    f.close()
except IOError as e:
    print(e)

# display local file in browser
Example #42
        <img src="./output/wc.png" alt="" height="250" width="250" />
	<div id="hashtag">
    		<h5>Hashtag Count</h5>
    	</div>
        <img src="./output/hashtag.png" alt="" height="250" width="250" />
    </div>
<table>
</table>
</body>
</html>
'''

'''Inserts the URL sentiment into the HTML tag <table>'''
soup = BeautifulSoup(doc,'html.parser')
body = soup.new_tag('body')
soup.insert(0, body)
table = soup.new_tag('table')
body.insert(0, table)


with open("./output/url_sentiment.txt") as infile:
    for line in infile:
        row = soup.new_tag('tr')
        col1, col2, col3 = line.split()
        for coltext in (col3, col2, col1): # important that you reverse order
            col = soup.new_tag('td')
            col.string = coltext
            row.insert(0, col)
        table.insert(len(table.contents), row)

with open('sentiment.html', 'w') as outfile: