Example #1
File: toc.py Project: sbadia/site
def generate_toc(content):
    if isinstance(content, contents.Static):
        return

    _toc_run = content.metadata.get(
            'toc_run',
            content.settings['TOC']['TOC_RUN'])
    if not _toc_run == 'true':
        return

    all_ids = set()
    title = content.metadata.get('title', 'Title')
    tree = node = HtmlTreeNode(None, '', 'h0', '')
    soup = BeautifulSoup(content._content, 'html.parser')
    settoc = False

    try:
        header_re = re.compile(content.metadata.get(
            'toc_headers', content.settings['TOC']['TOC_HEADERS']))
    except re.error as e:
        logger.error("TOC_HEADERS '%s' is not a valid re\n%s",
                     content.settings['TOC']['TOC_HEADERS'])
        raise e

    for header in soup.findAll(header_re):
        settoc = True
        node, new_header = node.add(header, all_ids)
        header.replaceWith(new_header)  # to get our ids back into soup

    if (settoc):
        tree_string = '{}'.format(tree)
        tree_soup = BeautifulSoup(tree_string, 'html.parser')
        content.toc = tree_soup.decode(formatter='html')
    content._content = soup.decode(formatter='html')
Example #2
def markdown_figure(instance):
    """Wraps img in figure tags, adds figcation and if BOOTSTRAPPER_FIGURES is set adds bootstrap attributes to figure tags."""

    if instance._content is not None:
        content = instance._content
        soup = BeautifulSoup(content, 'html.parser')
        defaults = []
        default_attribute_key = 'class'
        figure_default = [{'figure': {'class': ['figure']}, 'figcaption': {'class': ['figure-caption']}, 'figure > img': {'class': ['figure-img']}}]
        is_default_set = lambda setting: True if setting in instance.settings and instance.settings[setting] else False

        if is_default_set('MDF_DEFAULT_CONFIG'):
            defaults.extend([instance.settings['MDF_DEFAULT_CONFIG']])

        if is_default_set('BOOTSTRAPPER_FIGURES'):
            defaults.extend(figure_default)

        figurify(soup)
        for default in defaults:
            for selector, value in default.items():
                if isinstance(value, dict):
                    for attribute_key, attribute_value in default[selector].items():
                        replace_in_with(soup, selector, attribute_key, attribute_value)

                else:
                    replace_in_with(soup, selector, default_attribute_key, value)

        instance._content = soup.decode(formatter="html")
Example #3
def convert_summary(input):
    try:
        soup = BeautifulSoup(input, "html.parser")
        value = soup.decode(formatter="minimal")
    except HTMLParseError:
        return input
    return value
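
A quick illustration of convert_summary on a made-up snippet (the input string below is hypothetical): with the "minimal" formatter only &, < and > stay escaped, while other entities are decoded.

print(convert_summary('<p>Fish &amp; chips &mdash; &lt;tasty&gt;</p>'))
# -> <p>Fish &amp; chips — &lt;tasty&gt;</p>
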
def content_object_init(instance):

	if instance._content is not None:
		content = instance._content
		# use Python's built-in parser so no duplicated html & body tags appear, or use tag.unwrap()
		text = BeautifulSoup(content, "html.parser")
		
		if 'a' in content:
			for link in text.find_all(href=re.compile("(.+?)>")):
				url = link.get('href')
				m = re.search(r"(.+?)>", url).groups()
				name = m[0]
				if name in interlinks:
					hi = url.replace(name+">",interlinks[name])
					link['href'] = hi
		if 'img' in content:
			for img in text.find_all('img', src=re.compile("(.+?)>")):
				url = img.get('src')
				m = re.search(r"(.+?)>", url).groups()
				name = m[0]
				if name in interlinks:
					hi = url.replace(name+">",interlinks[name])
					img['src'] = hi

		instance._content = text.decode()
Example #5
def remove_footnotes(content):
    '''
    Strip footnote reference links from 'content'
    '''
    if content is None:
        return None

    soup = BeautifulSoup(content, "lxml")
    soup.html.unwrap()
    soup.body.unwrap()

    for ref_footnote in soup.findAll("a", class_="footnote-reference"):
        # Remove the footnote-reference link and the space preceding it

        # Example:
        # <a href="...">Actual link</a> <a class="footnote-reference">[1]</a>

        # Access previous element in tree.
        # This should be a 'NavigableString' with a single space.
        prev = ref_footnote.previous_sibling
        if prev.string != " ":
            raise Exception(
                "Unexpected HTML surrounding summary footnote reference!")

        # If that went well, remove the space and the footnote tag.
        prev.replace_with("")  # can't remove, replace with empty string
        ref_footnote.decompose()  # remove and deconstruct

    return soup.decode()
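
A minimal sketch of remove_footnotes on a hand-written snippet (the HTML below is illustrative and assumes lxml is installed, as the function requires):

sample = ('<p><a href="https://example.com">Actual link</a> '
          '<a class="footnote-reference" href="#fn1">[1]</a></p>')
print(remove_footnotes(sample))
# -> <p><a href="https://example.com">Actual link</a></p>
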
def extract_toc(content):
    if isinstance(content, contents.Static):
        return

    soup = BeautifulSoup(content._content,'html.parser')
    filename = content.source_path
    extension = path.splitext(filename)[1][1:]
    toc = None
    # if it is a Markdown file
    if extension in readers.MarkdownReader.file_extensions:
        toc = soup.find('div', class_='toc')
        if toc: toc.extract()
    # else if it is a reST file
    elif extension in readers.RstReader.file_extensions:
        toc = soup.find('div', class_='contents topic')
        if toc: toc.extract()
        if toc:
            tag=BeautifulSoup(str(toc))
            tag.div['class']='toc'
            tag.div['id']=''
            p=tag.find('p', class_='topic-title first')
            if p:p.extract()
            toc=tag
    elif not toc:  # Pandoc reader
        toc = soup.find('nav', id='TOC')
    if toc:
        toc.extract()
        content._content = soup.decode()
        content.toc = toc.decode()
Example #7
    def ENMLtoText(contentENML):

        soup = BeautifulSoup(contentENML.decode('utf-8'))

        # In ENML, each line in a paragraph has a <div> tag.
        for section in soup.find_all('div'):
            if not section.br:
                section.append(soup.new_tag("br"))
            section.unwrap()

        for section in soup.select('li > p'):
            section.replace_with( section.contents[0] )

        for section in soup.select('li > br'):
            if section.next_sibling:
                next_sibling = section.next_sibling.next_sibling
                if next_sibling:
                    if next_sibling.find('li'):
                        section.extract()
                else:
                    section.extract()

        h2t = html2text.HTML2Text()
        h2t.body_width = 0
        content = h2t.handle(soup.decode())
        content = re.sub(r' *\n', os.linesep, content)
        content = content.replace(unichr(160), " ")
        return content.encode('utf-8')
Example #8
def nbsp_footnotes(content):
    '''
    Replace space between link and footnote with nbsp.
    '''
    if content is None:
        return None

    soup = BeautifulSoup(content, "lxml")
    soup.html.unwrap()
    soup.body.unwrap()

    for ref_footnote in soup.findAll("a", class_="footnote-reference"):
        # Example:
        # <a href="...">Actual link</a> <a class="footnote-reference">[1]</a>

        # Access previous element in tree.
        # This should be a 'NavigableString' with a single space.
        prev = ref_footnote.previous_sibling
        if prev.string != " ":
            raise Exception(
                "Unexpected HTML surrounding summary footnote reference!")

        prev.replace_with(u'\xa0')  # U+00A0 = nbsp

    return soup.decode()
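
nbsp_footnotes can be exercised the same way; with an illustrative input like the one below, the single space before the [1] marker is replaced with U+00A0 so the footnote reference can no longer wrap onto its own line.

sample = ('<p><a href="https://example.com">Actual link</a> '
          '<a class="footnote-reference" href="#fn1">[1]</a></p>')
print(nbsp_footnotes(sample))
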
Example #9
    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
Example #10
def cleanse(html_source, pretty_print=False, formatter='html'):
    if html_source is None:
        return ''
    
    # convert to unicode
    html_unicode = UnicodeDammit(html_source).unicode_markup
    
    # strip classes
    html_sans_classes = match_classes.subn(r"<\1\2>", html_unicode)[0]
    
    # strip artifacts
    html_sans_artifacts = match_artifacts.subn(r"", html_sans_classes)[0]
    
    # insert <hr> tags at section breaks
    html_with_section_breaks = match_section_breaks.subn(
        r'<hr class="section-break" />\n',
        p_corrector.subn(r'<p></p>', html_sans_artifacts)[0])[0]
    
    # strip metatags from header
    soup_sans_metatags = BeautifulSoup(html_with_section_breaks)
    ex_ = [meta.replace_with(u'') for meta in soup_sans_metatags.findAll(name='meta')]
    del ex_
    
    # decode most html entities
    html_sans_most_entities = soup_sans_metatags.decode(
        pretty_print=pretty_print,
        formatter=formatter)
    
    # consolidate multiple empty lines
    html_clean = match_multilines.subn(r"\n\n", html_sans_most_entities)[0]
    
    # return unicode
    return html_clean
Example #11
def wrap_image_tags(p):
    """ Wrap image tags in links to add Lightbox support

    Any image tag in the content with class={LBPREFIX}-{SETNAME} will be
    wrapped with an anchored href with Lightbox support.  `LBPREFIX` is defined
    in the settings file as `LIGHTBOX_PREFIX` with a default of `'lb-'`.

    :param p: pelican instance
    :return: None
    """

    lbprefix = p.settings.get('LIGHTBOX_PREFIX', 'lb-')
    lbset = p.settings.get('LIGHTBOX_SET', 'images')

    if p._content is not None:
        content = p._content
        soup = BeautifulSoup(content)

        # Wrap each image tag in an anchor with a link.  Add the
        # attribute for the lightbox set to activate.
        if 'img' in content:
            for tag in soup('img'):

                # Skip if no class tag
                if not tag.has_attr('class'):
                    continue

                for c in tag['class']:
                    c.split(lbprefix)
                    substr = c.split(lbprefix,1)

                    # If the first element of the split is empty then the prefix
                    # is at the start of the string c.  We also must check that
                    # c is not empty.
                    if c and not substr[0]:
                        if substr[1]:
                            gallery = substr[1]
                        else:
                            gallery = lbset

                        link_wrapper = soup.new_tag("a", href=tag['src'])
                        link_wrapper['data-lightbox'] = gallery  # data-lightbox has to be added separately because a hyphenated name is not a valid keyword argument (the '-' parses as a minus sign)

                        # Set the title (ie: lightbox caption) to the alt-text
                        if tag.has_attr('alt'):
                            link_wrapper['title'] = tag['alt']

                        # Set the title attribute as a caption, if the image is
                        # wrapped in a figure
                        fig = tag.find_parent('div', 'figure')
                        if fig:
                            caption = fig.findChild('p', 'caption')
                            if caption:
                                link_wrapper['title'] = caption.get_text()

                        tag.wrap(link_wrapper)

                        break # So we only use the first class specified

            p._content = soup.decode()
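
A minimal sketch of driving wrap_image_tags with a stand-in for the pelican object; the Fake class, its settings and the image path are placeholder values, not part of the plugin.

class Fake:
    settings = {'LIGHTBOX_PREFIX': 'lb-', 'LIGHTBOX_SET': 'images'}
    _content = '<img class="lb-holiday" src="/images/beach.jpg" alt="Beach">'

p = Fake()
wrap_image_tags(p)
print(p._content)
# the img ends up wrapped in <a href="/images/beach.jpg" data-lightbox="holiday" title="Beach">
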
Example #12
def performOPFSourceUpdates(data, currentdir, keylist, valuelist):
    # rebuild serialized lookup dictionary
    updates = {}
    for i in range(0, len(keylist)):
        updates[ keylist[i] ] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    for tag in soup.find_all(["item","reference","site"]):
        if "href" in tag.attrs :
            href = tag["href"]
            if href.find(":") == -1 :
                parts = href.split('#')
                url = parts[0]
                fragment = ""
                if len(parts) > 1:
                    fragment = parts[1]
                bookrelpath = os.path.join(currentdir, unquoteurl(url))
                bookrelpath = os.path.normpath(bookrelpath)
                bookrelpath = bookrelpath.replace(os.sep, "/")
                if bookrelpath in updates:
                    attribute_value = updates[bookrelpath]
                    if fragment != "":
                        attribute_value = attribute_value + "#" + fragment
                    attribute_value = quoteurl(attribute_value)
                    tag["href"] = attribute_value
    newdata = soup.decode(pretty_print=True, formatter='minimal')
    return newdata
Example #13
def grab_and_parse_results(target_url):
  http_response = urllib.request.urlopen(target_url)
  html_soup = BeautifulSoup(http_response) # Grab the HTML from the given url address
  # Pull out only the lines that match the pattern given by the regular expression below
  pattern = u'[\u3030-\u9FAF][\u3030-\u9FAF0-9A-Za-z\u3001\u2026 ]+[\u300D|\u3002|\uFF01|\uFF1F|\u2026]+'
  # Return a list built from a set of unique sentences only (removes duplicates)
  return list(set(re.findall(pattern, html_soup.decode('utf-8-sig'))))
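
The sentence-matching pattern can be tried on its own against a made-up string (no network fetch involved); it keeps runs of CJK text that end in one of the listed terminators.

import re
pattern = u'[\u3030-\u9FAF][\u3030-\u9FAF0-9A-Za-z\u3001\u2026 ]+[\u300D|\u3002|\uFF01|\uFF1F|\u2026]+'
print(re.findall(pattern, u'今日は良い天気です。明日も晴れでしょう!'))
# -> two matches, one per sentence
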
Example #14
    def run(self, text):
        soup = BeautifulSoup(text, 'html.parser')
        new_soup = BeautifulSoup()

        content = new_soup.new_tag('div', **{'class': self.content_class})

        for tag in soup.children:
            if isinstance(tag, NavigableString):
                continue

            if tag.name not in self.incut_tags and len(tag.contents) == 1 and tag.contents[0].name in self.incut_tags:
                tag = tag.contents[0]

            if tag.name in self.incut_tags:
                if len(content):
                    new_soup.append(content)
                    content = new_soup.new_tag('div', **{'class': self.content_class})

                klass = self.incut_class
                if tag.name == 'iframe':
                    klass += ' ' + self.incut_video_class

                incut = soup.new_tag('div', **{'class': klass})
                incut.append(tag)
                new_soup.append(incut)
            else:
                content.append(tag)

        if len(content):
            new_soup.append(content)

        return new_soup.decode()
Example #15
def fix_urls(document, base_url):
    soup = Soup(document)
    for tag in soup('a'):
        if tag['href'].startswith('/'):
            tag['href'] = base_url + tag['href']

    return soup.decode()
def content_object_init(instance):

    if instance._content is not None:
        content = instance._content
        soup = BeautifulSoup(content)

        if "img" in content:
            for img in soup("img"):
                # TODO: Pretty sure this isn't the right way to do this, too hard coded.
                # There must be a setting that I should be using?
                src = instance.settings["PATH"] + "/images/" + os.path.split(img["src"])[1]
                im = Image.open(src)
                extra_style = "width: {}px; height: auto;".format(im.size[0])

                if instance.settings["RESPONSIVE_IMAGES"]:
                    extra_style += " max-width: 100%;"

                if img.get("style"):
                    img["style"] += extra_style
                else:
                    img["style"] = extra_style

                if img["alt"] == img["src"]:
                    img["alt"] = ""

                fig = img.find_parent("div", "figure")
                if fig:
                    if fig.get("style"):
                        fig["style"] += extra_style
                    else:
                        fig["style"] = extra_style

        instance._content = soup.decode()
Example #17
def harvest_images_in_fragment(fragment, settings):
    fragment_changed = False
    soup = BeautifulSoup(fragment)

    for img in soup.find_all('img', class_=re.compile("image-process-[-a-zA-Z0-9_]+")):
        for c in img['class']:
            match = re.search(r"image-process-([-a-zA-Z0-9_]+)", c)
            if match is not None:
                derivative = match.group(1)

                if derivative not in settings['IMAGE_PROCESS']:
                    raise RuntimeError('Derivative %s undefined.' % (derivative,))

                if isinstance(settings['IMAGE_PROCESS'][derivative], dict) and \
                        'type' not in settings['IMAGE_PROCESS'][derivative]:
                    raise RuntimeError('"type" is mandatory for %s.' % derivative)

                if isinstance(settings['IMAGE_PROCESS'][derivative], list) or \
                        (isinstance(settings['IMAGE_PROCESS'][derivative], dict) and \
                             settings['IMAGE_PROCESS'][derivative]['type'] == 'image'):

                    # Single source image specification.
                    process_img_tag(img, settings, derivative)
                    fragment_changed = True

                elif isinstance(settings['IMAGE_PROCESS'][derivative], dict) and \
                        settings['IMAGE_PROCESS'][derivative]['type'] == 'responsive-image':

                    # srcset image specification.
                    build_srcset(img, settings, derivative)
                    fragment_changed = True

                elif isinstance(settings['IMAGE_PROCESS'][derivative], dict) and \
                        settings['IMAGE_PROCESS'][derivative]['type'] == 'picture':

                    # Multiple source (picture) specification.
                    group = img.find_parent()
                    if group.name == 'div':
                        convert_div_to_picture_tag(soup, img, group, settings, derivative)
                    elif group.name == 'picture':
                        process_picture(soup, img, group, settings, derivative)
                    fragment_changed = True

                break # for c in img['class']

    if fragment_changed:
        # In Python 2, BeautifulSoup put our fragment inside html and
        # body tags, but in Python 3, it does not (maybe because it is
        # not using the same HTML parser).
        body = soup.find('body')
        if body:
            new_fragment = ''
            for element in body.children:
                new_fragment += element.decode()
        else:
            new_fragment = soup.decode()
    else:
        new_fragment = fragment

    return new_fragment
Example #18
def content_object_init(instance):

    if instance._content is not None:
        content = instance._content
        soup = BeautifulSoup(content)

        if 'img' in content:
            for img in soup('img'):
                logger.debug('PATH: %s', instance.settings['PATH'])
                logger.debug('img.src: %s', img['src'])

                img_path, img_filename = path.split(img['src'])

                logger.debug('img_path: %s', img_path)
                logger.debug('img_fname: %s', img_filename)

                lightbox_style = 'image' # All images on the same page are combined into a set

                fig = img.find_parent('a')
                if fig:
                    if not(fig.get('data-lightbox')):
                        fig['data-lightbox'] = lightbox_style
                        fig['data-title'] = img_filename

        instance._content = soup.decode()
Example #19
def process_summary(article):
    """
    Ensures summaries are not cut off.
    Also inserts mathjax script so that math will be rendered.
    """

    summary = article.summary
    summary_parsed = BeautifulSoup(summary, 'html.parser')
    math = summary_parsed.find_all(class_='math')

    if len(math) > 0:
        last_math_text = math[-1].get_text()
        if len(last_math_text) > 3 and last_math_text[-3:] == '...':
            content_parsed = BeautifulSoup(article._content, 'html.parser')
            full_text = content_parsed.find_all(
                class_='math'
            )[len(math)-1].get_text()
            math[-1].string = "%s ..." % full_text
            summary = summary_parsed.decode()
        # clear memoization cache
        import functools
        if isinstance(article.get_summary, functools.partial):
            memoize_instance = article.get_summary.func.__self__
            memoize_instance.cache.clear()

        article._summary = (
            "{}<script type='text/javascript'>"
            "{}"
            "</script>"
        ).format(
            summary,
            process_summary.mathjax_script
        )
def parse_images(instance):
    if instance._content is None or 'img' not in instance._content:
        return

    content = instance._content[:]
    soup = BeautifulSoup(content, "html.parser")

    for img in soup('img'):
        # Build the source image filename
        my_url2path_func = instance.settings['MY_IMG_URL2PATH_FUNC']
        if not my_url2path_func:
            logger.error('Error: MY_IMG_URL2PATH_FUNC not defined in your pelican configuration.\n\
                    niux2_lazyload_helper cannot determine the image path from its url.\n')
            return
        imgPath, new_src = my_url2path_func(img['src'])

        if not new_src.startswith('http') and not (path.isfile(imgPath) and access(imgPath, R_OK)):
            logger.error('Error: image file not found: {}'.format(imgPath))
            continue

        img['src'] = new_src
        # Open the source image and query dimensions
        if new_src.startswith('http'):
            img_data = urlopen(new_src).read()
            fid = TemporaryFile('wb+')
            fid.write(img_data)
            fid.flush()
            fid.seek(0)
        else:
            fid = open(imgPath, 'rb')
        im = Image.open(fid)
        imgWidth = im.size[0]
        imgHeight = im.size[1]
        imgResized = False

        if not img.get('width'):
            img['width'] = str(imgWidth) + 'px'
        else:
            imgResized = True

        # for lazyload.js
        if instance.settings.get('NIUX2_LAZY_LOAD', False):
            if img.get('class'):
                img['class'] += 'lazy'
            else:
                img['class'] = 'lazy'
            img['data-original'] = img['src']
            del img['src']
            if imgResized:
                newImgWidth = int(_width_attr_reg.sub('', img['width']).strip())
                newImgHeight = imgHeight * newImgWidth / imgWidth
                img['data-width'] = str(newImgWidth) + 'px'
                img['data-height'] = str(newImgHeight) + 'px'
            else:
                img['data-width'] = str(imgWidth) + 'px'
                img['data-height'] = str(imgHeight) + 'px'

    instance._content = soup.decode()
Example #21
def bootstrapify(content):
    if isinstance(content, contents.Static):
        return

    soup = BeautifulSoup(content._content)
    replace_tables(soup)
    replace_images(soup)

    content._content = soup.decode()
Example #22
def rename_html_img_links(html_input, basename):
    """Rename all ``<img>`` tag ``src`` attributes based on `basename`.

    Each `src` of each ``<img>`` tag in `html_input` is renamed to a
    new location of form ``<BASENAME>_<NUM>.<EXT>`` where
    ``<BASENAME>`` is the basename of `basename`, ``<NUM>`` a unique
    number starting with 1 (one) and ``<EXT>`` the filename extension
    of the original ``src`` file.

    For example:

    ``<img src="foo_m1234.jpeg">``

    with a `basename` ``sample.html`` will be replaced by

    ``<img src="sample_1.jpeg">``

    if this is the first ``<img>`` tag in the document.

    Returns a tuple ``<HTML_OUTPUT>, <NAME_MAP>`` where
    ``<HTML_OUTPUT>`` is the modified HTML and ``<NAME_MAP>`` is a
    dictionary with a mapping from old filenames to new ones. The
    latter can be used to rename any real files (which is not done by
    this function).

    Links to 'external' sources (http and similar) are ignored.

    This function expects text as input and returns text, not bytes.
    I.e. you will get unicode snippets under Python 2.x and text
    (or `str`) under Python 3.x.
    """
    soup = BeautifulSoup(html_input, 'html.parser')
    img_tags = soup.findAll('img')
    img_map = {}
    num = 1
    basename = os.path.splitext(basename)[0]
    basename = basename.replace('.', '_')
    for tag in img_tags:
        src = tag.get('src', None)
        if src is None:
            continue
        if src in list(img_map.keys()):
            # We found a link to the same image already
            tag['src'] = img_map[src]
            continue
        scheme = urlparse(src)[0]
        if scheme not in ['file', '']:
            # only handle local files
            continue
        ext = ''
        if '.' in src:
            ext = os.path.splitext(src)[1]
        new_src = '%s_%s%s' % (basename, num, ext)
        num += 1
        tag['src'] = new_src
        img_map[src] = new_src
    return soup.decode(), img_map
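
A small usage sketch for rename_html_img_links (the file names below are made up); external links are left alone while local sources are renumbered.

html_in = ('<p><img src="foo_m1234.jpeg"> <img src="bar.png"> '
           '<img src="http://example.com/pic.gif"></p>')
html_out, name_map = rename_html_img_links(html_in, 'sample.html')
print(name_map)
# -> {'foo_m1234.jpeg': 'sample_1.jpeg', 'bar.png': 'sample_2.png'}
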
Example #23
def get_pdf_content(pages, toc):
    """
    :type pages: flask.ext.flatpages.flatpages.FlatPages
    :param pages:
    """
    content = []
    for toc_section in toc:
        section = {"id": toc_section["title"].replace(" ", "_"), "title": toc_section["title"], "content": []}
        for reference in toc_section["items"]:
            url = reference["url"]
            if url.startswith("/"):
                url = url[1:]
            if url.endswith(".html"):
                url = url[:-5]

            if url == "docs/reference/grammar":
                page_html = render_template("pages/grammar.html", kotlinGrammar=get_grammar()).replace("<br>", "<br/>")
                document = BeautifulSoup(page_html, "html.parser")
                document = document.find("div", {"class": "grammar"})
                page_id = "grammar"
                title = "Grammar"
            else:
                page = pages.get(url)
                if page is None:
                    continue
                title = page.meta["title"]
                document = BeautifulSoup(page.html, "html.parser")
                page_id = page.path.split("/")[-1]

            for element in document.find_all():
                if "id" in element.attrs:
                    element.attrs["id"] = page_id + "_" + element.attrs["id"]
                if element.name == "a":
                    if "href" not in element.attrs:
                        continue
                    href = element.attrs["href"]
                    url = urlparse(href)
                    if url.scheme == "":
                        if href.startswith("#"):
                            new_href = page_id + "_" + href[1:]
                        else:
                            url_path = url.path[:-5] if url.path.endswith(".html") else url.path
                            new_href = url_path + ("_" + url.fragment if url.fragment != "" else "")
                        element.attrs["href"] = "#" + new_href

                header_regex = re.compile(r"^h(\d)$")
                if header_regex.match(element.name):
                    level = int(header_regex.match(element.name).group(1)) + 1
                    element.name = "h" + str(level)

            section["content"].append({"id": page_id, "title": title, "content": document.decode()})
        content.append(section)
    drive, root_folder_path_rest = path.splitdrive(root_folder_path)
    page_html = render_template(
        "pdf.html", content=content, root_folder=(drive + root_folder_path_rest).replace("\\", "/")
    )
    return page_html
def content_object_init(instance):

    if instance._content is not None:
        content = instance._content
        soup = BeautifulSoup(content, 'html.parser')

        if 'img' in content:
            for img in soup('img'):
                logger.debug('Better Fig. PATH: %s', instance.settings['PATH'])
                logger.debug('Better Fig. img.src: %s', img['src'])

                img_path, img_filename = path.split(img['src'])

                logger.debug('Better Fig. img_path: %s', img_path)
                logger.debug('Better Fig. img_fname: %s', img_filename)

                # Strip off {filename}, |filename| or /static
                if img_path.startswith(('{filename}', '|filename|')):
                    img_path = img_path[10:]
                elif img_path.startswith('/static'):
                    img_path = img_path[7:]
                elif img_path.startswith('data:image'):
                    # Image is encoded in-line (not a file).
                    continue
                else:
                    logger.warning('Better Fig. Error: img_path should start with either {filename}, |filename| or /static')

                # Build the source image filename
                src = instance.settings['PATH'] + img_path + '/' + img_filename

                logger.debug('Better Fig. src: %s', src)
                if not (path.isfile(src) and access(src, R_OK)):
                    logger.error('Better Fig. Error: image not found: %s', src)

                # Open the source image and query dimensions; build style string
                im = Image.open(src)
                extra_style = 'width: {}px; height: auto;'.format(im.size[0])

                if 'RESPONSIVE_IMAGES' in instance.settings and instance.settings['RESPONSIVE_IMAGES']:
                    extra_style += ' max-width: 100%;'

                if img.get('style'):
                    img['style'] += extra_style
                else:
                    img['style'] = extra_style

                if img['alt'] == img['src']:
                    img['alt'] = ''

                fig = img.find_parent('div', 'figure')
                if fig:
                    if fig.get('style'):
                        fig['style'] += extra_style
                    else:
                        fig['style'] = extra_style

        instance._content = soup.decode()
def content_object_init(instance):

    if instance._content is not None:
        content = instance._content
        soup = BeautifulSoup(content)

        if "img" in content:
            for img in soup("img"):
                logger.debug("Better Fig. PATH: %s", instance.settings["PATH"])
                logger.debug("Better Fig. img.src: %s", img["src"])

                img_path, img_filename = path.split(img["src"])

                logger.debug("Better Fig. img_path: %s", img_path)
                logger.debug("Better Fig. img_fname: %s", img_filename)

                # Strip off {filename}, |filename| or /static
                if img_path.startswith(("{filename}", "|filename|")):
                    img_path = img_path[10:]
                elif img_path.startswith("/static"):
                    img_path = img_path[7:]
                else:
                    logger.warning(
                        "Better Fig. Error: img_path should start with either {filename}, |filename| or /static"
                    )

                # Build the source image filename
                src = instance.settings["PATH"] + img_path + "/" + img_filename

                logger.debug("Better Fig. src: %s", src)
                if not (path.isfile(src) and access(src, R_OK)):
                    logger.error("Better Fig. Error: image not found: {}".format(src))

                # Open the source image and query dimensions; build style string
                im = Image.open(src)
                extra_style = "width: {}px; height: auto;".format(im.size[0])

                if instance.settings["RESPONSIVE_IMAGES"]:
                    extra_style += " max-width: 100%;"

                if img.get("style"):
                    img["style"] += extra_style
                else:
                    img["style"] = extra_style

                if img["alt"] == img["src"]:
                    img["alt"] = ""

                fig = img.find_parent("div", "figure")
                if fig:
                    if fig.get("style"):
                        fig["style"] += extra_style
                    else:
                        fig["style"] = extra_style

        instance._content = soup.decode()
Example #26
    def test_handle_content_no_dorks(self):
        self.handler.no_dorks = True

        async def test():
            self.return_content = await self.handler.handle_content(self.content)

        self.loop.run_until_complete(test())
        soup = BeautifulSoup(self.return_content, "html.parser")
        self.return_content = soup.decode("utf-8")
        self.assertEqual(self.return_content, self.no_dorks_content)
Example #27
def search(page):
    search1 = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=java&sm=0&sf=10001&st=15000&we=0103&isfilter=1&fl=530&isadv=0&sg=c38460ed2e6c4041994dc4eaabf942ce&p="
    search2 = page
    search_url = search1 + str(search2)
    print(search_url)
    data = urllib.request.urlopen(search_url).read()
    soup = BeautifulSoup(data, "html.parser")
    soup.decode('UTF-8')
    strs = soup.findAll(name='td', attrs={"class":"gsmc"})
    count = len(strs)
    file = open("result.txt", 'a')
    for x in xrange(count):
        companyName = strs[x].string
        if companyName == None:
            companyName = '123'
        file.write(companyName + '\n')
    file.close()
def content_object_init(instance):
    if instance._content is not None:
        content = instance._content
        soup = BeautifulSoup(content)

        if 'table' in content:
            for ctbl in soup.find_all('table', class_="codehilitetable"):
                wrapper_tag = soup.new_tag('div')
                wrapper_tag['class'] = 'hilitewrapper'
                ctbl.wrap(wrapper_tag)
        instance._content = soup.decode()
Example #29
    def test_handle_content(self):
        self.handler.no_dorks = False
        self.handler.get_dorks = AsyncMock(return_value=["test_dork1"])

        async def test():
            self.return_content = await self.handler.handle_content(self.content)

        self.loop.run_until_complete(test())
        soup = BeautifulSoup(self.return_content, "html.parser")
        return_content = soup.decode("utf-8")
        self.assertEqual(return_content, self.expected_content)
Example #30
def bootstrapify(content):
    if isinstance(content, contents.Static):
        return

    replacements = content.settings['BOOTSTRAPIFY']
    soup = BeautifulSoup(content._content, 'html.parser')

    for selector, classes in replacements.items():
        replace_in_with(selector, soup, classes)

    content._content = soup.decode()
Example #31
def main():
    wb = xlwt.Workbook()
    sheet = wb.add_sheet('sheet1')
    sheet.write(0, 0, 'categoryID')  #categoryID: help center
    sheet.write(0, 1, 'status')  #status: not reviewed
    sheet.write(0, 2, 'recommend')  #recommend: not recommended
    sheet.write(0, 3, 'type')  #type: added by operations staff
    sheet.write(0, 4, 'tag')  #tag label
    sheet.write(0, 5, 'source')  #article source
    sheet.write(0, 6, 'writer')  #author
    sheet.write(0, 7, 'md5_id')
    sheet.write(0, 8, 'title')
    sheet.write(0, 9, 'description')
    sheet.write(0, 10, 'content')

    content_url_list = [
        url.strip() for url in open('jrhelp.jd.com_index_detail_url.txt')
    ]
    #https://article.jd.com/?id=987009

    service_args = []
    service_args.append('--load-images=no')  #disable image loading
    service_args.append('--disk-cache=yes')  #enable disk cache
    service_args.append('--ignore-ssl-errors=true')  #ignore HTTPS errors

    browser = webdriver.PhantomJS(service_args=service_args)

    browser.implicitly_wait(30)  #set implicit wait timeout
    browser.set_page_load_timeout(30)  #set page load timeout

    for index, k in enumerate(content_url_list):
        content_str = ''
        try:
            url = 'https://article.jd.com/?id=' + str(
                k.strip())  #get the url id from the file; the exact field depends on the txt layout
            print index, url
        except Exception, e:
            print e

        #rules for extracting the article body
        try:
            #browser = webdriver.PhantomJS(executable_path=r'D:\programfiles\anaconda\Lib\site-packages\selenium\webdriver\phantomjs\bin\phantomjs.exe')
            browser.get(url)

            content_main = browser.find_element_by_class_name(
                "detail_cont_main").get_attribute('innerHTML')
            s = BeautifulSoup(content_main, "lxml")

            #remove the product section (type=3) and the header/title section
            content = [
                soup.extract() for soup in s(
                    'div', attrs={"class": "detail_cm_item detail_cm_goods"})
            ]
            content = [
                soup.extract()
                for soup in s('div', attrs={"class": "detail_cm_head"})
            ]

            #because of lazy loading, the img src is not the real image URL and needs fixing
            imatag = s.find_all('img')
            for itag in imatag:
                if '1x1' in itag.get('src'):
                    itag['src'] = itag.get('data-lazy-img')
                    itag['data-lazy-img'] = 'done'

            content_txt = str(s.get_text()).decode('utf-8')[0:250]  #

            #strip the unwanted wrapper tags
            s = str(s).replace('<html>', '').replace('</html>', '').replace(
                '<body>', '').replace('</body>', '')

            title = browser.find_element_by_tag_name("h3").text
            '''
			txttag = browser.find_elements_by_class_name("detail_cm_text")# note the plural find_elements
			pictag = browser.find_elements_by_class_name("detail_cm_pic")# note the plural find_elements
			
			for t in txttag:
				content_str +=  '</p>'+t.text+'</p>'		
				
			for pic in pictag:
				print pic.get_attribute('innerHTML')
			'''

            m1 = md5.new()
            m1.update(title)
            md5_str = m1.hexdigest()[8:-8]  #take the middle 16 characters

            content = s.decode('utf-8')

            sheet.write(index + 1, 0, '148')  #categoryID: help center
            sheet.write(index + 1, 1, '1')  #status: not reviewed
            sheet.write(index + 1, 2, '1')  #recommend: not recommended
            sheet.write(index + 1, 3, '0')  #type: added by operations staff
            sheet.write(index + 1, 4, 'jd')  #tag label
            sheet.write(index + 1, 5, 'jd')  #article source
            sheet.write(index + 1, 6, 'jd')  #author
            sheet.write(index + 1, 7, md5_str)
            sheet.write(index + 1, 8, title)
            if content_txt:
                sheet.write(index + 1, 9, content_txt)
            else:
                sheet.write(index + 1, 9, title)

            if len(content) < 32767:
                sheet.write(index + 1, 10, content)
            else:
                sheet.write(index + 1, 10,
                            'String longer than 32767 characters')
            wb.save("result.xls")

        except Exception, e:
            print e
Example #32
import xml.etree.cElementTree as ET
from urllib.request import urlopen

import ssl
from bs4 import BeautifulSoup

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")
tree = ET.fromstring(soup.decode())
lst = tree.findall('comments/comment')
sum = 0
for item in lst:
    sum += int(item.find('count').text)
    print(sum)
print(sum)
Example #33
while True:

    cur.execute('''
    SELECT wind FROM Winds WHERE year = ? AND month = ?''', (year, month))
    row = cur.fetchone()

    aux_url = str(year) + '-'
    if month < 10: aux_url = aux_url + '0'
    aux_url = aux_url + str(month) + '.txt'
    url = core_url + aux_url
    html = urllib.request.urlopen(url, context=ctx).read()
    #    print(html)
    soup = BeautifulSoup(html, 'html.parser')
    print(soup)
    data = soup.decode()
    wind = re.findall(expression, data)
    #    print(wind)
    w = [float(x) for x in wind]
    max_wind = max(w)
    #    print(max_wind)

    cur.execute(
        '''
    INSERT INTO Winds (year, month, wind) VALUES (?, ?, ?)''',
        (year, month, max_wind))

    if (month == int(end_month)) and (year == int(end_year)): break

    if month == 12:
Example #34
# To run this, you can install BeautifulSoup
# https://pypi.python.org/pypi/beautifulsoup4

# Or download the file
# http://www.py4e.com/code3/bs4.zip
# and unzip it in the same directory as this file

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

#url = input('Enter - ')
url = 'http://py4e-data.dr-chuck.net/comments_42.html'
html = urllib.request.urlopen(url, context=ctx).read()
print(html.decode())
soup = BeautifulSoup(html, 'html.parser')
print(soup.decode())
# Retrieve all of the span tags
tags = soup('span')
for tag in tags:
    print(tag.contents[0])
Example #35
def get_html_from_filepath(filepath,
                           start=0,
                           end=None,
                           preprocessors=[],
                           template=None):
    """Return the HTML from a Jupyter Notebook
    """
    template_file = 'basic'
    extra_loaders = []
    if template:
        extra_loaders.append(
            jinja2.FileSystemLoader([os.path.dirname(template)]))
        template_file = os.path.basename(template)

    config = get_config()
    config.update({
        'CSSHTMLHeaderTransformer': {
            'enabled': True,
            'highlight_class': '.highlight-ipynb'
        },
        'SubCell': {
            'enabled': True,
            'start': start,
            'end': end
        }
    })
    exporter = HTMLExporter(config=config,
                            template_file=template_file,
                            extra_loaders=extra_loaders,
                            filters={'highlight2html': custom_highlighter},
                            preprocessors=[SubCell] + preprocessors)

    config.CSSHTMLHeaderPreprocessor.highlight_class = " .highlight pre "
    content, info = exporter.from_filename(filepath)
    from bs4 import NavigableString

    if BeautifulSoup:
        soup = BeautifulSoup(content, 'html.parser')
        for i in soup.findAll('div', {'class': 'prompt input_prompt'}):
            i.decompose()
        for i in soup.findAll('div', {'class': 'prompt output_prompt'}):
            i.decompose()
        for i in soup.findAll('div', {'class': 'prompt'}):
            i.decompose()
        for i in soup.findAll('a', {'class': 'anchor-link'}):
            i.decompose()

        for i in soup.findAll('code'):
            i.attrs['class'] = 'code-class'
        content = soup.decode(formatter=None)
        #     url = 'http://localhost:8800/list-comprehension.html'
        #     page = requests.get(url)
        #     soup = BeautifulSoup(page.content, 'html.parser')

        soup = BeautifulSoup(content, 'html.parser')
        pre_tags = soup.find_all('div', {'class': 'input_area'})
        input_areas = [i for i in pre_tags if i['class'][0] == 'input_area']
        output = '\r\n'.join([i.get_text() for i in input_areas])
        new_div = soup.new_tag('textarea')
        new_div['text'] = output
        new_div['id'] = "myInput"
        new_div['type'] = "text"
        new_div['class'] = "codecopy"
        new_div.insert(0, NavigableString(output))

        soup.insert(0, new_div)
        content = soup.decode(formatter=None)
    return content, info
Example #36
def getContent():
    
      
            
            """收集内容"""
            """ 你的 APPID AK SK """
            APP_ID = '14658509'
            API_KEY = 'C14bCL7NkReQpak382maUYXi'
            SECRET_KEY = '8vWAXHBTmfL3r96PlKIggpwuXwdNl4wz'
            client = AipNlp(APP_ID, API_KEY, SECRET_KEY)
            #[1网址,2标题,3内容,4情感分析items词典,5公司名列表,6评论观点列表,7文章分类,8文章标签]
            #http://linyi.iqilu.com/caijing/2018/1117/4113682.shtml
            #monitor_result=[]
            for news_url in urls:                
                one_monitor=[]
                one_monitor.append(news_url)# (1) url
                try:# make sure this news item is processed completely
                    news=urlopen(news_url,timeout=15)# with a timeout set, urlopen will not wait forever for a response, so it cannot hang
                    news_html=news.read()# page source as a str; this call conflicts with parse and cannot run at the same time
                    #response = requests.get('http://example.com')
                    #doc = Document(response.text)
                except:
                    one_monitor.append("urlopen_error")
                    monitor_result.append(one_monitor) 
                    success_num +=1
                    print("打开网址错误")
                    continue
                try:# (3) content; comment/opinion extraction handles at most 3000 characters
                    news_contents=Document(news_html)
                    news_title=news_contents.title().strip(" ")[:39].encode("utf-8")# (2) title; the default ascii codec would raise here for out-of-range characters
                    #print(news_title)# also strips whitespace (including '\n', '\r', '\t', ' ')
                    one_monitor.append(news_title)
                    news_content=BeautifulSoup(news_contents.summary()).get_text().strip(" ")[:2000].encode("utf-8")
                    #len(news_content)#print(news_content)
                    one_monitor.append(news_content)
                    emotion_content=news_content.decode("utf-8")[:500].encode("utf-8")# avoid slicing a fixed number of bytes and leaving an incomplete Chinese character
                    #print(emotion_content)
                except:
                    one_monitor.append("extract_error")
                try:             
                    #print(emotion_content)  #print(u"我很高兴"[:1000])# "I am happy"
                    emotion=client.sentimentClassify(emotion_content)["items"]# (4) sentiment
                    one_monitor.append(emotion)
                except:
                    one_monitor.append("emotion_error")
                try:# (5) organization name list
#                    ids = [1,4,3,3,4,2,3,4,5,6,1]
#                    list(set(ids))# the result comes back re-ordered
                    orgs=[item["item"].encode("utf-8") for item in client.lexer(news_content)["items"] if item["ne"] =="ORG"]
                    one_monitor.append(";".join(list(set(orgs))))
                    #print(";".join(list(set(orgs))))
                except:
                    one_monitor.append("org_error")
                try:# (6) comment opinion list
                    conments=[item['abstract'].encode("utf-8") for item in client.commentTag(news_content)['items']]
                    one_monitor.append(";".join(list(set(conments))))
                    #print(";".join(list(set(conments))))
                except:
                    one_monitor.append("comment_error")
                try:# (7) article category
#                    a=[[1,2],[4,3,5]]
#                    [c for b in a for c in b]
                    group=client.topic(news_title, news_content)["item"].values()#[[dict],[dict]]
                    #group=client.topic("对严重失信者,能否限制其发预付卡?法学家谈如何破解预付卡立法瓶颈", news_content)["item"].values()
                    value_list=[dic[u'tag'] for dic_list in group for dic in dic_list]# float values cannot be joined
                    one_monitor.append(u";".join(value_list).encode("utf-8"))
                    #print(u";".join(value_list).encode("utf-8"))
                except:
                    one_monitor.append("topic_error")
                try:# (8) article tags
                    keyword=client.keyword(news_title, news_content)["items"]# [dict]
                    #keyword=client.keyword("对严重失信者,能否限制其发预付卡?法学家谈如何破解预付卡立法瓶颈", news_content)["items"]
                    key_list=[dic[u'tag'] for dic in keyword]
                    one_monitor.append(u";".join(key_list).encode("utf-8"))                   
                    #print(u";".join(key_list).encode("utf-8"))
                    print("成功%s"%success_num)
                except: 
                    one_monitor.append("keyword_error")
                    error_num +=1
                    print("其中有误%s"%error_num)                                   
                    
                monitor_result.append(one_monitor) 
                success_num +=1
                #time.sleep(1)
                
                if success_num % 200 == 0:# save periodically so a crash does not lose everything
                    with open("./temp/risk_monitoring%s.csv"%index,"w") as reader:
                        writer = csv.writer(reader)
                        writer.writerows(monitor_result)
Example #37
 def clean(cls, html):
     """Clean up an HTML string to contain just the allowed elements /
     attributes"""
     doc = BeautifulSoup(html, 'html5lib')
     cls.clean_node(doc, doc)
     return doc.decode()
Example #38
class SiteScramble:
    def __init__(self, html, noise_level, filenum, url):
        self.filenum = filenum
        self.soup = BeautifulSoup(html)
        self.noise_level = noise_level
        self.url = url
        self.css_file_list = []
        self.att_dict = None
        self.cwd = os.getcwd()
        self.html = ''

    def _change_image_hrefs(self):
        '''
        Changes hrefs in HTML so they have absolute paths instead
        of relative for images.
        This way we can open the file locally with the right path.  
        @returns: str - html with absolute paths 
        '''
        imgEls = self.soup.findAll("img")
        for el in imgEls:
            try:
                if el["src"] and el["src"][0] == '/':
                    el["src"] = url + el["src"]
            except:
                pass
        #return str(self.soup).decode('utf-8')
        return str(self.soup)

    def _get_color(self,html_text):
        '''
        Extracts CSS color code from HTML
        @param html_text - HTML fragment with CSS in it   
        @returns: str - CSS color code 
        '''
        color = None
        text_split = html_text.split(';')
        color = [x for x in text_split if 'color' in x]

        if color:
            color = color[0].split(':')[1]
        return color

    def _get_atts_to_change(self):
        '''
        Searches through HTML for colors, images, and font-sizes we
        want to change later.
        '''

        self.html = self._change_image_hrefs()

        att_dict = {}
        colors = []
        images = []
        font_sizes = []

        if self.soup.findAll('body')[0].get('bgcolor'):
             colors.append(self.soup.findAll('body')[0]['bgcolor'])

        pEls = self.soup.findAll('p')

        for p in pEls:
            if p.get('style') and 'color' in p['style']:
                c = self._get_color(p['style'])
                if c not in colors:
                     colors.append(c)

        imgEls = self.soup.findAll('img')

        for img in imgEls:
            if img.get('height') and img.get('width') and [img.get('height'),img.get('width')] not in images:
                 images.append([img.get('height'),img.get('width')])

        allEls = self.soup.findAll()

        #print len(allEls)
        for el in allEls:
        
            if el.get('style') and 'font-size' in el.get('style'):
                font_sizes.append(el.get('style'))

        if colors:
            att_dict['colors'] = colors
        if images:
            att_dict['images'] = images
        if font_sizes:
            att_dict['font-sizes'] = font_sizes
        self.att_dict = att_dict

    def check_valid_css(self,html_string):
        '''
        Checks to see if string of HTML is a valid CSS code
        '''

        valid_chars = ['a', 'b', 'c', 'd', 'e', 'f']
        valid_chars = valid_chars + [str(x) for x in range(10)]
        is_valid = True

        for h in html_string:
            if h not in valid_chars:
                is_valid = False

        return is_valid
    

    def _change_css(self,css_file, index, num):
        '''
        Goes through a CSS file and replaces color with
        a new one
        @param css_file - CSS file
        @param index - Index of file we are on
        '''
        if not self.att_dict:
            self._get_atts_to_change()

        if 'http' not in css_file:
            css_file = self.url + css_file
        r = requests.get(css_file)
        css_html = r.text
        all_matches = re.findall(r'#.{6}', css_html, re.MULTILINE)
        valid_css = list(set(x for x in all_matches if self.check_valid_css(x.replace('#',''))))
        
        for val in valid_css:
            new_color = self._change_color(val, self.noise_level)
            css_html = css_html.replace(val, new_color)
        with open('output/newcss' + str(num) + str(index) + '.css', 'w') as f:
            #f.write(css_html.encode('utf-8'))
            f.write(str(css_html))

    def get_css(self):
        '''
        Extracts CSS files from HTML
        '''

        allEls = self.soup.findAll('head')[0].findChildren()
        css_file_list = []
        index = 0

        for p in allEls:
            if p.get('href') and '.css' in p.get('href'):
                self._change_css(p.get('href'), index, self.filenum)
                css_file_list.append(p.get('href'))
                index += 1
        self.css_file_list = css_file_list

    

    def _change_color(self,orig_color, color_range):
        '''
        Scrambles CSS color code to produce new color
        @param orig_color - original color used by HTML 
        @param color_range - maximum range of change to CSS color 
        @returns: str - new color code 
        '''
        hex_list = list(orig_color.replace('#','').lower())
        color_num = {'a':10, 'b':11, 'c':12, 'd':13, 'e':14, 'f':15,
                10:'a', 11:'b', 12:'c', 13:'d', 14:'e', 15:'f'}
        new_color = []

        try:
            for hex_val in hex_list:
                if hex_val.isalpha():
                    hex_val = color_num[hex_val]
                bump = randint(0,color_range)
                new_val = (int(hex_val) + bump) % 16
                if new_val > 9:
                   new_val = color_num[new_val]
                new_color.append(new_val)

            return '#' + ''.join(str(n) for n in new_color)
        except Exception as e:
            print('error changing color' + str(e))
            return orig_color

    def change_image_hrefs(self):
        '''
        Appends main URL to relative image hrefs to make them 
        absolute
        '''    
        imgEls = self.soup.findAll("img")
        for el in imgEls:
            try:
                if el["src"] and el["src"][0] == '/':
                     el["src"] = self.url + el["src"]
            except:
                 pass
        html = str(self.soup.decode('utf-8'))

    def change_css_files(self):
        '''
        Makes new CSS files and replaces the paths with the old
        CSS files with the new ones
        '''
        for index,css_file in enumerate(self.css_file_list):
            newfile = 'file://' + self.cwd + '/output/' + 'newcss' + str(self.filenum) + str(index) + '.css'
            self.html = self.html.replace(css_file,newfile)

    def _change_font_size(self,fs):
        '''
        Scrambles CSS font size to produce new font size
        @param fs - original font size used by HTML 
        @returns: str - new font size (or old one if we error)
        '''
        negate = randint(0,1)
        change_level = randint(0,self.noise_level)
        font_size_split = fs.split(';')
        size = ''
        px = False
        pt = False
        percent = False
        for fss in font_size_split:
            if 'font-size' in fss:
                size = fss.replace('font-size:','')
                if '%' in size:
                    size = size.replace('%','')
                    percent = True
                if 'pt' in size:
                    size = size.replace('pt','').strip()
                    pt = True
                if 'px' in size:
                    size = size.replace('px','').strip()
                    px = True
        if size:
            try:
                new_size = int(size)
            except:        
                size = randint(1,9)
            new_fs = int(size) * change_level
            if negate == 0:
                new_fs = new_fs + int(size)
            new_fs = 'font-size: ' + str(new_fs)
            if percent:
                new_fs = new_fs + '%'
            if px:
                new_fs = new_fs + 'px'
            if pt:
                new_fs = new_fs + 'pt'
        else:
            new_fs = fs
        return new_fs


    def _change_image_size(self,im):
        '''
        Scrambles image size to produce new size
        @param im - original image size by HTML 
        @returns: str - new image size 
        '''
        negate = randint(0,1)
        change_level = randint(0,self.noise_level)
        new_height = im[0] * change_level
        new_width = im[1] * change_level
        if negate == 0:
            new_height = new_height + im[0]
            new_width = new_width + im[1]
        return (int(new_height), int(new_width))


    def scramble_colors(self):
        '''
        Scrambles CSS colors in HTML
        '''
        self.get_css()
        if not self.att_dict:
            self._get_atts_to_change()
        self.change_css_files()
        if 'colors' in self.att_dict:
            for c in self.att_dict['colors']:
                newc = self._change_color(c,self.noise_level)
                self.html = self.html.replace(c,newc)

    def scramble_image_sizes(self):
        '''
        Scrambles image sizes in HTML
        '''
        if not self.att_dict:
            self._get_atts_to_change()
            
        if 'images' in self.att_dict:
            for im in self.att_dict['images']:
                newim = self._change_image_size(im)
                self.html = self.html.replace('height="' + im[0],'height="' + str(newim[0]))
                self.html = self.html.replace('width="' + im[1],'width="' + str(newim[1]))


    def scramble_font_sizes(self):
        '''
        Scrambles font sizes in HTML
        '''
        if not self.att_dict:
            self._get_atts_to_change()
            
        if 'font-sizes' in self.att_dict:
            for fs in self.att_dict['font-sizes']:
                newfs = self._change_font_size(fs)
                self.html = self.html.replace(fs,newfs)
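A minimal usage sketch for the scrambling methods above. Only the methods are shown in this example, so the class name HtmlScrambler and its constructor arguments are assumptions for illustration:

# Hypothetical driver for the scrambling methods above; the class name and the
# constructor signature are assumed, since the example only shows the methods.
scrambler = HtmlScrambler(url='https://example.com', noise_level=3)
scrambler.scramble_colors()       # rewrites CSS colors inside scrambler.html
scrambler.scramble_font_sizes()   # rewrites font-size declarations
scrambler.scramble_image_sizes()  # rewrites height/width attributes
with open('output/scrambled.html', 'w') as out:
    out.write(scrambler.html)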
Пример #39
0
	if tag.name in tags:
		pass
	else:
		tag.hidden = True    # omit the tag's own markup from the output
		tag.clear()     # drop its contents
		continue

	input_attrs = tag.attrs      # {'class': 'c1', 'id': 'i1'}
	valid_attrs = tags[tag.name] # ['class']
	for k in list(input_attrs.keys()):
		if k in valid_attrs:
			pass
		else:
			del tag.attrs[k]

content = soup.decode()    #
print(content)

# pip3 install beautifulsoup4
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(content, 'html.parser')
# tag = soup.find('script')
# tag.hidden = True
# tag.clear()
#
# span = soup.find('span')
# # print(span.attrs)
# del span.attrs['style']
#
# content = soup.decode()
# print(content)
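A self-contained sketch of the same tag/attribute whitelisting idea as the fragment above; the whitelist contents and the sample HTML are assumptions for illustration:

from bs4 import BeautifulSoup

def filter_xss(content, tags=None):
    # Allowed tags mapped to their allowed attributes (example whitelist).
    tags = tags or {'p': ['class'], 'a': ['href', 'title'], 'div': ['class', 'id']}
    soup = BeautifulSoup(content, 'html.parser')
    for tag in soup.find_all():
        if tag.name not in tags:
            tag.hidden = True   # omit the tag's own markup from the output
            tag.clear()         # drop its contents
            continue
        for k in list(tag.attrs):
            if k not in tags[tag.name]:
                del tag.attrs[k]
    return soup.decode()

print(filter_xss('<div class="c1" id="i1" style="x"><script>alert(1)</script>'
                 '<p onclick="evil()">hi</p></div>'))
# -> <div class="c1" id="i1"><p>hi</p></div>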
Пример #40
0
def get_pdf_content(pages, toc):
    """
    :type pages: flask.ext.flatpages.flatpages.FlatPages
    :param pages:
    """
    content = []
    for toc_section in toc:
        section = {
            'id': toc_section['title'].replace(' ', '_'),
            'title': toc_section['title'],
            'content': []
        }
        for reference in toc_section['items']:
            url = reference['url']
            if url.startswith('/'):
                url = url[1:]
            if url.endswith('.html'):
                url = url[:-5]

            if url == "docs/reference/grammar":
                page_html = render_template(
                    'pages/grammar.html',
                    kotlinGrammar=get_grammar()).replace("<br>", "<br/>")
                document = BeautifulSoup(page_html, 'html.parser')
                document = document.find("div", {"class": "grammar"})
                page_id = "grammar"
                title = "Grammar"
            else:
                page = pages.get(url)
                if page is None:
                    continue
                title = page.meta['title']
                document = BeautifulSoup(page.html, 'html.parser')
                page_id = page.path.split('/')[-1]

            for element in document.find_all():
                if 'id' in element.attrs:
                    element.attrs['id'] = page_id + '_' + element.attrs['id']
                if element.name == "a":
                    if 'href' not in element.attrs:
                        continue
                    href = element.attrs['href']
                    url = urlparse(href)
                    if url.scheme == "":
                        if href.startswith('#'):
                            new_href = page_id + '_' + href[1:]
                        else:
                            url_path = url.path[:-5] if url.path.endswith(
                                ".html") else url.path
                            new_href = url_path + ('_' + url.fragment if
                                                   url.fragment != "" else "")
                        element.attrs['href'] = "#" + new_href

                header_regex = re.compile(r'^h(\d)$')
                if header_regex.match(element.name):
                    level = int(header_regex.match(element.name).group(1)) + 1
                    element.name = 'h' + str(level)

            section['content'].append({
                'id': page_id,
                'title': title,
                'content': document.decode()
            })
        content.append(section)
    drive, root_folder_path_rest = path.splitdrive(root_folder_path)
    page_html = render_template('pdf.html',
                                content=content,
                                root_folder=drive + root_folder_path_rest)
    return page_html
Пример #41
0
def content_object_init(instance):

    if instance._content is not None:
        content = instance._content
        soup = BeautifulSoup(content, 'html.parser')

        for img in soup(['img', 'object']):
            logger.debug('Better Fig. PATH: %s', instance.settings['PATH'])
            if img.name == 'img':
                logger.debug('Better Fig. img.src: %s', img['src'])
                img_path, img_filename = path.split(img['src'])
            else:
                logger.debug('Better Fig. img.data: %s', img['data'])
                img_path, img_filename = path.split(img['data'])
            logger.debug('Better Fig. img_path: %s', img_path)
            logger.debug('Better Fig. img_fname: %s', img_filename)

            # If the image already has attributes... then we can skip it. Assuming it's already optimised
            if 'style' in img.attrs:
                sheet = cssutils.parseStyle(img['style'])
                if len(sheet.width) > 0 or len(sheet.height) > 0:
                    continue

            # Pelican 3.5+ supports {attach} macro for auto copy, in this use case the content does not exist in output
            # due to the fact it has not been copied, hence we take it from the source (same as current document)
            src = None
            if img_filename.startswith('{attach}'):
                img_path = os.path.dirname(instance.source_path)
                img_filename = img_filename[8:]
                src = os.path.join(img_path, img_filename)
            elif img_path.startswith(('{filename}', '|filename|')):
                # Strip off {filename}, |filename| or /static
                img_path = img_path[10:]
            elif img_path.startswith('/static'):
                img_path = img_path[7:]
            elif img_path.startswith('data:image'):
                # Image is encoded in-line (not a file).
                continue
            else:
                # Check the location in the output as some plugins create them there.
                output_path = path.dirname(instance.save_as)
                image_output_location = path.join(
                    instance.settings['OUTPUT_PATH'], output_path,
                    img_filename)
                if path.isfile(image_output_location):
                    src = image_output_location
                    logger.info(
                        '{src} located in output, missing from content.'.
                        format(src=img_filename))
                else:
                    logger.warning(
                        'Better Fig. Error: img_path should start with either {attach}, {filename}, |filename| or /static'
                    )

            if src is None:
                # search src path list
                # 1. Build the source image filename from PATH
                # 2. Build the source image filename from STATIC_PATHS

                # if img_path start with '/', remove it.
                img_path = os.path.sep.join(
                    [el for el in img_path.split("/") if len(el) > 0])

                # style: {filename}/static/foo/bar.png
                src = os.path.join(instance.settings['PATH'], img_path,
                                   img_filename)
                src_candidates = [src]

                # style: {filename}../static/foo/bar.png
                src_candidates += [
                    os.path.join(instance.settings['PATH'], static_path,
                                 img_path, img_filename)
                    for static_path in instance.settings['STATIC_PATHS']
                ]

                src_candidates = [
                    f for f in src_candidates
                    if path.isfile(f) and access(f, R_OK)
                ]

                if not src_candidates:
                    logger.error('Better Fig. Error: image not found: %s', src)
                    logger.debug('Better Fig. Skip src: %s',
                                 img_path + '/' + img_filename)
                    continue

                src = src_candidates[0]
            logger.debug('Better Fig. src: %s', src)

            # Open the source image and query dimensions; build style string
            try:
                if img.name == 'img':
                    im = Image.open(src)
                    extra_style = 'width: {}px; height: auto;'.format(
                        im.size[0])
                else:
                    svg = pysvg.parser.parse(src)
                    extra_style = 'width: {}px; height: auto;'.format(
                        svg.get_width())
            except IOError as e:
                logger.debug('Better Fig. Failed to open: %s', src)
                extra_style = 'width: 100%; height: auto;'

            if 'RESPONSIVE_IMAGES' in instance.settings and instance.settings[
                    'RESPONSIVE_IMAGES']:
                extra_style += ' max-width: 100%;'

            if img.get('style'):
                img['style'] += extra_style
            else:
                img['style'] = extra_style

            if img.name == 'img':
                if img['alt'] == img['src']:
                    img['alt'] = ''

            fig = img.find_parent('div', 'figure')
            if fig:
                if fig.get('style'):
                    fig['style'] += extra_style
                else:
                    fig['style'] = extra_style

        instance._content = soup.decode()
Пример #42
0
def content_object_init(instance):

    if instance._content is not None:
        content = instance._content
        soup = BeautifulSoup(content)

        if 'img' in content:
            for img in soup('img'):
                logger.debug('Better Fig. PATH: %s', instance.settings['PATH'])
                logger.debug('Better Fig. img.src: %s', img['src'])

                img_path, img_filename = path.split(img['src'])

                logger.debug('Better Fig. img_path: %s', img_path)
                logger.debug('Better Fig. img_fname: %s', img_filename)

                # Strip off {filename}, |filename| or /static
                if img_path.startswith(('{filename}', '|filename|')):
                    img_path = img_path[10:]
                elif img_path.startswith('/static'):
                    img_path = img_path[7:]
                else:
                    logger.warning(
                        'Better Fig. Error: img_path should start with either {filename}, |filename| or /static'
                    )

                # Build the source image filename
                src = instance.settings['PATH'] + img_path + '/' + img_filename

                logger.debug('Better Fig. src: %s', src)
                if not (path.isfile(src) and access(src, R_OK)):
                    logger.error(
                        'Better Fig. Error: image not found: {}'.format(src))

                # Open the source image and query dimensions; build style string
                im = Image.open(src)
                #img_extra_style = 'width: {}px; height: {}px;'.format(im.size[0], im.size[1])
                #fig_extra_style = 'width: {}px; height: auto;'.format(im.size[0])
                extra_style = 'width: {}px; height: auto;'.format(im.size[0])

                if instance.settings['RESPONSIVE_IMAGES']:
                    # img_extra_style += ' max-width: 100%;'
                    # fig_extra_style += ' max-width: 100%;'
                    extra_style += ' max-width: 100%;'

                if img.get('style'):
                    #img['style'] += img_extra_style
                    img['style'] += extra_style
                else:
                    #img['style'] = img_extra_style
                    img['style'] = extra_style

                if img['alt'] == img['src']:
                    img['alt'] = ''

                fig = img.find_parent('div', 'figure')
                if fig:
                    if fig.get('style'):
                        #fig['style'] += fig_extra_style
                        fig['style'] += extra_style
                    else:
                        #fig['style'] = fig_extra_style
                        fig['style'] = extra_style

        instance._content = soup.decode()
Пример #43
0
def content_object_init(instance):

    if instance._content is not None:
        content = instance._content
        soup = BeautifulSoup(content, 'html.parser')

        if 'img' in content:
            for img in soup('img'):
                logger.debug('Better Fig. PATH: %s', instance.settings['PATH'])
                logger.debug('Better Fig. img.src: %s', img['src'])

                img_path, img_filename = path.split(img['src'])

                logger.debug('Better Fig. img_path: %s', img_path)
                logger.debug('Better Fig. img_fname: %s', img_filename)

                # Strip off {filename}, |filename| or /static
                if img_path.startswith(('{filename}', '|filename|')):
                    img_path = img_path[10:]
                elif img_path.startswith('/static'):
                    img_path = img_path[7:]
                elif img_path.startswith('data:image'):
                    # Image is encoded in-line (not a file).
                    continue
                else:
                    logger.warning(
                        'Better Fig. Error: img_path should start with either {filename}, |filename| or /static'
                    )

                # search src path list
                # 1. Build the source image filename from PATH
                # 2. Build the source image filename from STATIC_PATHS

                # if img_path start with '/', remove it.
                img_path = os.path.sep.join(
                    [el for el in img_path.split("/") if len(el) > 0])

                # style: {filename}/static/foo/bar.png
                src = os.path.join(instance.settings['PATH'], img_path,
                                   img_filename)
                src_candidates = [src]

                # style: {filename}../static/foo/bar.png
                src_candidates += [
                    os.path.join(instance.settings['PATH'], static_path,
                                 img_path, img_filename)
                    for static_path in instance.settings['STATIC_PATHS']
                ]

                src_candidates = [
                    f for f in src_candidates
                    if path.isfile(f) and access(f, R_OK)
                ]

                if not src_candidates:
                    logger.error('Better Fig. Error: image not found: %s', src)
                    logger.debug('Better Fig. Skip src: %s',
                                 img_path + '/' + img_filename)
                    continue

                src = src_candidates[0]
                logger.debug('Better Fig. src: %s', src)

                # Open the source image and query dimensions; build style string
                im = Image.open(src)
                extra_style = 'width: {}px; height: auto;'.format(im.size[0])

                if 'RESPONSIVE_IMAGES' in instance.settings and instance.settings[
                        'RESPONSIVE_IMAGES']:
                    extra_style += ' max-width: 100%;'

                if img.get('style'):
                    img['style'] += extra_style
                else:
                    img['style'] = extra_style

                if img['alt'] == img['src']:
                    img['alt'] = ''

                fig = img.find_parent('div', 'figure')
                if fig:
                    if fig.get('style'):
                        fig['style'] += extra_style
                    else:
                        fig['style'] = extra_style

        instance._content = soup.decode()
Пример #44
0
    def scrape(self, delta=True):
        if self.isUpdate or self.isDLC:
            return
        try:
            if (not delta or not self.bannerUrl):
                id = self.id

                if id in titleRedirects:
                    id = titleRedirects[id]

                cookies = {'esrb.verified': 'true'}
                for region in ['JP', 'AU']:
                    result = grabCachedRedirectUrl(
                        "https://ec.nintendo.com/apps/%s/%s" % (id, region),
                        cookies=cookies)
                    _json = ''
                    if not result or result.status_code != 200:
                        continue

                    _json = json.loads(
                        result.text.split('NXSTORE.titleDetail.jsonData = ')
                        [1].split('NXSTORE.titleDetail')[0].replace(';', ''))

                    if _json == '' or _json == None:
                        Print.error('Failed to parse json for ' +
                                    "https://ec.nintendo.com/apps/%s/%s" %
                                    (id, region))
                        continue

                    if 'hero_banner_url' in _json:
                        self.bannerUrl = _json['hero_banner_url']

                    if "release_date_on_eshop" in _json:
                        self.releaseDate = int(
                            _json["release_date_on_eshop"].replace('-', ''))
                    if "id" in _json:
                        self.nsuId = int("%s" % _json["id"])

                    if "formal_name" in _json:
                        self.name = _json["formal_name"].strip()

                    if 'screenshots' in _json:
                        self.screenshots = []
                        for i, k in enumerate(_json["screenshots"]):
                            self.screenshots.append(k["images"][0]["url"])

                    if "demos" in _json:
                        for demo in _json["demos"]:
                            if "id" in demo:
                                if id[0:12] != _json['applications'][0]['id'][
                                        0:12]:
                                    self.nsuId = int(demo["id"])
                                    if "name" in demo:
                                        self.name = demo["name"].strip()

                    if "languages" in _json:
                        self.languages = []
                        for language in _json["languages"]:
                            self.languages.append(language['iso_code'])

                    if "genre" in _json:
                        self.category = _json["genre"].split(' / ')

                    if "total_rom_size" in _json:
                        self.size = _json["total_rom_size"]

                    if "rating_info" in _json:
                        if "rating" in _json["rating_info"]:
                            if "age" in _json["rating_info"]['rating']:
                                self.rating = _json["rating_info"]['rating'][
                                    'age']

                        if "content_descriptors" in _json["rating_info"]:
                            content = []
                            for descriptor in _json["rating_info"][
                                    "content_descriptors"]:
                                content.append(descriptor['name'])
                            self.ratingContent = content

                    if "player_number" in _json:
                        if 'local_max' in _json["player_number"]:
                            self.numberOfPlayers = _json["player_number"][
                                "local_max"]

                        if 'offline_max' in _json["player_number"]:
                            self.numberOfPlayers = _json["player_number"][
                                "offline_max"]

                    if "publisher" in _json:
                        if 'name' in _json["publisher"]:
                            self.publisher = _json["publisher"]["name"]
                        if 'title' in _json["publisher"]:
                            self.publisher = _json["publisher"]["title"]

                    if "applications" in _json:
                        if "image_url" in _json["applications"][0]:
                            self.iconUrl = _json["applications"][0][
                                'image_url']

                    if "catch_copy" in _json:
                        intro = re.sub('(?<!\n)\n(?!\n)', ' ',
                                       _json["catch_copy"])
                        intro = re.sub('  ', ' ', intro)
                        self.intro = intro

                    if "description" in _json:
                        desc = re.sub('(?<!\n)\n(?!\n)', ' ',
                                      _json["description"])
                        desc = re.sub('  ', ' ', desc)
                        self.description = desc

                #<img aria-hidden="true" data-src="https://media.nintendo.com/nintendo/bin/ZppwWK6BnjH5twBNvE5wEEI9aeMGR0XX/hQGr97SGMnlXBWoqOBtgtGX5noK3tNtD.jpg"/>
                result = grabCachedRedirectUrl(
                    "https://ec.nintendo.com/apps/%s/US" % id, cookies=cookies)
                if result and result.status_code == 200:
                    if result.url != 'https://www.nintendo.com/games/':
                        soup = BeautifulSoup(result.text, "html.parser")

                        if not self.bannerUrl:
                            m = re.search(
                                r"#hero\s*{\s*background(-image)?:\s*url\('([^)]+)'\)",
                                result.text, re.DOTALL | re.UNICODE
                                | re.MULTILINE | re.IGNORECASE)
                            if m:
                                banner = m.group(2)
                                if banner[0] == '/':
                                    banner = 'https://www.nintendo.com' + banner
                                self.bannerUrl = banner

                        rem = re.finditer(
                            '<img aria-hidden="true" data-src="([^"]+)"',
                            result.text)
                        if rem:
                            ss = []
                            for m in rem:
                                ss.append(m.group(1))

                            if len(ss) > 0:
                                self.screenshots = ss

                        if soup.find("meta", {"property": "og:url"}) != None:
                            slug = soup.find("meta",
                                             {"property": "og:url"
                                              })["content"].split('/')[-1]
                            infoJson = json.loads(
                                requests.get(
                                    "https://www.nintendo.com/json/content/get/game/%s"
                                    % slug,
                                    cookies=cookies).text)["game"]

                            if "release_date" in infoJson:
                                self.releaseDate = int(
                                    datetime.datetime.strftime(
                                        datetime.datetime.strptime(
                                            infoJson["release_date"],
                                            "%b %d, %Y"), '%Y%m%d'))

                            if "name" in infoJson:
                                self.name = infoJson["name"].strip()

                            if "nsuid" in infoJson:
                                self.nsuId = int(infoJson["nsuid"])

                            catagories = []
                            if "game_category_ref" in infoJson:
                                catindex = 0
                                if "name" in infoJson["game_category_ref"]:
                                    catagories.append(
                                        infoJson["game_category_ref"]["name"])
                                elif "title" in infoJson["game_category_ref"]:
                                    catagories.append(
                                        infoJson["game_category_ref"]["title"])
                                else:
                                    try:
                                        for game_category in infoJson[
                                                "game_category_ref"]:
                                            catagories.append(
                                                infoJson["game_category_ref"]
                                                [catindex]["name"])
                                            catindex += 1
                                    except:
                                        pass
                                self.category = catagories

                            esrbcontent = []
                            if "esrb_content_descriptor_ref" in infoJson:
                                esrbindex = 0
                                if "name" in infoJson[
                                        "esrb_content_descriptor_ref"]:
                                    esrbcontent.append(
                                        infoJson["esrb_content_descriptor_ref"]
                                        ["name"])
                                elif "title" in infoJson[
                                        "esrb_content_descriptor_ref"]:
                                    esrbcontent.append(
                                        infoJson["esrb_content_descriptor_ref"]
                                        ["title"])
                                else:
                                    try:
                                        for descriptor in infoJson[
                                                "esrb_content_descriptor_ref"]:
                                            if 'name' in descriptor:
                                                esrbcontent.append(
                                                    descriptor["name"])
                                            if 'title' in descriptor:
                                                esrbcontent.append(
                                                    descriptor["title"])
                                    except:
                                        pass
                                self.ratingContent = esrbcontent

                            if "number_of_players" in infoJson:
                                self.numberOfPlayers = re.sub(
                                    '[^0-9]', '',
                                    infoJson["number_of_players"])

                            if "esrb_rating_ref" in infoJson:
                                if "esrb_rating" in infoJson[
                                        "esrb_rating_ref"]:
                                    if "short_description" in infoJson[
                                            "esrb_rating_ref"]["esrb_rating"]:
                                        self.rating = infoJson[
                                            "esrb_rating_ref"]["esrb_rating"][
                                                "short_description"]
                            '''
							if not self.screenshots:
								try:
									ss = []
									for s in infoJson["screenshot_gallery_ref"]["screenshot_gallery"]["screenshots"]:
										ss.append(s['image']['large_image']['include']['src'].replace('cocoon:/', ''))
									self.screenshots = ss
								except:
									pass
							'''

                            if "developer_ref" in infoJson:
                                if "name" in infoJson["developer_ref"]:
                                    self.developer = infoJson["developer_ref"][
                                        "name"]

                            if "publisher_ref" in infoJson:
                                if "name" in infoJson["publisher_ref"]:
                                    self.publisher = infoJson["publisher_ref"][
                                        "name"]
                                if 'title' in infoJson["publisher_ref"]:
                                    self.publisher = infoJson["publisher_ref"][
                                        "title"]

                            if "front_box_art" in infoJson:
                                if "image" in infoJson["front_box_art"]:
                                    if "image" in infoJson["front_box_art"][
                                            "image"]:
                                        if "url" in infoJson["front_box_art"][
                                                "image"]["image"]:
                                            self.frontBoxArt = infoJson[
                                                "front_box_art"]["image"][
                                                    "image"]["url"]

                            if "intro" in infoJson:
                                try:
                                    details = BeautifulSoup(
                                        infoJson["intro"][0], "html.parser")
                                    try:
                                        details = details.decode(
                                            formatter=None)
                                    except:
                                        details = details.decode()
                                    details = re.sub('<[^<]+?>', '',
                                                     details).strip()
                                    details = re.sub(' +', ' ', details)
                                    details = re.sub('\n ', '\n', details)
                                    details = re.sub('\n\n+', '\n\n', details)
                                    details = re.sub('(?<!\n)\n(?!\n)', ' ',
                                                     details)
                                    details = re.sub('  ', ' ', details)
                                    self.intro = details
                                except Exception as e:
                                    pass

                            if "game_overview_description" in infoJson:
                                details = BeautifulSoup(
                                    infoJson["game_overview_description"][0],
                                    "html.parser")
                                try:
                                    details = details.decode(formatter=None)
                                except:
                                    details = details.decode()
                                details = re.sub('<[^<]+?>', '',
                                                 details).strip()
                                details = re.sub(' +', ' ', details)
                                details = re.sub('\n ', '\n', details)
                                details = re.sub('\n\n+', '\n\n', details)
                                details = re.sub('(?<!\n)\n(?!\n)', ' ',
                                                 details)
                                details = re.sub('  ', ' ', details)
                                self.description = details
                #else:
                #f = open("missing.txt", 'a', encoding="utf8")
                #f.write(rid+"|title doesn't exist at ec.nintendo.com"+'\n')
                #f.close()

        except BaseException as e:
            print(repr(e) + ' ' + self.id)

        self.bannerFile()
        self.frontBoxArtFile()
        self.iconFile()
        self.screenshotFiles()
Пример #45
0
# imports assumed from the elided part of the snippet
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
print("\n\nSelenium library is present!\n\n ")
def render_page(url):
    driver = webdriver.Chrome()
    # driver can be manually pointed to as well
    #driver = webdriver.Chrome(r"C:\Users\elili\AppData\Local\Microsoft\WindowsApps\chromedriver.exe")
    driver.get(url)
    time.sleep(3)
    r = driver.page_source
    #driver.quit()
    return r

'REGEX for all EMAILS'
r = render_page(url)
soup_r = BeautifulSoup(r, "html.parser")
emails = re.findall(r'[\w\.-]+@[\w\.-]+', soup_r.decode())

'RULE, count social media Open Graph tags, "og", checking for two most common meta tags'
og_t = soup_r.find_all("meta", property="og:title")
og_u = soup_r.find_all("meta", property="og:url")
social_count = len(og_t) + len(og_u)


'RULE, determine presence of contact data'
mailto_check = []
if "mailto" in soup_r.decode():
    mailto_check = 1

'RULE, validate agreement of email domains against site domain, list foreign domain emails'
consistent_emails = []
suspect_emails = []
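The last rule above only declares the two lists; a possible completion is sketched below, assuming url is the address that render_page() was called with:

# Sketch of the declared-but-unimplemented domain-consistency rule; `url` is
# assumed to be the address that render_page() was called with.
from urllib.parse import urlparse

site_domain = urlparse(url).netloc.lower()
if site_domain.startswith('www.'):
    site_domain = site_domain[4:]

for email in set(emails):
    email_domain = email.rsplit('@', 1)[-1].lower()
    if email_domain == site_domain or email_domain.endswith('.' + site_domain):
        consistent_emails.append(email)
    else:
        suspect_emails.append(email)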
Пример #46
0
    def WriteHtml(self, html_template, useAbsolutePaths, filename):
        info = None
        try:
            PixivHelper.makeSubdirs(filename)
            info = codecs.open(filename, 'wb', encoding='utf-8')
        except IOError:
            info = codecs.open(str(self.imageId) + ".html",
                               'wb',
                               encoding='utf-8')
            PixivHelper.get_logger().exception(
                "Error when saving article html: %s, file is saved to: %s.html",
                filename, self.imageId)

        cover_image = ""
        if self.coverImageUrl:
            cover_image = f'<div class="cover"><img src="{self.coverImageUrl}"/></div>'
        page = html_template.replace("%coverImage%", cover_image)
        page = page.replace("%coverImageUrl%", self.coverImageUrl or "")
        page = page.replace("%artistName%", self.parent.artistName)
        page = page.replace("%imageTitle%", self.imageTitle)
        page = page.replace("%worksDate%", self.worksDate)

        token_body_text = ""
        token_images = ""
        token_text = ""
        if self.type == "article":
            token_body_text = f'<div class="article caption">{self.body_text}</div>'
        else:
            token_images = '<div class="non-article images">{0}</div>'.format(
                "".join([
                    '<a href="{0}">{1}</a>'.format(
                        x,
                        f'<img src="{x}"/>' if x[x.rindex(".") + 1:].lower()
                        in ["jpg", "jpeg", "png", "bmp", "gif"] else x)
                    for x in self.images
                ]))
            token_text = '<div class="non-article caption">{0}</div>'.format(
                "".join([
                    '<p>{0}</p>'.format(x.rstrip())
                    for x in self.body_text.split("\n")
                ]))

        page = page.replace("%body_text(article)%", token_body_text)
        page = page.replace("%images(non-article)%", token_images)
        page = page.replace("%text(non-article)%", token_text)

        page = BeautifulSoup(page, features="html5lib")
        imageATags = page.find_all("a", attrs={"href": True})
        for imageATag in imageATags:
            tag = imageATag.img
            if tag:
                tag["src"] = imageATag["href"]
        root = page.find("div", attrs={"class": "main"})
        if root:
            root["class"].append(
                "non-article" if self.type != "article" else "article")
        page = page.decode()
        html_dir = os.path.dirname(filename)
        for k, v in self.linkToFile.items():
            if not useAbsolutePaths:
                try:
                    v = os.path.relpath(v, html_dir)
                except ValueError:
                    PixivHelper.get_logger().exception(
                        "Error when converting local paths to relative ones, absolute paths are used",
                        filename, self.imageId)
                    v = "file://" + v
            else:
                v = "file://" + v
            page = page.replace(k, v)
        info.write(page)
        info.close()
Пример #47
0
            for tag in new_soup.find_all('tr'):
                th = tag.th.text.encode('utf-8')
                parsed = BeautifulSoup(
                    tag.encode('utf-8'), parse_only=SoupStrainer(
                        'td')).find_all('td')[0].text.encode('utf-8').strip()
                '''if b'date:' in th.lower():
					date = parsed.replace(b'\n',b'').replace(b'\r',b'').decode('utf-8')
				elif b'time:' in th.lower():
					time = parsed.replace(b'\n',b'').replace(b'\r',b'').decode('utf-8')
				el'''
                if b'location:' in th.lower():
                    location = parsed.replace(b'\n',
                                              b'').replace(b'\r',
                                                           b'').decode('utf-8')
                elif b'\xc2' in th.lower():
                    description = parsed.decode('utf-8')
            if any(s in description.lower() for s in strings):
                print(title + " on " + date + " at " + time + " at " +
                      location)
                print(description)
                print(str(date + " at " + time))
                start, end = parse(str(date + " at " + time))
                print("Start = " + start)
                print("End = " + end)
                data = {
                    "Date": date,
                    "Time": time,
                    "Description": description,
                    "Location": location,
                    "Title": title
                }
Пример #48
0
    def _get_new_data(self, url, soup):  # get the data
        if soup.find('div', class_="main-content").find('h1') is not None:
            self.view_datas["view_name"] = soup.find(
                'div', class_="main-content").find('h1').get_text()  # 景点名
            print(self.view_datas["view_name"])
        else:
            self.view_datas["view_name"] = soup.find(
                "div", class_="feature_poster").find("h1").get_text()
        self.view_datas["view_message"] = soup.find(
            'div', class_="lemma-summary").get_text()  # 简介
        self.view_datas["basic_message"] = soup.find(
            'div', class_="basic-info cmn-clearfix").get_text()  # 基本信息
        self.view_datas["basic_message"] = self.view_datas[
            "basic_message"].split("\n")
        get = []
        for line in self.view_datas["basic_message"]:
            if line != "":
                get.append(line)
        self.view_datas["basic_message"] = get
        i = 1
        get2 = []
        tmp = "%%"
        for line in self.view_datas["basic_message"]:
            if i % 2 == 1:
                tmp = line
            else:
                a = tmp + ":" + line
                get2.append(a)
            i = i + 1
        self.view_datas["basic_message"] = get2
        self.view_datas["catalog"] = soup.find(
            'div', class_="lemma-catalog").get_text().split("\n")  #目录整体
        get = []
        for line in self.view_datas["catalog"]:
            if line != "":
                get.append(line)
        self.view_datas["catalog"] = get
        # encyclopedia body content
        view_name = self.view_datas["view_name"]
        html = urllib.request.urlopen(url)
        soup2 = BeautifulSoup(html.read(), 'html.parser').decode('utf-8')
        p = re.compile(r'<div class="para-title level-2"', re.DOTALL)
        r = p.search(soup2)
        content_data_node = soup2[r.span(0)[0]:]  # first h2 (start)
        p = re.compile(r'<div class="album-list">', re.DOTALL)  # end marker
        r = p.search(content_data_node)
        content_data = content_data_node[0:r.span(0)[0]]
        lists = content_data.split('<div class="para-title level-2">')
        i = 1
        for list in lists:  # each major block
            final_soup = BeautifulSoup(list, "html.parser")
            name_list = None
            try:
                part_name = final_soup.find(
                    'h2',
                    class_="title-text").get_text().replace(view_name,
                                                            '').strip()
                part_data = final_soup.get_text().replace(
                    view_name, '').replace(part_name, '').replace(
                        '编辑', '')  # strip the "edit" label; e.g. the "History" section text
                name_list = final_soup.findAll('h3', class_="title-text")
                all_name_list = {}
                na = "part_name" + str(i)
                all_name_list[na] = part_name
                final_name_list = []  ###########
                for nlist in name_list:
                    nlist = nlist.get_text().replace(view_name, '').strip()
                    final_name_list.append(nlist)
                fin = "final_name_list" + str(i)
                all_name_list[fin] = final_name_list
                print(all_name_list)
                i = i + 1
                # main text
                try:
                    p = re.compile(r'<div class="para-title level-3">',
                                   re.DOTALL)
                    final_soup = final_soup.decode('utf-8')
                    r = p.search(final_soup)
                    final_part_data = final_soup[r.span(0)[0]:]
                    part_lists = final_part_data.split(
                        '<div class="para-title level-3">')
                    for part_list in part_lists:
                        final_part_soup = BeautifulSoup(
                            part_list, "html.parser")
                        content_lists = final_part_soup.findAll("div",
                                                                class_="para")
                        for content_list in content_lists:  # each smallest paragraph
                            try:
                                pic_word = content_list.find(
                                    "div",
                                    class_="lemma-picture text-pic layout-right"
                                ).get_text()  # remove image captions from the text
                                try:
                                    pic_word2 = content_list.find(
                                        "div", class_="description").get_text(
                                        )  # remove image captions from the text
                                    content_list = content_list.get_text(
                                    ).replace(pic_word,
                                              '').replace(pic_word2, '')
                                except:
                                    content_list = content_list.get_text(
                                    ).replace(pic_word, '')

                            except:
                                try:
                                    pic_word2 = content_list.find(
                                        "div", class_="description").get_text(
                                        )  # remove image captions from the text
                                    content_list = content_list.get_text(
                                    ).replace(pic_word2, '')
                                except:
                                    content_list = content_list.get_text()
                            r_part = re.compile(r'\[\d.\]|\[\d\]')
                            part_result, number = re.subn(
                                r_part, "", content_list)
                            part_result = "".join(part_result.split())
                            #print(part_result)
                except:
                    final_part_soup = BeautifulSoup(list, "html.parser")
                    content_lists = final_part_soup.findAll("div",
                                                            class_="para")
                    for content_list in content_lists:
                        try:
                            pic_word = content_list.find(
                                "div",
                                class_="lemma-picture text-pic layout-right"
                            ).get_text()  # remove image captions from the text
                            try:
                                pic_word2 = content_list.find(
                                    "div", class_="description").get_text(
                                    )  # remove image captions from the text
                                content_list = content_list.get_text().replace(
                                    pic_word, '').replace(pic_word2, '')
                            except:
                                content_list = content_list.get_text().replace(
                                    pic_word, '')

                        except:
                            try:
                                pic_word2 = content_list.find(
                                    "div", class_="description").get_text(
                                    )  # remove image captions from the text
                                content_list = content_list.get_text().replace(
                                    pic_word2, '')
                            except:
                                content_list = content_list.get_text()
                        r_part = re.compile(r'\[\d.\]|\[\d\]')
                        part_result, number = re.subn(r_part, "", content_list)
                        part_result = "".join(part_result.split())
                        #print(part_result)

            except:
                print("error")
        return
Пример #49
0
    def clean_content(self):
        content = self.cleaned_data.get('content')
        from bs4 import BeautifulSoup
        legal_tag_dict = {
            'font': ['color', 'size', 'face', '.background-color'],
            'span': [
                '.color', '.background-color', '.font-size', '.font-family',
                '.background', '.font-weight', '.font-style',
                '.text-decoration', '.vertical-align', '.line-height'
            ],
            'div': [
                'align', '.border', '.margin', '.padding', '.text-align',
                '.color', '.background-color', '.font-size', '.font-family',
                '.font-weight', '.background', '.font-style',
                '.text-decoration', '.vertical-align', '.margin-left'
            ],
            'table': [
                'border', 'cellspacing', 'cellpadding', 'width', 'height',
                'align', 'bordercolor', '.padding', '.margin', '.border',
                'bgcolor', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.font-weight', '.font-style',
                '.text-decoration', '.background', '.width', '.height',
                '.border-collapse'
            ],
            'td': [
                'align', 'valign', 'width', 'height', 'colspan', 'rowspan',
                'bgcolor', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.font-weight', '.font-style',
                '.text-decoration', '.vertical-align', '.background', '.border'
            ],
            'th': [
                'align', 'valign', 'width', 'height', 'colspan', 'rowspan',
                'bgcolor', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.font-weight', '.font-style',
                '.text-decoration', '.vertical-align', '.background', '.border'
            ],
            'a': ['href', 'target', 'name'],
            'embed': [
                'src', 'width', 'height', 'type', 'loop', 'autostart',
                'quality', '.width', '.height', 'align', 'allowscriptaccess'
            ],
            'img': [
                'src', 'width', 'height', 'border', 'alt', 'title', 'align',
                '.width', '.height', '.border'
            ],
            'p': [
                'align', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.background', '.font-weight',
                '.font-style', '.text-decoration', '.vertical-align',
                '.text-indent', '.margin-left'
            ],
            'ol': [
                'align', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.background', '.font-weight',
                '.font-style', '.text-decoration', '.vertical-align',
                '.text-indent', '.margin-left'
            ],
            'ul': [
                'align', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.background', '.font-weight',
                '.font-style', '.text-decoration', '.vertical-align',
                '.text-indent', '.margin-left'
            ],
            'li': [
                'align', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.background', '.font-weight',
                '.font-style', '.text-decoration', '.vertical-align',
                '.text-indent', '.margin-left'
            ],
            'blockquote': [
                'align', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.background', '.font-weight',
                '.font-style', '.text-decoration', '.vertical-align',
                '.text-indent', '.margin-left'
            ],
            'h1': [
                'align', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.background', '.font-weight',
                '.font-style', '.text-decoration', '.vertical-align',
                '.text-indent', '.margin-left'
            ],
            'h2': [
                'align', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.background', '.font-weight',
                '.font-style', '.text-decoration', '.vertical-align',
                '.text-indent', '.margin-left'
            ],
            'h3': [
                'align', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.background', '.font-weight',
                '.font-style', '.text-decoration', '.vertical-align',
                '.text-indent', '.margin-left'
            ],
            'h4': [
                'align', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.background', '.font-weight',
                '.font-style', '.text-decoration', '.vertical-align',
                '.text-indent', '.margin-left'
            ],
            'h5': [
                'align', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.background', '.font-weight',
                '.font-style', '.text-decoration', '.vertical-align',
                '.text-indent', '.margin-left'
            ],
            'h6': [
                'align', '.text-align', '.color', '.background-color',
                '.font-size', '.font-family', '.background', '.font-weight',
                '.font-style', '.text-decoration', '.vertical-align',
                '.text-indent', '.margin-left'
            ],
            'pre': ['class'],
            'hr': ['class', '.page-break-after'],
            'br': [],
            'tbody': [],
            'tr': [],
            'strong': [],
            'b': [],
            'sub': [],
            'em': [],
            'i': [],
            'u': [],
            'strike': [],
            's': [],
            'del': [],
        }
        soup = BeautifulSoup(content, 'html.parser')
        tag_list = soup.find_all()

        for tag in tag_list:
            if tag.name not in legal_tag_dict:
                tag.decompose()
            else:
                l = []
                if tag.attrs:
                    print(123)
                    for attr in tag.attrs:
                        print(attr)
                        if attr not in legal_tag_dict[tag.name]:
                            l.append(attr)
                for i in l:
                    del tag.attrs[i]

        return soup.decode()
Пример #50
0
def clean(content):
    valid_tags = {
        'font': ['color', 'size', 'face', '.background-color'],
        'span': [
            '.color', '.background-color', '.font-size', '.font-family',
            '.background', '.font-weight', '.font-style', '.text-decoration',
            '.vertical-align', '.line-height'
        ],
        'div': [
            'align', '.border', '.margin', '.padding', '.text-align', '.color',
            '.background-color', '.font-size', '.font-family', '.font-weight',
            '.background', '.font-style', '.text-decoration',
            '.vertical-align', '.margin-left'
        ],
        'table': [
            'border', 'cellspacing', 'cellpadding', 'width', 'height', 'align',
            'bordercolor', '.padding', '.margin', '.border', 'bgcolor',
            '.text-align', '.color', '.background-color', '.font-size',
            '.font-family', '.font-weight', '.font-style', '.text-decoration',
            '.background', '.width', '.height', '.border-collapse'
        ],
        'td': [
            'align', 'valign', 'width', 'height', 'colspan', 'rowspan',
            'bgcolor', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.font-weight', '.font-style',
            '.text-decoration', '.vertical-align', '.background', '.border'
        ],
        'th': [
            'align', 'valign', 'width', 'height', 'colspan', 'rowspan',
            'bgcolor', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.font-weight', '.font-style',
            '.text-decoration', '.vertical-align', '.background', '.border'
        ],
        'a': ['href', 'target', 'name'],
        'embed': [
            'src', 'width', 'height', 'type', 'loop', 'autostart', 'quality',
            '.width', '.height', 'align', 'allowscriptaccess'
        ],
        'img': [
            'src', 'width', 'height', 'border', 'alt', 'title', 'align',
            '.width', '.height', '.border'
        ],
        'p': [
            'align', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.background', '.font-weight',
            '.font-style', '.text-decoration', '.vertical-align',
            '.text-indent', '.margin-left'
        ],
        'ol': [
            'align', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.background', '.font-weight',
            '.font-style', '.text-decoration', '.vertical-align',
            '.text-indent', '.margin-left'
        ],
        'ul': [
            'align', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.background', '.font-weight',
            '.font-style', '.text-decoration', '.vertical-align',
            '.text-indent', '.margin-left'
        ],
        'li': [
            'align', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.background', '.font-weight',
            '.font-style', '.text-decoration', '.vertical-align',
            '.text-indent', '.margin-left'
        ],
        'blockquote': [
            'align', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.background', '.font-weight',
            '.font-style', '.text-decoration', '.vertical-align',
            '.text-indent', '.margin-left'
        ],
        'h1': [
            'align', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.background', '.font-weight',
            '.font-style', '.text-decoration', '.vertical-align',
            '.text-indent', '.margin-left'
        ],
        'h2': [
            'align', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.background', '.font-weight',
            '.font-style', '.text-decoration', '.vertical-align',
            '.text-indent', '.margin-left'
        ],
        'h3': [
            'align', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.background', '.font-weight',
            '.font-style', '.text-decoration', '.vertical-align',
            '.text-indent', '.margin-left'
        ],
        'h4': [
            'align', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.background', '.font-weight',
            '.font-style', '.text-decoration', '.vertical-align',
            '.text-indent', '.margin-left'
        ],
        'h5': [
            'align', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.background', '.font-weight',
            '.font-style', '.text-decoration', '.vertical-align',
            '.text-indent', '.margin-left'
        ],
        'h6': [
            'align', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.background', '.font-weight',
            '.font-style', '.text-decoration', '.vertical-align',
            '.text-indent', '.margin-left'
        ],
        'pre': ['class'],
        'hr': ['class', '.page-break-after'],
        'br': [],
        'tbody': [],
        'tr': [],
        'strong': [],
        'b': [],
        'sub': [],
        'sup': [],
        'em': [],
        'i': [],
        'u': [],
        'strike': [],
        's': [],
        'del': []
    }
    soup = BeautifulSoup(content, 'html.parser')
    tags = soup.find_all()
    for tag in tags:
        # Drop tags that are not in the whitelist
        if tag.name not in valid_tags:
            tag.decompose()
            continue
        # Strip attributes that are not whitelisted for this tag
        if tag.attrs:
            for k in list(tag.attrs.keys()):
                if k not in valid_tags[tag.name]:
                    del tag.attrs[k]
        # Force images to scale with their container
        if tag.name == 'img':
            tag['style'] = 'width:100%'

    content = soup.decode()
    return content
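The loop above is the core of a whitelist sanitizer: tags that are not listed in valid_tags are removed outright, and surviving tags keep only the attributes whitelisted for them. A minimal self-contained sketch of the same idea, with a tiny whitelist and a made-up input (both chosen purely for illustration):

from bs4 import BeautifulSoup

# Tiny stand-in whitelist; the real one above is much larger.
allowed = {'p': ['align'], 'a': ['href'], 'b': []}

html = '<p align="center" onclick="x()">hi <script>evil()</script><a href="/a" id="z">go</a></p>'
soup = BeautifulSoup(html, 'html.parser')

for tag in soup.find_all():
    if tag.name not in allowed:
        tag.decompose()          # unknown tag: drop it together with its contents
        continue
    for attr in list(tag.attrs):
        if attr not in allowed[tag.name]:
            del tag.attrs[attr]  # unknown attribute: strip it

print(soup.decode())
# -> <p align="center">hi <a href="/a">go</a></p>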
Пример #51
0
def process(filepath):
    # print "in process"
    print(filepath)
    with open(filepath, 'rb') as f:
        # print "opened " + filepath
        l = filepath.split('/')
        name = ''
        if (l[len(l) - 2]) == 'build':
            name = l[len(l) - 1]
        s = f.read()
        #s = s.replace(find, replace)
        s = s.replace(b"index.html", b"")
        s = s.replace(b"<html>", b"<!DOCTYPE html lang=\"en\">")
        s = s.replace(
            b'<meta',
            b"<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\"><meta"
        )
        soup = BeautifulSoup(s, "lxml")

        for i in soup.find_all("table", attrs={"summary":
                                               "Navigation header"}):
            i.contents[0].contents[0].clear()
            if name == "index.html":
                link = BeautifulSoup("<a href=\"ix01.html\">Index</a>", "lxml")
            elif name == "ix01.html":
                link = BeautifulSoup("", "lxml")
            else:
                link = BeautifulSoup("<a href=\"../ix01.html\">Index</a>",
                                     "lxml")
                i.contents[0].contents[0].insert(0, link)
            if name == "index.html":
                link = BeautifulSoup("", "lxml")
            elif name == "ix01.html":
                link = BeautifulSoup(
                    "<a href=\"index.html\">Table of Contents</a>", "lxml")
            else:
                link = BeautifulSoup("<a href=\"../\">Table of Contents</a>",
                                     "lxml")
                i.contents[1].contents[1].insert(0, link)
        soup = BeautifulSoup(soup.renderContents(), "lxml")
        for j in soup.findAll("table", attrs={"summary": "Navigation footer"}):
            if name == "index.html":
                link = BeautifulSoup("<a href=\"ix01.html\">Index</a>", "lxml")
            elif name == "ix01.html":
                link = BeautifulSoup("", "lxml")
            else:
                link = BeautifulSoup("<a href=\"../ix01.html\">Index</a>",
                                     "lxml")
            j.contents[0].contents[1].insert(0, link)
            if name == "index.html":
                link = BeautifulSoup("", "lxml")
            elif name == "ix01.html":
                link = BeautifulSoup(
                    "<a href=\"index.html\">Table of Contents</a>", "lxml")
            else:
                link = BeautifulSoup("<a href=\"../\">Table of Contents</a>",
                                     "lxml")
            j.contents[1].contents[1].clear()
            j.contents[1].contents[1].insert(0, link)
            # Now mathjax removed
        # p = BeautifulSoup("<h3><a href='/'>Site Home</a></h3><p class='alert alert-danger'>Please see <a href=\"http://caniuse.com/#feat=mathml\">http://caniuse.com/#feat=mathml</a> if your browser supports MathML because certain sections of this book rely on MathML. If your browser does not support MathML please install Firefox from <a href=\"https://www.mozilla.org\">Mozilla</a> because AFAIK Firefox supports MathML. On other browsers Mathjax will take its sweet time to render page.</p>", "lxml")
        #soup.body.insert(0, p)
        soup = BeautifulSoup(soup.renderContents(), "lxml")
        for i in soup.find_all("pre", attrs={"class": "CLexer"}):
            code = BeautifulSoup(
                highlight(i.string, CLexer(), HtmlFormatter()), "lxml")

            i.string.replace_with(code)
        for i in soup.find_all("span", attrs={"class": "mathphrase"}):
            math = BeautifulSoup(render_latex(i.string), "lxml")
            i.string.replace_with(math)
        with open(filepath, "w") as f:
            f.write(soup.decode(formatter='html'))
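process() rewrites a single generated HTML file in place; judging from the filepath.split('/') check, it is meant to run over every file produced under a build directory. A hedged driver sketch under that assumption (the directory name and the .html filter are guesses, not taken from the original):

import os

def process_tree(root='build'):
    # Walk the assumed build directory and run process() on every HTML file.
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith('.html'):
                process(os.path.join(dirpath, filename))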
Пример #52
0
from bs4 import BeautifulSoup

# Tag whitelist: allowed tags and, for each, the attributes it may keep
valid_tag = {'p': ['class', 'id'], 'img': ['src'], 'div': ['class']}

# The snippet's original input HTML is not shown in this excerpt; the sample
# below is a stand-in so the fragment runs on its own.
content = ('<p id="i1" onclick="alert(1)">hello</p>'
           '<div class="c1"><script>alert("xss")</script></div>'
           '<img src="1.png" onerror="alert(2)">')
soup = BeautifulSoup(content, 'html.parser')

# Tag.decompose() removes the node from the document tree and destroys it completely.

# Collect every tag in the document
tags = soup.find_all()

for tag in tags:
    # print('tag--------', tag)
    if tag.name not in valid_tag:
        # print(tag.attrs)  # inspect the attributes
        tag.decompose()
    elif tag.attrs:  # does the tag carry attributes?
        for k in list(tag.attrs.keys()):  # e.g. {'id': 'i1', 'a': 123, 'b': 999}
            if k not in valid_tag[tag.name]:
                del tag.attrs[k]

content_str = soup.decode()
print(content_str)

# v = soup.find(name='p',attrs={'id':'i2'})
# print(v)

# tag = soup.find(name='p')
# sc = tag.find('script')
# print(sc)

Пример #53
0
#print(title)
#print(title[0])
#abstract=re.findall('<div id="abstract">\n\s*(.*?)\n\s*</div>',page.decode('utf-8'),re.S)
#print(abstract)
cnt = 0
for link in page.find_all("dt", class_="ptitle"):
    uuu = link.find('a')
    url = str(uuu.get('href'))
    #print(type(url))
    urll = "http://openaccess.thecvf.com/"
    urll = urll + url
    print(urll)
    file1 = urllib.request.urlopen(urll).read()
    page1 = BeautifulSoup(file1, "html.parser")
    title = re.findall(r'<div id="papertitle">\n\s*(.*?)\n\s*</div>',
                       page1.decode('utf-8'), re.S)
    abstract = re.findall(r'<div id="abstract">\n\s*(.*?)\n\s*</div>',
                          page1.decode('utf-8'), re.S)
    if cnt != 0:
        f.write('\n')
        f.write('\n')
    f.write(str(cnt))
    f.write('\n')

    f.write("Title: " + title[0])
    f.write('\n')
    f.write("Abstract: " + abstract[0])

    f.write('\n')

    cnt = cnt + 1
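This fragment assumes that page (the parsed listing page of the CVF open-access site) and f (an output file handle) were created earlier in the original script, which is not shown here. A minimal setup sketch under that assumption; the listing URL and output filename are illustrative only:

import re
import urllib.request
from bs4 import BeautifulSoup

listing_url = "http://openaccess.thecvf.com/CVPR2019"      # assumed listing page
html = urllib.request.urlopen(listing_url).read()
page = BeautifulSoup(html, "html.parser")                   # consumed by the loop above
f = open("papers.txt", "w", encoding="utf-8")               # written to by the loop above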
Пример #54
0
    def parse(self, response):
        item = WikiItem()
        title = response.xpath(
            '//h1[@id="firstHeading"]/text()').extract_first()
        item['title'] = title
        item['url'] = response.url
        # tr_list = response.xpath('//table[@class="infobox vcard"]/tr')
        tr_list = response.css('.infobox tr')
        image = tr_list.xpath('//a[@class="image"]/img/@src').extract_first()
        if image is not None:
            item['image'] = "https:" + image

        r_part = re.compile(r'\[\d.\]|\[\d\]')

        # The infobox table on the right-hand side of the page
        info_box = []
        for tr in tr_list:
            th = tr.xpath('./th[@scope="row"]//text()').extract_first()
            if th is not None:
                td = re.sub(r_part, "",
                            "".join(tr.xpath('./td//text()').extract()))
                info_box.append({'key': th, 'value': stripTagSimple(td)})
        print(info_box)
        # print(title)

        pic = []
        thumb_tright = response.xpath(
            '//div[@class="thumb tright"]/div[@class="thumbinner"]')
        for p in thumb_tright:
            if p.xpath('./a/img/@src').extract_first() is not None:
                img = 'https:' + p.xpath('./a/img/@src').extract_first()
                img_desc = re.sub(
                    r_part, "", "".join(
                        p.xpath(
                            './div[@class="thumbcaption"]//text()').extract()))
                pic.append({'url': img, 'img_desc': stripTagSimple(img_desc)})
        # print(pic)
        item['pic'] = pic

        html_content = response.xpath(
            '//div[@id="mw-content-text"]').extract_first()
        soup = BeautifulSoup(html_content, 'html.parser')
        # Remove the table-of-contents node
        catalog = soup.find('div', class_="toc")
        if catalog is not None:
            soup.find('div', class_="toc").decompose()
        # Remove the references node
        ref = soup.find('ol', class_="references")
        if ref is not None:
            soup.find('ol', class_="references").decompose()

        # ps holds all top-level paragraphs in the article body
        div = soup.find(name='div', class_='mw-parser-output')
        ps = div.find_all('p', recursive=False)  # only direct children
        index = 0
        for p in ps:
            if p.get_text() == '':
                break
            index += 1
        summary = {}
        s_index = 0
        while s_index < index:
            summary[f'{s_index}'] = stripTagSimple(ps[s_index].get_text())
            s_index += 1
        print(summary)

        start = re.compile(r'<p>', re.DOTALL)
        search_result = start.search(soup.decode('utf-8'))
        if search_result is None:
            search_result = re.compile(r'<h2>',
                                       re.DOTALL).search(soup.decode('utf-8'))
        content_text = collections.OrderedDict()
        if search_result is not None:
            start_node = soup.decode('utf-8')[search_result.start():]
            lists = start_node.split('<h2>')

            i = 1
            while i < len(lists):
                lists[i] = '<h2>' + lists[i]
                final_soup = BeautifulSoup(lists[i], 'html.parser')
                para_title = final_soup.find(
                    'span', class_="mw-headline").get_text().strip()
                if para_title == "外部链接" or "参考" in para_title:
                    i += 1
                    continue
                para_contents = final_soup.find_all(['p', 'li', 'table'])
                texts = []
                for para in para_contents:
                    if para.name == 'table':
                        texts.append(para.prettify())
                        continue
                    texts.append(stripTagSimple(para.get_text('', True)))
                content_text[para_title.replace('.', '点')] = texts
                i += 1
            catlinks = response.xpath(
                '//div[@class="catlinks"]/div[@id="mw-normal-catlinks"]//li')

            tag = {}
            j = 0
            for link in catlinks:
                href = 'https://zh.wikipedia.org' + link.xpath(
                    './a/@href').extract_first()
                cat = link.xpath('./a/text()').extract_first()
                tag[f'{j}'] = cat
                j += 1

            detail = {
                'title': title,
                'summary': summary,
                'infobox': info_box,
                'content': content_text,
                'category': tag,
            }
            item['detail'] = detail
            now_time = datetime.datetime.fromtimestamp(time.time())
            item['updateAt'] = now_time
            return item
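parse() leans on a helper stripTagSimple() that is not included in this excerpt; judging by how it is used (cleaning text pulled out of table cells, captions and paragraphs), a minimal sketch might simply collapse whitespace. The name comes from the code above, but this body is an assumption:

import re

def stripTagSimple(text):
    # Assumed implementation: collapse runs of whitespace and trim the ends.
    # The original helper may do more (e.g. strip residual markup).
    return re.sub(r'\s+', ' ', text).strip()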
Пример #55
0
def get_latest_release(distro, release):
    # Instantiate ImageObject
    image = ImageObject()
    if distro == 'ubuntu':
        # Define URL & other variables (defines all because of clarity)
        url = 'https://cloud-images.ubuntu.com/releases/{rel}/'.format(rel=release, )
        hashfile_url = 'SHA256SUMS'
        image_hash = None
        image_name = None
        image_url = None
        latest_build = None
        image_suffix = None

        # Retrieve the latest image url
        match_list = []  # Define match_list for later use
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        compile_string = 'release-'
        builds = soup.find_all(href=re.compile(compile_string))
        for build in builds:
            match_list.append(build.string.replace('/', ''))  # Remove forward slash in the strings
        # Sort alphanumerically
        sorted_builds = sorted(match_list, key=lambda item: (int(item.partition(' ')[0])
                                                             if item[0].isdigit() else float('inf'), item))
        latest_build = sorted_builds[-1]  # It is nicer to set a variable that is the last in the list
        if release <= '16.04':
            # We want to send back the image name
            image_name = 'ubuntu-{release}-server-cloudimg-amd64-disk1.img'.format(release=release)
            # We also want to send back the url that was used
            image_url = '{url}{build}/ubuntu-{release}-server-cloudimg-amd64-disk1.img'.format(url=url,
                                                                                               build=latest_build,
                                                                                               release=release)
            image_suffix = 'img'
        elif release >= '18.04':
            # We want to send back the image name
            image_name = 'ubuntu-{release}-server-cloudimg-amd64.img'.format(release=release)
            # We also want to send back the url that was used
            image_url = '{url}{build}/ubuntu-{release}-server-cloudimg-amd64.img'.format(url=url,
                                                                                         build=latest_build,
                                                                                         release=release)
            image_suffix = 'img'

        # Get sha256 to compare with database
        page = requests.get('{url}{build}/{hashfile}'.format(url=url, build=latest_build, hashfile=hashfile_url))
        soup = BeautifulSoup(page.text, 'html.parser')
        hash_list = soup.decode().split("\n")
        hash_list.pop(-1)
        for hash in hash_list:
            search_string = '{}$'.format(image_name)
            if re.search(search_string, hash):
                image_hash = hash.split(' ')[0]

        # v2: ImageObject
        image.name = image_name
        image.sha256 = image_hash
        image.source_url = image_url
        image.build = latest_build
        image.file_suffix = image_suffix

    elif distro == 'centos':
        # Define URL & other variables (defines all because of clarity)
        url = 'http://cloud.centos.org/centos/{rel}/images/'.format(rel=release)
        hashfile_url = 'sha256sum.txt'
        image_name = ""
        image_hash = ""
        image_url = ""
        latest_build = ""

        # Retrieve the latest image url
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        builds = soup.find_all(href=re.compile('CentOS-[0-9]-x86_64-GenericCloud-[0-9]{4}.qcow2'))

        match_list = []
        for build in builds:
            pattern1 = re.compile(r'(CentOS-[0-9]-x86_64-GenericCloud-[0-9]{4}.qcow2c)|CentOS-[0-9]-x86_64-GenericCloud-[0-9]{4}.qcow2.xz')
            match_all_other_files = pattern1.match(build.contents[0])

            if match_all_other_files:
                continue
                #print('DELETE {}'.format(match_all_other_files.group(0)))
            else:
                match_list.append(build.contents[0])
        # Sort alphanumerically
        sorted_builds = sorted(match_list, key=lambda item: (int(item.partition(' ')[0])
                                                            if item[0].isdigit() else float('inf'), item))
        # It is nicer to set a variable that is the last in the list
        latest_build = sorted_builds[-1].split('-')[4].replace('.qcow2', '')
        image_url = '{url}CentOS-{release}-x86_64-GenericCloud-{build}.qcow2'.format(url=url,
                                                                                     release=release,
                                                                                     build=latest_build)
        image_suffix = 'qcow2'
        # Get sha256 to compare with database
        page = requests.get('{url}{hashfile}'.format(url=url, hashfile=hashfile_url))
        soup = BeautifulSoup(page.text, 'html.parser')
        hash_list = soup.decode().split("\n")
        hash_list.pop(-1)
        for hash in hash_list:
            search_string = 'CentOS-{release}-x86_64-GenericCloud-{build}.qcow2$'.format(release=release, build=latest_build)
            if re.search(search_string, hash):
                image_hash = hash.split('  ')[0]
                image_name = hash.split('  ')[1]

        # v2: ImageObject
        image.name = image_name
        image.sha256 = image_hash
        image.source_url = image_url
        image.build = latest_build
        image.file_suffix = image_suffix

    return image
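A short usage sketch for get_latest_release(); the release strings below are examples, and ImageObject is assumed to be a plain attribute container, as the assignments above suggest:

# Illustrative calls; the chosen releases are not values from the original.
ubuntu_image = get_latest_release('ubuntu', '18.04')
centos_image = get_latest_release('centos', '7')

for img in (ubuntu_image, centos_image):
    print(img.name, img.build, img.sha256)
    print('  source:', img.source_url, '(suffix: %s)' % img.file_suffix)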
Пример #56
0
import urllib2
from bs4 import BeautifulSoup

import zlib

# f=urllib2.urlopen(url)

quote_page = 'http://www.emmacloth.com/Tassel-Trim-Dolphin-Hem-Striped-Tee-Dress-p-356028-cat-1727.html'
page = urllib2.urlopen(quote_page)
decompressed_data = zlib.decompress(page.read(), 16 + zlib.MAX_WBITS)
#print decompressed_data

soup = BeautifulSoup(decompressed_data, 'html.parser')  # parse the decompressed HTML
print soup.decode()

json_string = soup.find_all('script')
print "The json string  = ", json_string
Пример #57
0
def test_rddd001_initial_state(dash_duo):
    app = dash.Dash(__name__)
    my_class_attrs = {
        "id": "p.c.4",
        "className": "my-class",
        "title": "tooltip",
        "style": {
            "color": "red",
            "fontSize": 30
        },
    }
    # fmt:off
    app.layout = html.Div([
        'Basic string', 3.14, True, None,
        html.Div('Child div with basic string', **my_class_attrs),
        html.Div(id='p.c.5'),
        html.Div([
            html.Div('Grandchild div', id='p.c.6.p.c.0'),
            html.Div([
                html.Div('Great grandchild', id='p.c.6.p.c.1.p.c.0'), 3.14159,
                'another basic string'
            ],
                     id='p.c.6.p.c.1'),
            html.Div([
                html.Div(html.Div([
                    html.Div([
                        html.Div(id='p.c.6.p.c.2.p.c.0.p.c.p.c.0.p.c.0'), '',
                        html.Div(id='p.c.6.p.c.2.p.c.0.p.c.p.c.0.p.c.2')
                    ],
                             id='p.c.6.p.c.2.p.c.0.p.c.p.c.0')
                ],
                                  id='p.c.6.p.c.2.p.c.0.p.c'),
                         id='p.c.6.p.c.2.p.c.0')
            ],
                     id='p.c.6.p.c.2')
        ],
                 id='p.c.6')
    ])
    # fmt:on

    dash_duo.start_server(app)

    # Note: this .html file shows there's no undo/redo button by default
    with open(
            os.path.join(os.path.dirname(__file__),
                         "initial_state_dash_app_content.html")) as fp:
        expected_dom = BeautifulSoup(fp.read().strip(), "lxml")

    fetched_dom = dash_duo.dash_outerhtml_dom

    assert (fetched_dom.decode() == expected_dom.decode()
            ), "the fetching rendered dom is expected"

    assert dash_duo.get_logs(
    ) == [], "Check that no errors or warnings were displayed"

    assert dash_duo.driver.execute_script(
        "return JSON.parse(JSON.stringify(window.store.getState().layout))"
    ) == json.loads(json.dumps(app.layout, cls=plotly.utils.PlotlyJSONEncoder)
                    ), "the state layout is identical to app.layout"

    r = requests.get("{}/_dash-dependencies".format(dash_duo.server_url))
    assert r.status_code == 200
    assert r.json(
    ) == [], "no dependencies present in app as no callbacks are defined"

    paths = dash_duo.redux_state_paths
    assert paths["objs"] == {}
    assert paths["strs"] == {
        abbr: [
            int(token) if token in string.digits else token.replace(
                "p", "props").replace("c", "children")
            for token in abbr.split(".")
        ]
        for abbr in (child.get("id") for child in fetched_dom.find(
            id="react-entry-point").findChildren(id=True))
    }, "paths should reflect to the component hierarchy"

    assert dash_duo.redux_state_rqs == [], "no callback => no pendingCallbacks"

    dash_duo.percy_snapshot(name="layout")
    assert dash_duo.get_logs() == [], "console has no errors"
Пример #58
0
    async def generate_page(self):
        soup = BeautifulSoup(self.xml, 'lxml')

        for tag in soup.find_all(recursive=True):
            try:
                # add linebreak after certain tags
                if tag.name in TELEGRAPH_TAGS_INSERT_BR_AFTER:
                    tag.insert_after(soup.new_tag('br'))

                # remove tags that are not allowed in <li>
                if tag.name == 'li':
                    disallowed_tags = tag.find_all(
                        TELEGRAPH_DISALLOWED_TAGS_IN_LI, recursive=True)
                    for disallowed_tag in disallowed_tags:
                        disallowed_tag.replaceWithChildren()

                # deal with tags itself
                if tag.name in TELEGRAPH_DEL_TAGS:
                    if tag.name == 'table':
                        rows = tag.find_all('tr')
                        if not rows:
                            tag.decompose()
                            continue
                        for row in rows:
                            columns = row.find_all(('td', 'th'))
                            if len(columns) != 1:
                                if env.TABLE_TO_IMAGE:
                                    table_img = await convert_table_to_png(
                                        str(tag))
                                    if table_img:
                                        url_l = await apis.get_account(
                                        ).upload(BytesIO(table_img),
                                                 full=False)
                                        url = url_l[0] if url_l else None
                                        if url:
                                            tag.replaceWith(
                                                soup.new_tag('img', src=url))
                                            continue
                                tag.decompose()
                                continue
                        tag.replaceWithChildren()
                    else:
                        tag.decompose()
                    continue
                elif tag.name in TELEGRAPH_REPLACE_TAGS:
                    old_name = tag.name
                    new_name = TELEGRAPH_REPLACE_TAGS[old_name]
                    tag.name = new_name
                    if old_name.startswith('h') and not new_name.startswith(
                            'h') and new_name != 'p':
                        # ensure take a whole line
                        tag.insert_before(soup.new_tag('br')) \
                            if (hasattr(tag.previous_sibling, 'name')
                                and tag.previous_sibling.name not in {'br', 'p'}
                                and not tag.previous_sibling.name.startswith('h')) \
                            else None
                        tag.insert_after(soup.new_tag('br'))
                elif tag.name not in TELEGRAPH_ALLOWED_TAGS:
                    tag.replaceWithChildren()  # remove disallowed tags
                    continue

                # verify tags
                if tag.name == 'a' and not tag.text:
                    tag.replaceWithChildren()  # remove invalid links
                    continue
                elif tag.name == 'img' and is_emoticon(tag):
                    alt = tag.get('alt')
                    tag.replaceWith(
                        alt) if alt else tag.decompose()  # drop emoticon
                    continue

                # deal with attributes
                if tag.name not in TELEGRAPH_TAGS_ALLOW_ATTR:
                    tag.attrs = {}  # remove all attributes
                    continue
                else:
                    attr_name = TELEGRAPH_TAGS_ALLOW_ATTR[tag.name]
                    attr_content = tag.attrs.get(attr_name)
                    if not attr_content:
                        tag.replaceWithChildren()
                        continue
                    if self.link:
                        attr_content = resolve_relative_link(
                            self.link, attr_content)
                    if not isAbsoluteHttpLink(attr_content):
                        tag.replaceWithChildren()
                        continue
                    if tag.name in {
                            'video', 'img'
                    } and not attr_content.startswith(env.IMG_RELAY_SERVER):
                        attr_content = env.IMG_RELAY_SERVER + attr_content
                    tag.attrs = {attr_name: attr_content}
            except (ValueError, AttributeError):
                pass

        if self.feed_title:
            self.telegraph_author = f"{self.feed_title}"
            if self.author and self.author not in self.feed_title:
                self.telegraph_author += f' ({self.author})'
            self.telegraph_author_url = self.link if self.link else ''
        else:
            self.telegraph_author = 'Generated by RSStT'
            self.telegraph_author_url = 'https://github.com/Rongronggg9/RSS-to-Telegram-Bot'

        self.telegraph_title = self.title if self.title else 'Generated by RSStT'
        self.telegraph_html_content = (
            soup.decode() + '<p>Generated by '
            '<a href="https://github.com/Rongronggg9/RSS-to-Telegram-Bot">RSStT</a>. '
            'The copyright belongs to the original author.</p>'
            # "If images cannot be loaded properly due to anti-hotlinking, "
            # "please consider install "
            # "<a href='https://greasyfork.org/scripts/432923'>this userscript</a>."
            +
            (f'<p><a href="{self.link}">Source</a></p>' if self.link else ''))
Пример #59
0
def filter_xss(html_str):
    # valid_tag_list = ["p", "div", "a", "img", "html", "body", "br", "strong", "b"]

    valid_dict = {
        "font": ['color', 'size', 'face', '.background-color'],
        "span": [
            '.color', '.background-color', '.font-size', '.font-family',
            '.background', '.font-weight', '.font-style', '.text-decoration',
            '.vertical-align', '.line-height'
        ],
        "div": [
            'align', '.border', '.margin', '.padding', '.text-align', '.color',
            '.background-color', '.font-size', '.font-family', '.font-weight',
            '.background', '.font-style', '.text-decoration',
            '.vertical-align', '.margin-left'
        ],
        "table": [
            'border', 'cellspacing', 'cellpadding', 'width', 'height', 'align',
            'bordercolor', '.padding', '.margin', '.border', 'bgcolor',
            '.text-align', '.color', '.background-color', '.font-size',
            '.font-family', '.font-weight', '.font-style', '.text-decoration',
            '.background', '.width', '.height', '.border-collapse'
        ],
        'td,th': [
            'align', 'valign', 'width', 'height', 'colspan', 'rowspan',
            'bgcolor', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.font-weight', '.font-style',
            '.text-decoration', '.vertical-align', '.background', '.border'
        ],
        "a": ['href', 'target', 'name'],
        "embed": [
            'src', 'width', 'height', 'type', 'loop', 'autostart', 'quality',
            '.width', '.height', 'align', 'allowscriptaccess'
        ],
        "img": [
            'src', 'width', 'height', 'border', 'alt', 'title', 'align',
            '.width', '.height', '.border'
        ],
        'p,ol,ul,li,blockquote,h1,h2,h3,h4,h5,h6': [
            'align', '.text-align', '.color', '.background-color',
            '.font-size', '.font-family', '.background', '.font-weight',
            '.font-style', '.text-decoration', '.vertical-align',
            '.text-indent', '.margin-left'
        ],
        "pre": ['class'],
        "hr": ['class', '.page-break-after'],
        'br,tbody,tr,strong,b,sub,sup,em,i,u,strike,s,del': []
    }

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_str, "html.parser")  # soup  ----->  document

    # The whitelist above groups several tag names under comma-separated keys
    # (e.g. 'td,th'); expand them so lookups by ele.name work as intended.
    valid_dict = {tag: attrs
                  for tags, attrs in valid_dict.items()
                  for tag in tags.split(',')}

    # The whitelist was converted into a dict
    for ele in soup.find_all():
        # Drop tags that are not whitelisted
        if ele.name not in valid_dict:
            ele.decompose()
        # Strip attributes that are not whitelisted for this tag
        else:
            attrs = ele.attrs  # e.g. for a <p>: {"id": 12, "class": "d1", "egon": "dog"}
            l = []
            for k in attrs:
                if k not in valid_dict[ele.name]:
                    l.append(k)

            for i in l:
                del attrs[i]

    print('soup', soup)

    return soup.decode()
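A quick usage sketch for filter_xss(); the input HTML below is made up for illustration:

dirty = ('<p onclick="steal()">hello</p>'
         '<script>alert("xss")</script>'
         '<a href="/home" onmouseover="bad()">link</a>')
clean = filter_xss(dirty)
# The <script> tag is removed entirely, onclick/onmouseover are stripped,
# and the whitelisted href attribute survives.
print(clean)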
Пример #60
0
<span style="font-size: 8px">testspan</span>
<script>alter('123')</script>
'''

# Blacklist approach: delete matching tags and attributes
# Parse the HTML held in content
soup = BeautifulSoup(content, 'html.parser')
# Find the <script> tag inside content
tag = soup.find('script')
# Print the tag that was found
print('matched <script> tag:', tag)
# Empty the tag object, i.e. clear the contents of the matched <script> tag;
# the <script> tag itself is kept.
tag.clear()
print('before decode:', content)
# Serialise the tree back to a string; after this the alter('123') inside <script></script> is gone.
content = soup.decode()
print('after decode, script contents cleared:', content)
# Hide the matched <script> tag itself
tag.hidden = True
content = soup.decode()
print('script tag hidden:', content)

span = soup.find('span')
# The span tag's attributes, returned as a dict
print('span tag attributes:', span.attrs)
del span.attrs['style']
print('span after deleting its style attribute:', span)

print(content)

# Tag whitelist