def create_article(title, content, owner_login, user_suggestion):
    """Create an article (or a draft suggestion) from Markdown content.

    Args:
        title: Article title; HTML-escaped before storage.
        content: Markdown source for the article body.
        owner_login: Login name of the user submitting the article.
        user_suggestion: If not None, the login name of the user the
            article is suggested to; that user becomes the owner and
            the article is stored as a draft.

    Returns:
        False if title/content exceed the length limits, True otherwise.
    """
    if len(content) > MAX_ARTICLE_CONTENT_LENGTH or len(title) > MAX_TITLE_LENGTH:
        return False
    user = User.get(User.name == owner_login)
    if user_suggestion is None:
        owner = user
    else:
        owner = User.get(User.name == user_suggestion)
    # NOTE: safe_mode is deprecated in recent Python-Markdown; kept here to
    # preserve the existing escaping behaviour.
    html_content = markdown.markdown(
        content, safe_mode='escape', extensions=[TocExtension(baselevel=3)])
    stripped_text = strip_html_tags(html_content)
    preview_text = stripped_text[:MAX_ARTICLE_PREVIEW_TEXT_LENGTH]
    if len(stripped_text) > MAX_ARTICLE_PREVIEW_TEXT_LENGTH:
        preview_text += '...'
    # Article.create() already persists the row; the original code also
    # called .save() on the result, issuing a redundant second write.
    Article.create(
        title=html.escape(title),
        content=html_content,
        preview_text=preview_text,
        owner=owner,
        is_draft=user_suggestion is not None)
    if user_suggestion is None:
        # Atomic server-side increment. The original read
        # user.articles_count into a local and wrote count+1 back, which
        # loses updates when two articles are created concurrently.
        User\
            .update({User.articles_count: User.articles_count + 1})\
            .where(User.id == owner)\
            .execute()
    return True
def getibiblio(word):
    """Fetch *word*'s definition from ibiblio's Webster gateway.

    The word is plus-encoded into the query string; the text between the
    first <def>...</def> pair is extracted, runs of spaces are collapsed,
    a trailing period is appended, and HTML tags are stripped.
    """
    query_url = ('http://www.ibiblio.org/webster/cgi-bin/headword_search.pl?query='
                 + word.replace(' ', '+'))
    page = urllib.request.urlopen(query_url).read().decode()
    definition = utils.find_between(page, '<def>', '</def>').strip()
    definition = re.sub(' +', ' ', definition) + '.'
    return utils.strip_html_tags(definition)
def parse_post(post):
    """Split a post into plain text, its URLs, and its image CDN URLs.

    Strips HTML tags from the raw content, removes every extracted URL
    from the text, and collects image 'cdnUrl' values when the post has
    an image entity list (empty list otherwise).

    Returns:
        (content, links, images) tuple.
    """
    content = strip_html_tags(post['the_post']['rawContent'])
    links = extract_urls(content)
    for link in links:
        content = content.replace(link, '')
    # Any missing key along the way (entities, images, or cdnUrl) means
    # "no images" rather than an error.
    try:
        images = [image['cdnUrl']
                  for image in post['the_post']['entities']['images']]
    except KeyError:
        images = []
    return content, links, images
def broadcast_format_for_adn(feed, entry):
    """Build an App.net Broadcast post dict for a feed entry.

    The post's text is taken from the entry's og/twitter meta description
    (HTML-stripped); when no description is available the post is marked
    machine_only. Annotations carry metadata, cross-post link, thumbnail,
    and the common entry annotations.

    NOTE(review): uses `unicode`, so this is Python 2 code.
    """
    #summary = clean_html(entry.summary)
    # Easy path for now, leave some space at the end for cleaning up of broken HTML
    #summary = ellipse_text(summary, 2000)
    # This will try and fix broken html
    #summary = html.tostring(html.fromstring(summary))
    link = format_link_for_entry(feed, entry, medium='App.net Broadcast')
    post = {
        'annotations': [metadata_annotation(entry), cross_post_annotation(link)]
    }
    post['annotations'] += common_annotations(entry)
    # Thumbnail annotation only when the feed opts in AND the entry has one.
    if feed.include_thumb and entry.thumbnail_image_url:
        post['annotations'].append(image_annotation_for_entry(entry))
    description = None
    if entry.meta_tags:
        # Prefer the Open Graph description, fall back to Twitter's.
        og_description = entry.meta_tags.get('og', {}).get('description')
        twitter_description = entry.meta_tags.get('twitter', {}).get('description')
        description = og_description or twitter_description
    if description:
        # logger.info("What are we striping html from: %s %s", description, type(description))
        description = unicode(description)
        description = strip_html_tags(description)
    # Stripping may leave an empty string, hence the second truthiness check.
    if description:
        post['text'] = description
    else:
        post['machine_only'] = True
    if feed.cross_post_to_defaults:
        post['publish_to'] = {'defaults': True}
    return post
def main(options, profile):
    """Run one scan: match threads, print a report, refresh the URL cache.

    Loads match groups from chtf-conf.yaml, iterates matching threads per
    group, prints one formatted line per thread (optionally colourised),
    saves/open URLs per group settings, and finally rewrites the
    previous-run URL cache and prints summary statistics.

    NOTE(review): `profile` is not used in this body — confirm whether it
    is consumed elsewhere or dead.
    """
    try:
        match_groups = parse_config.parse_file(os.path.join(options["main_dir_path"], "chtf-conf.yaml"))
    except parse_config.ConfigError as err:
        log.critical("{}.".format(err))
        utils.exit(1)
    if options["output_oneline"]:
        # Pad every group name to the longest one so columns line up.
        longest_name_len = len(max([match_group["name"] for match_group in match_groups], key=len))
    # URLs found on the PREVIOUS run; used to mark new threads and count dead ones.
    found_threads_urls_prev = []
    url_cache_file_path = os.path.join(options["cache_dir_path"], "prev_urls")
    with contextlib.suppress(FileNotFoundError):
        for line in utils.read_lines_from_file(url_cache_file_path):
            found_threads_urls_prev.append(line.strip())
    all_found_threads_amount = 0
    new_found_threads_amount = 0
    colour = TerminalColour(options["colour_output"])
    board_cache_dir_path = os.path.join(options["cache_dir_path"], "boards")
    found_threads_urls = []
    date_start = datetime.datetime.now()
    for match_group, threads in core.generate_threads(
        options["chan"], match_groups, board_cache_dir_path, options["core_dl_sleep_time"]
    ):
        if options["output_oneline"]:
            oneline_match_name = "{:{}} ".format(match_group["name"], longest_name_len)
            print(" {}".format(oneline_match_name), end="")
        else:
            oneline_match_name = ""
            print(":: {}".format(match_group["name"]), end="")
        sys.stdout.flush()
        found_threads_amount = 0
        for thread in threads:
            log.debug("Thread {} matches keyword {}.".format(thread["url_short"], thread["matching_keyword"]))
            found_threads_amount += 1
            if options["no_duplicate_threads"]:
                if thread["url_short"] in found_threads_urls:
                    # Still record the duplicate so the total/unique stats
                    # and the cache file reflect every match, then skip
                    # printing it again.
                    found_threads_urls.append(thread["url_short"])
                    continue
            if not options["output_oneline"] and found_threads_amount == 1:
                print()
            else:
                utils.clear_terminal_line()
            found_threads_urls.append(thread["url_short"])
            thread_date = datetime.datetime.fromtimestamp(thread["timestamp"])
            # Output is truncated to the terminal width further below.
            term_len = utils.get_terminal_line_len()
            output_prefix = "   "
            if thread["url_short"] not in found_threads_urls_prev:
                # "!" marks threads not seen on the previous run.
                output_prefix = " {}!{} ".format(colour.get("IGreen"), colour.get("Reset"))
                if options["colour_output"]:
                    # The colour code len for the new thread indicator ("!").
                    term_len += 11
            output_page = thread["page"]
            if options["colour_output"]:
                # Page colour: green = front pages, yellow = middle, red = near pruning.
                if output_page <= 3:
                    page_colour = colour.get("IGreen")
                elif output_page <= 7:
                    page_colour = colour.get("IYellow")
                elif output_page >= 8:
                    page_colour = colour.get("IRed")
            else:
                page_colour = ""
            output = (
                "{prefix}{match_name}/{board:<3} {date} {replies:<3} "
                "{page_col}{page:<2}{reset} {url:<45} ".format(
                    board=thread["board"] + "/",
                    replies=thread["replies"],
                    page=output_page,
                    url=thread["url"],
                    page_col=page_colour,
                    date=utils.pretty_date_delta(thread_date),
                    reset=colour.get("Reset"),
                    prefix=output_prefix,
                    match_name=oneline_match_name,
                )
            )
            thread_subject = thread.get("subject", False)
            if thread_subject:
                # Force plain ASCII, strip markup, then undo HTML entities.
                thread_subject = thread_subject.encode("ascii", "replace").decode("ascii", "replace")
                thread_subject = utils.strip_html_tags(thread_subject)
                thread_subject = html.unescape(thread_subject)
                output += "sub: {}".format(thread_subject)
            thread_comment = thread.get("comment", False)
            if thread_comment:
                thread_comment = thread_comment.encode("ascii", "replace").decode("ascii", "replace")
                thread_comment = thread_comment.replace("<br>", " ")
                thread_comment = utils.strip_html_tags(thread_comment)
                thread_comment = html.unescape(thread_comment)
                if thread_subject:
                    output += " | "
                output += "com: {}".format(thread_comment)
            if options["colour_output"]:
                # The colour code len for the page number.
                term_len += 11
            print(output[:term_len])
            if thread["url_short"] not in found_threads_urls_prev:
                new_found_threads_amount += 1
            if match_group["urlsavelast"]:
                # Append the URL to a per-group, per-day log under urls_last/.
                urlsavelast_dir_path = os.path.join(
                    options["main_dir_path"],
                    "urls_last",
                    match_group["name"],
                    date_start.strftime("%Y"),
                    date_start.strftime("%Y-%m"),
                )
                with contextlib.suppress(FileExistsError):
                    os.makedirs(urlsavelast_dir_path)
                urlsavelast_file_path = os.path.join(urlsavelast_dir_path, date_start.strftime("%Y-%m-%d"))
                utils.append_data_to_file(thread["url"] + "\n", urlsavelast_file_path)
                log.info("Saved thread url {} to file {}".format(thread["url"], urlsavelast_file_path))
            if match_group["browser"]:
                try:
                    utils.open_in_web_browser(thread["url"])
                except utils.Error as err:
                    log.error("{}.".format(err))
        if found_threads_amount == 0:
            # Erase the group-name line printed above; nothing matched.
            utils.clear_terminal_line()
            continue
        else:
            all_found_threads_amount += found_threads_amount
    # Rewrite the cache with this run's URLs for the next run's comparison.
    with contextlib.suppress(FileNotFoundError):
        os.remove(url_cache_file_path)
    for url in found_threads_urls:
        utils.append_data_to_file(url + "\n", url_cache_file_path)
    # Threads that matched last run but no longer do.
    dead_threads_amount = 0
    for url in found_threads_urls_prev:
        if url not in found_threads_urls:
            dead_threads_amount += 1
    date_end = datetime.datetime.now()
    date_next_refresh = date_end + datetime.timedelta(seconds=options["refresh_time"])
    print(
        "\n"
        "{} thread{}, {} unique; {} new; "
        "{} that matched on the previous run but not now.\n"
        "\n"
        "Start time: {}.\n"
        "End time: {}.\n"
        "Next refresh: {}.".format(
            all_found_threads_amount,
            "" if all_found_threads_amount == 1 else "s",
            len(set(found_threads_urls)),
            "no" if new_found_threads_amount == 0 else new_found_threads_amount,
            dead_threads_amount,
            date_start.strftime("%Y-%m-%d %H:%M:%S"),
            date_end.strftime("%Y-%m-%d %H:%M:%S"),
            date_next_refresh.strftime("%Y-%m-%d %H:%M:%S"),
        ),
        end="",
    )
def format_for_adn(feed, entry):
    """Build an App.net post dict for a feed entry (ndb tasklet).

    Assembles post text from the entry title plus an optional summary
    trimmed to ~200 chars, resolves and optionally bitly-shortens the
    entry link, records link entities, and attaches thumbnail / oembed /
    language / author / tags annotations. Yields on the URL shortener and
    returns the post via ``raise ndb.Return(post)`` (GAE ndb generator
    convention).

    NOTE(review): the `preview` local is assigned but never used here —
    confirm whether url-shortening was meant to be skipped in preview mode.
    """
    post_text = entry.title
    links = []
    summary_text = ''
    if feed.include_summary:
        summary_text = strip_html_tags(entry.summary)
        # Greedily take whole sentences from the front until ~200 chars.
        # (pop() from a reversed list == take from the front.)
        sentances = list(splitter.split(summary_text))
        sentances.reverse()
        summary_text = sentances.pop()
        while len(summary_text) <= 200:
            try:
                next_sentance = sentances.pop()
            except IndexError:
                break
            if len(summary_text + ' ' + next_sentance) <= 200:
                summary_text += ' ' + next_sentance
        summary_text = ellipse_text(summary_text, 200)
    if entry.feed_item:
        link = get_link_for_item(feed, entry.feed_item)
    else:
        link = entry.link
    link = iri_to_uri(link)
    link = append_query_string(link, params={'utm_source': 'PourOver', 'utm_medium': 'App.net'})
    # If viewing feed from preview don't shorten urls
    preview = getattr(feed, 'preview', False)
    has_own_bitly_creds = feed.bitly_login and feed.bitly_api_key
    if has_own_bitly_creds or feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        if not has_own_bitly_creds:
            # SECURITY(review): hard-coded fallback bitly API credentials
            # committed in source — should come from config/secret storage.
            feed.bitly_login = '******'
            feed.bitly_api_key = 'R_a1311cd1785b7da2aedac9703656b0f1'
        short_url = yield get_short_url(entry, link, feed)
        if short_url:
            link = short_url
    # Starting out it should be as long as it can be
    max_chars = MAX_CHARS
    max_link_chars = 40
    ellipse_link_text = ellipse_text(link, max_link_chars)
    # If the link is to be included in the text we need to make sure we reserve enough space at the end
    if feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        max_chars -= len(' ' + ellipse_link_text)
    # Should be some room for a description
    if len(post_text) < (max_chars - 40) and summary_text:
        post_text = u'%s\n%s' % (post_text, summary_text)
    post_text = ellipse_text(post_text, max_chars)
    if feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        post_text += ' ' + ellipse_link_text
    if feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        links.insert(0, (link, ellipse_link_text))
    else:
        links.insert(0, (link, entry.title))
    # Locate each link's text inside the post to emit App.net link entities.
    link_entities = []
    index = 0
    for href, link_text in links:
        # logger.info('Link info: %s %s %s', post_text, link_text, index)
        text_index = post_text.find(link_text, index)
        if text_index > -1:
            link_entities.append({
                'url': href,
                'text': link_text,
                'pos': text_index,
                'len': len(link_text),
            })
            index = text_index
    post = {
        'text': post_text,
        'annotations': [cross_post_annotation(link)]
    }
    if link_entities:
        post['entities'] = {
            'links': link_entities,
        }
    # logger.info('Info %s, %s', include_thumb, self.thumbnail_image_url)
    if feed.include_thumb and entry.thumbnail_image_url:
        post['annotations'].append(image_annotation_for_entry(entry))
    if feed.include_video and entry.video_oembed:
        oembed = entry.video_oembed
        oembed['embeddable_url'] = entry.link
        post['annotations'].append({
            "type": "net.app.core.oembed",
            "value": oembed
        })
    lang = get_language(entry.language)
    if lang:
        post['annotations'].append({
            "type": "net.app.core.language",
            "value": {
                "language": lang,
            }
        })
    if entry.author:
        post['annotations'].append({
            "type": "net.app.pourover.item.author",
            "value": {
                "author": entry.author,
            }
        })
    if entry.tags:
        post['annotations'].append({
            "type": "net.app.pourover.item.tags",
            "value": {
                "tags": entry.tags,
            }
        })
    raise ndb.Return(post)
def format_for_adn(feed, entry):
    """Build an App.net post dict for a feed entry (ndb tasklet).

    Variant that delegates link building to ``format_link_for_entry`` and
    collapses the per-annotation blocks into ``common_annotations``.
    Yields on the URL shortener and returns the post via
    ``raise ndb.Return(post)`` (GAE ndb generator convention).
    """
    post_text = entry.title
    links = []
    summary_text = ''
    if feed.include_summary:
        summary_text = strip_html_tags(entry.summary)
        # Greedily take whole sentences from the front until ~200 chars.
        # (pop() from a reversed list == take from the front.)
        sentances = list(splitter.split(summary_text))
        sentances.reverse()
        summary_text = sentances.pop()
        while len(summary_text) <= 200:
            try:
                next_sentance = sentances.pop()
            except IndexError:
                break
            if len(summary_text + ' ' + next_sentance) <= 200:
                summary_text += ' ' + next_sentance
        summary_text = ellipse_text(summary_text, 200)
    link = format_link_for_entry(feed, entry)
    has_own_bitly_creds = feed.bitly_login and feed.bitly_api_key
    if has_own_bitly_creds or feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        if not has_own_bitly_creds:
            # SECURITY(review): hard-coded fallback bitly API credentials
            # committed in source — should come from config/secret storage.
            feed.bitly_login = '******'
            feed.bitly_api_key = 'R_a1311cd1785b7da2aedac9703656b0f1'
        short_url = yield get_short_url(entry, link, feed)
        if short_url:
            link = short_url
    # Starting out it should be as long as it can be
    max_chars = MAX_CHARS
    max_link_chars = 40
    ellipse_link_text = ellipse_text(link, max_link_chars)
    # If the link is to be included in the text we need to make sure we reserve enough space at the end
    if feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        max_chars -= len(' ' + ellipse_link_text)
    # Should be some room for a description
    if len(post_text) < (max_chars - 40) and summary_text:
        post_text = u'%s\n%s' % (post_text, summary_text)
    post_text = ellipse_text(post_text, max_chars)
    if feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        post_text += ' ' + ellipse_link_text
    if feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        links.insert(0, (link, ellipse_link_text))
    else:
        links.insert(0, (link, entry.title))
    # Locate each link's text inside the post to emit App.net link entities.
    link_entities = []
    index = 0
    for href, link_text in links:
        # logger.info('Link info: %s %s %s', post_text, link_text, index)
        text_index = post_text.find(link_text, index)
        if text_index > -1:
            link_entities.append({
                'url': href,
                'text': link_text,
                'pos': text_index,
                'len': len(link_text),
            })
            index = text_index
    post = {
        'text': post_text,
        'annotations': [cross_post_annotation(link)]
    }
    if link_entities:
        post['entities'] = {
            'links': link_entities,
        }
    # logger.info('Info %s, %s', include_thumb, self.thumbnail_image_url)
    if feed.include_thumb and entry.thumbnail_image_url:
        post['annotations'].append(image_annotation_for_entry(entry))
    if feed.include_video and entry.video_oembed:
        oembed = entry.video_oembed
        oembed['embeddable_url'] = entry.link
        post['annotations'].append({
            "type": "net.app.core.oembed",
            "value": oembed
        })
    post['annotations'] += common_annotations(entry)
    raise ndb.Return(post)
def sanitize(s):
    """Remove XML ampersand codes, HTML tags, and newlines from *s*.

    If tag-stripping yields an empty result, fall back to the
    ampcode-stripped text instead so something is always returned.
    """
    cleaned = utils.strip_xml_ampcodes(s)
    tagless = utils.strip_html_tags(cleaned)
    result = tagless if tagless else cleaned
    return result.replace('\n', '')