def create_article(title, content, owner_login, user_suggestion):
    """Create an article (or a draft suggestion) from Markdown content.

    Args:
        title: Article title; HTML-escaped before storage.
        content: Markdown source for the article body.
        owner_login: Login name of the user submitting the article.
        user_suggestion: If not None, the login name of the user the
            article is suggested to; that user becomes the owner and
            the article is stored as a draft.

    Returns:
        False if title/content exceed the length limits, True otherwise.
    """
    if len(content) > MAX_ARTICLE_CONTENT_LENGTH or len(title) > MAX_TITLE_LENGTH:
        return False
    user = User.get(User.name == owner_login)
    if user_suggestion is None:
        owner = user
    else:
        owner = User.get(User.name == user_suggestion)
    # NOTE: safe_mode is deprecated in recent Python-Markdown; kept here to
    # preserve the existing escaping behaviour.
    html_content = markdown.markdown(
        content, safe_mode='escape', extensions=[TocExtension(baselevel=3)])
    stripped_text = strip_html_tags(html_content)
    preview_text = stripped_text[:MAX_ARTICLE_PREVIEW_TEXT_LENGTH]
    if len(stripped_text) > MAX_ARTICLE_PREVIEW_TEXT_LENGTH:
        preview_text += '...'
    # Article.create() already persists the row; the original code also
    # called .save() on the result, issuing a redundant second write.
    Article.create(
        title=html.escape(title),
        content=html_content,
        preview_text=preview_text,
        owner=owner,
        is_draft=user_suggestion is not None)
    if user_suggestion is None:
        # Atomic server-side increment. The original read
        # user.articles_count into a local and wrote count+1 back, which
        # loses updates when two articles are created concurrently.
        User\
            .update({User.articles_count: User.articles_count + 1})\
            .where(User.id == owner)\
            .execute()
    return True
def getibiblio(word):
    """Fetch *word*'s definition from ibiblio's Webster gateway.

    The word is plus-encoded into the query string; the text between the
    first <def>...</def> pair is extracted, runs of spaces are collapsed,
    a trailing period is appended, and HTML tags are stripped.
    """
    query_url = ('http://www.ibiblio.org/webster/cgi-bin/headword_search.pl?query='
                 + word.replace(' ', '+'))
    page = urllib.request.urlopen(query_url).read().decode()
    definition = utils.find_between(page, '<def>', '</def>').strip()
    definition = re.sub(' +', ' ', definition) + '.'
    return utils.strip_html_tags(definition)
def parse_post(post):
    """Split a post into plain text, its URLs, and its image CDN URLs.

    Strips HTML tags from the raw content, removes every extracted URL
    from the text, and collects image 'cdnUrl' values when the post has
    an image entity list (empty list otherwise).

    Returns:
        (content, links, images) tuple.
    """
    content = strip_html_tags(post['the_post']['rawContent'])
    links = extract_urls(content)
    for link in links:
        content = content.replace(link, '')
    # Any missing key along the way (entities, images, or cdnUrl) means
    # "no images" rather than an error.
    try:
        images = [image['cdnUrl']
                  for image in post['the_post']['entities']['images']]
    except KeyError:
        images = []
    return content, links, images
def broadcast_format_for_adn(feed, entry):
    """Build an App.net Broadcast post dict for a feed entry.

    The post's text is taken from the entry's og/twitter meta description
    (HTML-stripped); when no description is available the post is marked
    machine_only. Annotations carry metadata, cross-post link, thumbnail,
    and the common entry annotations.

    NOTE(review): uses `unicode`, so this is Python 2 code.
    """
    #summary = clean_html(entry.summary)
    # Easy path for now, leave some space at the end for cleaning up of broken HTML
    #summary = ellipse_text(summary, 2000)
    # This will try and fix broken html
    #summary = html.tostring(html.fromstring(summary))
    link = format_link_for_entry(feed, entry, medium='App.net Broadcast')
    post = {
        'annotations': [metadata_annotation(entry), cross_post_annotation(link)]
    }
    post['annotations'] += common_annotations(entry)
    # Thumbnail annotation only when the feed opts in AND the entry has one.
    if feed.include_thumb and entry.thumbnail_image_url:
        post['annotations'].append(image_annotation_for_entry(entry))
    description = None
    if entry.meta_tags:
        # Prefer the Open Graph description, fall back to Twitter's.
        og_description = entry.meta_tags.get('og', {}).get('description')
        twitter_description = entry.meta_tags.get('twitter', {}).get('description')
        description = og_description or twitter_description
    if description:
        # logger.info("What are we striping html from: %s %s", description, type(description))
        description = unicode(description)
        description = strip_html_tags(description)
    # Stripping may leave an empty string, hence the second truthiness check.
    if description:
        post['text'] = description
    else:
        post['machine_only'] = True
    if feed.cross_post_to_defaults:
        post['publish_to'] = {'defaults': True}
    return post
def main(options, profile):
    """Run one scan: match threads, print a report, refresh the URL cache.

    Loads match groups from chtf-conf.yaml, iterates matching threads per
    group, prints one formatted line per thread (optionally colourised),
    saves/open URLs per group settings, and finally rewrites the
    previous-run URL cache and prints summary statistics.

    NOTE(review): `profile` is not used in this body — confirm whether it
    is consumed elsewhere or dead.
    """
    try:
        match_groups = parse_config.parse_file(os.path.join(options["main_dir_path"], "chtf-conf.yaml"))
    except parse_config.ConfigError as err:
        log.critical("{}.".format(err))
        utils.exit(1)
    if options["output_oneline"]:
        # Pad every group name to the longest one so columns line up.
        longest_name_len = len(max([match_group["name"] for match_group in match_groups], key=len))
    # URLs found on the PREVIOUS run; used to mark new threads and count dead ones.
    found_threads_urls_prev = []
    url_cache_file_path = os.path.join(options["cache_dir_path"], "prev_urls")
    with contextlib.suppress(FileNotFoundError):
        for line in utils.read_lines_from_file(url_cache_file_path):
            found_threads_urls_prev.append(line.strip())
    all_found_threads_amount = 0
    new_found_threads_amount = 0
    colour = TerminalColour(options["colour_output"])
    board_cache_dir_path = os.path.join(options["cache_dir_path"], "boards")
    found_threads_urls = []
    date_start = datetime.datetime.now()
    for match_group, threads in core.generate_threads(
        options["chan"], match_groups, board_cache_dir_path, options["core_dl_sleep_time"]
    ):
        if options["output_oneline"]:
            oneline_match_name = "{:{}} ".format(match_group["name"], longest_name_len)
            print(" {}".format(oneline_match_name), end="")
        else:
            oneline_match_name = ""
            print(":: {}".format(match_group["name"]), end="")
        sys.stdout.flush()
        found_threads_amount = 0
        for thread in threads:
            log.debug("Thread {} matches keyword {}.".format(thread["url_short"], thread["matching_keyword"]))
            found_threads_amount += 1
            if options["no_duplicate_threads"]:
                if thread["url_short"] in found_threads_urls:
                    # Still record the duplicate so the total/unique stats
                    # and the cache file reflect every match, then skip
                    # printing it again.
                    found_threads_urls.append(thread["url_short"])
                    continue
            if not options["output_oneline"] and found_threads_amount == 1:
                print()
            else:
                utils.clear_terminal_line()
            found_threads_urls.append(thread["url_short"])
            thread_date = datetime.datetime.fromtimestamp(thread["timestamp"])
            # Output is truncated to the terminal width further below.
            term_len = utils.get_terminal_line_len()
            output_prefix = "   "
            if thread["url_short"] not in found_threads_urls_prev:
                # "!" marks threads not seen on the previous run.
                output_prefix = " {}!{} ".format(colour.get("IGreen"), colour.get("Reset"))
                if options["colour_output"]:
                    # The colour code len for the new thread indicator ("!").
                    term_len += 11
            output_page = thread["page"]
            if options["colour_output"]:
                # Page colour: green = front pages, yellow = middle, red = near pruning.
                if output_page <= 3:
                    page_colour = colour.get("IGreen")
                elif output_page <= 7:
                    page_colour = colour.get("IYellow")
                elif output_page >= 8:
                    page_colour = colour.get("IRed")
            else:
                page_colour = ""
            output = (
                "{prefix}{match_name}/{board:<3} {date} {replies:<3} "
                "{page_col}{page:<2}{reset} {url:<45} ".format(
                    board=thread["board"] + "/",
                    replies=thread["replies"],
                    page=output_page,
                    url=thread["url"],
                    page_col=page_colour,
                    date=utils.pretty_date_delta(thread_date),
                    reset=colour.get("Reset"),
                    prefix=output_prefix,
                    match_name=oneline_match_name,
                )
            )
            thread_subject = thread.get("subject", False)
            if thread_subject:
                # Force plain ASCII, strip markup, then undo HTML entities.
                thread_subject = thread_subject.encode("ascii", "replace").decode("ascii", "replace")
                thread_subject = utils.strip_html_tags(thread_subject)
                thread_subject = html.unescape(thread_subject)
                output += "sub: {}".format(thread_subject)
            thread_comment = thread.get("comment", False)
            if thread_comment:
                thread_comment = thread_comment.encode("ascii", "replace").decode("ascii", "replace")
                thread_comment = thread_comment.replace("<br>", " ")
                thread_comment = utils.strip_html_tags(thread_comment)
                thread_comment = html.unescape(thread_comment)
                if thread_subject:
                    output += " | "
                output += "com: {}".format(thread_comment)
            if options["colour_output"]:
                # The colour code len for the page number.
                term_len += 11
            print(output[:term_len])
            if thread["url_short"] not in found_threads_urls_prev:
                new_found_threads_amount += 1
            if match_group["urlsavelast"]:
                # Append the URL to a per-group, per-day log under urls_last/.
                urlsavelast_dir_path = os.path.join(
                    options["main_dir_path"],
                    "urls_last",
                    match_group["name"],
                    date_start.strftime("%Y"),
                    date_start.strftime("%Y-%m"),
                )
                with contextlib.suppress(FileExistsError):
                    os.makedirs(urlsavelast_dir_path)
                urlsavelast_file_path = os.path.join(urlsavelast_dir_path, date_start.strftime("%Y-%m-%d"))
                utils.append_data_to_file(thread["url"] + "\n", urlsavelast_file_path)
                log.info("Saved thread url {} to file {}".format(thread["url"], urlsavelast_file_path))
            if match_group["browser"]:
                try:
                    utils.open_in_web_browser(thread["url"])
                except utils.Error as err:
                    log.error("{}.".format(err))
        if found_threads_amount == 0:
            # Erase the group-name line printed above; nothing matched.
            utils.clear_terminal_line()
            continue
        else:
            all_found_threads_amount += found_threads_amount
    # Rewrite the cache with this run's URLs for the next run's comparison.
    with contextlib.suppress(FileNotFoundError):
        os.remove(url_cache_file_path)
    for url in found_threads_urls:
        utils.append_data_to_file(url + "\n", url_cache_file_path)
    # Threads that matched last run but no longer do.
    dead_threads_amount = 0
    for url in found_threads_urls_prev:
        if url not in found_threads_urls:
            dead_threads_amount += 1
    date_end = datetime.datetime.now()
    date_next_refresh = date_end + datetime.timedelta(seconds=options["refresh_time"])
    print(
        "\n"
        "{} thread{}, {} unique; {} new; "
        "{} that matched on the previous run but not now.\n"
        "\n"
        "Start time: {}.\n"
        "End time: {}.\n"
        "Next refresh: {}.".format(
            all_found_threads_amount,
            "" if all_found_threads_amount == 1 else "s",
            len(set(found_threads_urls)),
            "no" if new_found_threads_amount == 0 else new_found_threads_amount,
            dead_threads_amount,
            date_start.strftime("%Y-%m-%d %H:%M:%S"),
            date_end.strftime("%Y-%m-%d %H:%M:%S"),
            date_next_refresh.strftime("%Y-%m-%d %H:%M:%S"),
        ),
        end="",
    )
def format_for_adn(feed, entry):
    """Build an App.net post dict for a feed entry (ndb tasklet).

    Assembles post text from the entry title plus an optional summary
    trimmed to ~200 chars, resolves and optionally bitly-shortens the
    entry link, records link entities, and attaches thumbnail / oembed /
    language / author / tags annotations. Yields on the URL shortener and
    returns the post via ``raise ndb.Return(post)`` (GAE ndb generator
    convention).

    NOTE(review): the `preview` local is assigned but never used here —
    confirm whether url-shortening was meant to be skipped in preview mode.
    """
    post_text = entry.title
    links = []
    summary_text = ''
    if feed.include_summary:
        summary_text = strip_html_tags(entry.summary)
        # Greedily take whole sentences from the front until ~200 chars.
        # (pop() from a reversed list == take from the front.)
        sentances = list(splitter.split(summary_text))
        sentances.reverse()
        summary_text = sentances.pop()
        while len(summary_text) <= 200:
            try:
                next_sentance = sentances.pop()
            except IndexError:
                break
            if len(summary_text + ' ' + next_sentance) <= 200:
                summary_text += ' ' + next_sentance
        summary_text = ellipse_text(summary_text, 200)
    if entry.feed_item:
        link = get_link_for_item(feed, entry.feed_item)
    else:
        link = entry.link
    link = iri_to_uri(link)
    link = append_query_string(link, params={'utm_source': 'PourOver', 'utm_medium': 'App.net'})
    # If viewing feed from preview don't shorten urls
    preview = getattr(feed, 'preview', False)
    has_own_bitly_creds = feed.bitly_login and feed.bitly_api_key
    if has_own_bitly_creds or feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        if not has_own_bitly_creds:
            # SECURITY(review): hard-coded fallback bitly API credentials
            # committed in source — should come from config/secret storage.
            feed.bitly_login = '******'
            feed.bitly_api_key = 'R_a1311cd1785b7da2aedac9703656b0f1'
        short_url = yield get_short_url(entry, link, feed)
        if short_url:
            link = short_url
    # Starting out it should be as long as it can be
    max_chars = MAX_CHARS
    max_link_chars = 40
    ellipse_link_text = ellipse_text(link, max_link_chars)
    # If the link is to be included in the text we need to make sure we reserve enough space at the end
    if feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        max_chars -= len(' ' + ellipse_link_text)
    # Should be some room for a description
    if len(post_text) < (max_chars - 40) and summary_text:
        post_text = u'%s\n%s' % (post_text, summary_text)
    post_text = ellipse_text(post_text, max_chars)
    if feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        post_text += ' ' + ellipse_link_text
    if feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        links.insert(0, (link, ellipse_link_text))
    else:
        links.insert(0, (link, entry.title))
    # Locate each link's text inside the post to emit App.net link entities.
    link_entities = []
    index = 0
    for href, link_text in links:
        # logger.info('Link info: %s %s %s', post_text, link_text, index)
        text_index = post_text.find(link_text, index)
        if text_index > -1:
            link_entities.append({
                'url': href,
                'text': link_text,
                'pos': text_index,
                'len': len(link_text),
            })
            index = text_index
    post = {
        'text': post_text,
        'annotations': [cross_post_annotation(link)]
    }
    if link_entities:
        post['entities'] = {
            'links': link_entities,
        }
    # logger.info('Info %s, %s', include_thumb, self.thumbnail_image_url)
    if feed.include_thumb and entry.thumbnail_image_url:
        post['annotations'].append(image_annotation_for_entry(entry))
    if feed.include_video and entry.video_oembed:
        oembed = entry.video_oembed
        oembed['embeddable_url'] = entry.link
        post['annotations'].append({
            "type": "net.app.core.oembed",
            "value": oembed
        })
    lang = get_language(entry.language)
    if lang:
        post['annotations'].append({
            "type": "net.app.core.language",
            "value": {
                "language": lang,
            }
        })
    if entry.author:
        post['annotations'].append({
            "type": "net.app.pourover.item.author",
            "value": {
                "author": entry.author,
            }
        })
    if entry.tags:
        post['annotations'].append({
            "type": "net.app.pourover.item.tags",
            "value": {
                "tags": entry.tags,
            }
        })
    raise ndb.Return(post)
def format_for_adn(feed, entry):
    """Build an App.net post dict for a feed entry (ndb tasklet).

    Variant that delegates link building to ``format_link_for_entry`` and
    collapses the per-annotation blocks into ``common_annotations``.
    Yields on the URL shortener and returns the post via
    ``raise ndb.Return(post)`` (GAE ndb generator convention).
    """
    post_text = entry.title
    links = []
    summary_text = ''
    if feed.include_summary:
        summary_text = strip_html_tags(entry.summary)
        # Greedily take whole sentences from the front until ~200 chars.
        # (pop() from a reversed list == take from the front.)
        sentances = list(splitter.split(summary_text))
        sentances.reverse()
        summary_text = sentances.pop()
        while len(summary_text) <= 200:
            try:
                next_sentance = sentances.pop()
            except IndexError:
                break
            if len(summary_text + ' ' + next_sentance) <= 200:
                summary_text += ' ' + next_sentance
        summary_text = ellipse_text(summary_text, 200)
    link = format_link_for_entry(feed, entry)
    has_own_bitly_creds = feed.bitly_login and feed.bitly_api_key
    if has_own_bitly_creds or feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        if not has_own_bitly_creds:
            # SECURITY(review): hard-coded fallback bitly API credentials
            # committed in source — should come from config/secret storage.
            feed.bitly_login = '******'
            feed.bitly_api_key = 'R_a1311cd1785b7da2aedac9703656b0f1'
        short_url = yield get_short_url(entry, link, feed)
        if short_url:
            link = short_url
    # Starting out it should be as long as it can be
    max_chars = MAX_CHARS
    max_link_chars = 40
    ellipse_link_text = ellipse_text(link, max_link_chars)
    # If the link is to be included in the text we need to make sure we reserve enough space at the end
    if feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        max_chars -= len(' ' + ellipse_link_text)
    # Should be some room for a description
    if len(post_text) < (max_chars - 40) and summary_text:
        post_text = u'%s\n%s' % (post_text, summary_text)
    post_text = ellipse_text(post_text, max_chars)
    if feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        post_text += ' ' + ellipse_link_text
    if feed.format_mode == FORMAT_MODE.TITLE_THEN_LINK:
        links.insert(0, (link, ellipse_link_text))
    else:
        links.insert(0, (link, entry.title))
    # Locate each link's text inside the post to emit App.net link entities.
    link_entities = []
    index = 0
    for href, link_text in links:
        # logger.info('Link info: %s %s %s', post_text, link_text, index)
        text_index = post_text.find(link_text, index)
        if text_index > -1:
            link_entities.append({
                'url': href,
                'text': link_text,
                'pos': text_index,
                'len': len(link_text),
            })
            index = text_index
    post = {
        'text': post_text,
        'annotations': [cross_post_annotation(link)]
    }
    if link_entities:
        post['entities'] = {
            'links': link_entities,
        }
    # logger.info('Info %s, %s', include_thumb, self.thumbnail_image_url)
    if feed.include_thumb and entry.thumbnail_image_url:
        post['annotations'].append(image_annotation_for_entry(entry))
    if feed.include_video and entry.video_oembed:
        oembed = entry.video_oembed
        oembed['embeddable_url'] = entry.link
        post['annotations'].append({
            "type": "net.app.core.oembed",
            "value": oembed
        })
    post['annotations'] += common_annotations(entry)
    raise ndb.Return(post)
def sanitize(s):
    """Remove XML ampersand codes, HTML tags, and newlines from *s*.

    If tag-stripping yields an empty result, fall back to the
    ampcode-stripped text instead so something is always returned.
    """
    cleaned = utils.strip_xml_ampcodes(s)
    tagless = utils.strip_html_tags(cleaned)
    result = tagless if tagless else cleaned
    return result.replace('\n', '')