def format_content(value):
    value = value.replace('</p>', '<br>')
    value = bleach.clean(value, allowed_tags, allowed_attributes,
                         allowed_styles, strip=True)
    soup = BeautifulSoup(value)
    tags = soup.find_all('img')
    for tag in tags:
        if tag is not None:
            if "style" in unicode(tag):
                width = re.findall(r'\d+', tag['style'])
                style = tag['style']
                if width:
                    if int(width[0]) > 280 or style == u'width: 100%;':
                        tag['style'] = 'width:100%;vertical-align:middle'
                    else:
                        tag['style'] = ('width:%spx;vertical-align:middle'
                                        % width[0])
    if soup.body:
        body = get_content(soup)
        remove_tags(unicode(body), "body")
        output = soup.new_tag("div")
        contents = body.contents[:]
        for content in contents:
            output.append(content)
        return unicode(output)
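# A minimal usage sketch for format_content (hedged: assumes the module-level
# bleach whitelists allowed_tags/allowed_attributes/allowed_styles keep img
# tags and their style attribute; the input HTML is illustrative, not from
# the source):
#
#   format_content(u'<p>text</p><img src="a.png" style="width: 300px;">')
#   # 300 > 280, so the img style is rewritten to
#   # 'width:100%;vertical-align:middle' and the body's children are
#   # returned wrapped in a <div>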
def getPreview(self, obj):
    pattern = re.compile('<preview .*?>(.*?)</preview>')
    text_preview = pattern.findall(obj.content)
    try:
        # findall() returns a list; use the first match (an IndexError falls
        # through to the truncated fallback below)
        return remove_tags(text_preview[0], 'preview')
    except:
        return '%s...' % remove_tags(obj.content[:150], 'preview')
def xt(cls, step_selector):
    title_selector = step_selector.xpath('//td[2]')[0]
    # we have wortmeldungen!
    if title_selector.xpath('//table'):
        table_selector = title_selector.xpath('//table')[0]
        raw_rows = [
            Selector(text=raw_row)
            for raw_row in table_selector.xpath('//tbody//tr').extract()
        ]
        statements = []
        # Extract statements data
        for index, row_selector in enumerate(raw_rows):
            try:
                person_source_link = row_selector.xpath(
                    cls.XP_P_LINK).extract()[0]
                person_name = row_selector.xpath(
                    cls.XP_P_NAME).extract()
                statement_type = _clean(
                    row_selector.xpath(cls.XP_T_TYPE).extract()[0])
                protocol_link = row_selector.xpath(
                    cls.XP_PROT_LINK).extract()
                protocol_text = _clean(
                    remove_tags(
                        row_selector.xpath(
                            cls.XP_PROT_TEXT).extract()[0],
                        'td a'))
                statements.append({
                    'index': index,
                    'person_source_link': person_source_link,
                    'person_name': person_name,
                    'statement_type': statement_type,
                    'protocol_link': protocol_link,
                    'protocol_text': protocol_text,
                })
            except:
                logger.error(
                    "Skipping statement '{}' due to extraction error"
                    .format(row_selector.extract()))
                continue
        title = {
            'text': u'Wortmeldungen in der Debatte',
            'statements': statements
        }
    else:
        text = _clean(
            remove_tags(
                step_selector.xpath(cls.XPATH).extract()[0],
                'td')).replace('<a href="',
                               '<a href="{}'.format(BASE_HOST))
        title = {'text': text}
    return title
def xt(cls, step_selector):
    title_selector = step_selector.xpath('//td[2]')[0]
    # we have wortmeldungen!
    if title_selector.xpath('//table'):
        table_selector = title_selector.xpath('//table')[0]
        raw_rows = [
            Selector(text=raw_row)
            for raw_row in table_selector.xpath('//tbody//tr').extract()]
        statements = []
        # Extract statements data
        for index, row_selector in enumerate(raw_rows):
            if row_selector.xpath(cls.XP_P_LINK).extract():
                person_source_link = row_selector.xpath(
                    cls.XP_P_LINK).extract()[0]
            else:
                continue
            person_name = row_selector.xpath(cls.XP_P_NAME).extract()
            if row_selector.xpath(cls.XP_T_TYPE).extract():
                statement_type = _clean(
                    row_selector.xpath(cls.XP_T_TYPE).extract()[0])
            else:
                continue
            protocol_link = row_selector.xpath(cls.XP_PROT_LINK).extract()
            if row_selector.xpath(cls.XP_PROT_TEXT).extract():
                protocol_text = _clean(
                    remove_tags(
                        row_selector.xpath(
                            cls.XP_PROT_TEXT).extract()[0],
                        'td a'))
            else:
                protocol_text = []
            statements.append({
                'index': index,
                'person_source_link': person_source_link,
                'person_name': person_name,
                'statement_type': statement_type,
                'protocol_link': protocol_link,
                'protocol_text': protocol_text,
            })
        title = {
            'text': u'Wortmeldungen in der Debatte',
            'statements': statements
        }
    else:
        text = _clean(
            remove_tags(
                step_selector.xpath(cls.XPATH).extract()[0],
                'td')).replace('<a href="',
                               '<a href="{}'.format(BASE_HOST))
        title = {'text': text}
    return title
def xt(cls, response):
    mandates_raw = response.xpath(cls.XPATH).extract()
    mandates = []
    for mandate in mandates_raw:
        mandate = _clean(remove_tags(mandate, 'li'))
        if "<div" in mandate and "</div>" in mandate:
            mandate = _clean(remove_tags(
                Selector(text=mandate).xpath("//div").extract()[0],
                'div'))
        function = mandate.split(u'<br>')[0].split(',')[0]
        party = mandate.split(u'<br>')[0].split(',')[1]
        # Start Date (pre-initialized so the error log below cannot hit an
        # unbound name if the split itself fails)
        start_date = None
        try:
            start_date = _clean(
                mandate.split('<br>')[1].split(u'\u2013')[0])
            start_date = datetime.datetime.strptime(
                start_date, "%d.%m.%Y").date()
        except:
            logger.error(
                u"Failed to parse mandate start date: {}".format(start_date))
            start_date = None
        # End Date
        end_date = None
        try:
            end_date = mandate.split('<br>')[1].split(u'\u2013')
            if len(end_date) > 1 and end_date[1]:
                end_date = datetime.datetime.strptime(
                    _clean(end_date[1]), "%d.%m.%Y").date()
            else:
                end_date = None
        except:
            logger.error(
                u"Failed to parse mandate end date: {}".format(end_date))
            end_date = None
        mandates.append({
            'function': function,
            'party': _clean(party),
            'start_date': start_date,
            'end_date': end_date,
        })
    return mandates
def xt(cls, response):
    persons = []
    raw_persons = response.xpath(cls.XPATH).extract()
    for raw_person in raw_persons:
        person = Selector(text=raw_person)
        if person.xpath('//th'):
            continue
        source_link = person.xpath('//td//a/@href').extract()[0]
        reversed_name = _clean(
            Selector(
                text=remove_tags(raw_person, 'img')
            ).xpath('//td//a/text()').extract()[0])
        (pres_start_date, pres_end_date) = cls.xt_pres_date(raw_person)
        mandate = {
            'title': u'RechnungshofpräsidentIn',
            'short': u'RH-PräsidentIn',
            'start_date': pres_start_date,
            'end_date': pres_end_date
        }
        persons.append({
            'source_link': source_link,
            'reversed_name': reversed_name,
            'mandate': mandate,
        })
    return persons
def bbs_pub(request):
    categories = Category.objects.all()
    hashkey = CaptchaStore.generate_key()
    image_url = captcha_image_url(hashkey)
    if request.method == 'POST':
        form = BbsPubForm(request.POST)
        if form.is_valid():
            cd = form.cleaned_data
            bbsBiz = BbsBiz()
            bbs_category = bbsBiz.getCategory(cd['bbs_category'])
            bbs_author = bbsBiz.getBbsAuthorByReq(request.user)
            if bbs_category and bbs_author:
                bbs_content = remove_tags(cd['bbs_content'],
                                          "html body script")
                BBS.objects.create(
                    bbs_title=cd['bbs_title'],
                    bbs_content=bbs_content,
                    view_count=0,
                    bbs_category=bbs_category,
                    bbs_author=bbs_author,
                )
                return HttpResponseRedirect(reverse('home'))
        return render_to_response("bbs_pub.html",
                                  {"form": form,
                                   "categories": categories,
                                   "hashkey": hashkey,
                                   "image_url": image_url},
                                  context_instance=RequestContext(request))
    form = BbsPubForm()
    return render_to_response("bbs_pub.html",
                              {"form": form,
                               "categories": categories,
                               "hashkey": hashkey,
                               "image_url": image_url},
                              context_instance=RequestContext(request))
def xt(cls, response):
    description = response.xpath(cls.XPATH).extract()
    if description:
        description = description[0]
    else:
        description = u""
    return remove_tags(description, 'p')
def cockpit_page_result_list(cl):
    """
    Displays the headers and data list together.

    Replaces the admin template tag "result_list". Constructs the page list
    according to the hierarchical structure.
    """
    headers = list(result_headers(cl))
    num_sorted_fields = 0
    for h in headers:
        if h['sortable'] and h['sorted']:
            num_sorted_fields += 1
    page_results = cl.result_list
    ordered_results = create_ordered_page_list(page_results)
    cl.result_list = ordered_results['ordered_list']
    hierarchy_levels = ordered_results['hierarchy_levels']
    list_results = list(results(cl))
    # Hierarchical indentation
    for i, result in enumerate(list_results):
        result[1] = remove_tags(result[1], "th")
        result[1] = mark_safe(u"<th style='padding-left: %dpx;'>%s</th>"
                              % (5 + hierarchy_levels[i] * 20, result[1]))
    return {'cl': cl,
            'result_hidden_fields': list(result_hidden_fields(cl)),
            'result_headers': headers,
            'num_sorted_fields': num_sorted_fields,
            'results': list_results}
def xt(cls, response):
    try:
        # .extract() on the selector list returns a list of strings; take
        # the first (indexing the extracted string itself would return a
        # single character)
        description = response.xpath(cls.XPATH).extract()[0]
    except:
        import ipdb
        ipdb.set_trace()
    return remove_tags(description, 'p')
def xt(cls, response):
    description = response.xpath(cls.XPATH).extract()
    if description:
        description = description[0]
    else:
        description = u""
    description_nowhitespace = re.sub(r'\s+', ' ', description)
    return remove_tags(description_nowhitespace, 'p').strip()
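# Illustrative whitespace collapse used above (hypothetical input):
#
#   re.sub(r'\s+', ' ', u'Text\n  with\tbreaks')  # -> u'Text with breaks'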
def xt(cls, response):
    persons = []
    raw_persons = response.xpath(cls.XPATH).extract()
    for raw_person in raw_persons:
        person = Selector(text=raw_person)
        if person.xpath('//th'):
            continue
        source_link = person.xpath('//td//a/@href').extract()[0]
        reversed_name = _clean(
            Selector(
                text=remove_tags(raw_person, 'img')
            ).xpath('//td//a/text()').extract()[0])
        if ' siehe ' in reversed_name:
            reversed_name = reversed_name.split(' siehe ')[1]
        admin_title = person.xpath('//td[1]/span/text()').extract()
        (admin_start_date, admin_end_date) = cls.xt_admin_date(raw_person)
        administration = {
            'title': admin_title,
            'start_date': admin_start_date,
            'end_date': admin_end_date
        }
        # TODO EXTRACT DATE(S) FROM BUNDESMINISTERIUM td
        # TODO ADD EITHER DATE(S) TO FUNCTION
        try:
            if person.xpath('//tr//td[3]/span/text()'):
                function_short = person.xpath(
                    '//td[3]/span/text()').extract()[0]
                function_title = person.xpath(
                    '//td[3]/span/@title').extract()[0]
            elif person.xpath('//tr//td[3]/text()'):
                function_short = _clean(person.xpath(
                    '//td[3]/text()').extract()[0])
                function_title = ''
        except:
            import ipdb
            ipdb.set_trace()
        mandate = {
            'short': function_short,
            'title': function_title,
            'administration': administration}
        persons.append({
            'source_link': source_link,
            'reversed_name': reversed_name,
            'mandate': mandate,
        })
    return persons
def remove_tags_and_comments():
    global s
    htmlcomments = re.compile(
        '\<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)\>')
    #print 'BEFORE:', s
    s = remove_tags(
        s,
        'table thead tfoot tbody td tr th font '
        'TABLE THEAD TFOOT TBODY TD TR TH FONT '
        'center CENTER EM em span SPAN')  # p br div P BR DIV
    s = htmlcomments.sub('', s)
    s = s.strip('\r\n\t')
    s = s.replace('&nbsp;', '')  # drop non-breaking-space entities
    #s, errs = tidy_fragment(s, options={'indent': 1, 'wrap': 120, 'merge-divs': 'yes'})
    #if trace: print errs  # errs only exists if the tidy_fragment call is restored
    return True
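# Quick check of the comment-stripping regex above (hypothetical input):
#
#   htmlcomments.sub('', 'a <!-- hidden note --> b')  # -> 'a  b'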
def get(self, request, slot):
    check_schedule_view(request)
    try:
        slot_id = int(slot)
        slot = get_object_or_404(Slot, pk=slot_id)
        if slot.content_ptr.slug:
            return redirect(slot.get_absolute_url(), permanent=True)
    except ValueError:
        slot = get_object_or_404(Slot, content_ptr__slug=slot)
    data = {
        "slot": slot,
        "biography": mark_safe(
            remove_tags(slot.content.speaker.biography.rendered, 'script'))
    }
    return render(request, self.template_name, data)
def format_option(value):
    value = value.replace('</p>', '<br>')
    value = bleach.clean(value, allowed_tags, allowed_attributes,
                         allowed_styles, strip=True)
    soup = BeautifulSoup(value)
    tags = soup.find_all('img')
    if tags:
        for tag in tags:
            tag['style'] = 'vertical-align:middle'
    if soup.body:
        body = get_content(soup)
        output = remove_tags(unicode(body), "body")
        return unicode(output)
def replays(request, bbs_id):
    if request.is_ajax():
        form = ReplayForm(request.POST)
        content = request.POST.get('content', None)
        # Filter out unsafe tags
        content = remove_tags(content, "script html body")
        if form.is_valid() and content and bbs_id:
            Comments.objects.create(
                user_id=BBS_user.objects.get(user__username=request.user),
                bbs_id=BBS.objects.get(id=bbs_id),
                pub_date=datetime.datetime.now(),
                cmt_content=content
            )
            cur_user = BBS_user.objects.get(user__username=request.user)
            # Pass the user's avatar URL along for the AJAX response
            avatar = unicode(cur_user.avatar)
            return HttpResponse(json.dumps({
                "content": content,
                "avatar": avatar,
                "signature": cur_user.signature
            }))
        return None
    return HttpResponseRedirect(reverse("home"))
def removetags(value, tags):
    """Removes a space separated list of [X]HTML tags from the output."""
    from django.utils.html import remove_tags
    return remove_tags(value, tags)
def removetags(value, tags):
    """Removes a space separated list of [X]HTML tags from the output."""
    return remove_tags(value, tags)
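# Template-side usage of the removetags filter, per Django's documented
# behavior (removetags/remove_tags were deprecated in Django 1.8 and removed
# in 1.10):
#
#   {{ value|removetags:"b span" }}
#   # '<b>Joel</b> <button>is</button> a <span>slug</span>'
#   # renders as 'Joel <button>is</button> a slug'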
def __unicode__(self):
    try:
        return remove_tags(self.title, 'a')
    except:
        return self.title
def remove_p_and_br(value):
    return remove_tags(value, 'p br')
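# Example (hypothetical input): Django's remove_tags strips only the tag
# markup and keeps the inner text, so
#
#   remove_p_and_br(u'<p>Hello<br> world</p>')  # -> u'Hello world'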
def createJsonValues(request, secret_key):
    now = datetime.datetime.now()
    today_dt = now.strftime("%d%m%Y")
    mystring = 'businessworld'
    new_string = today_dt + mystring
    #return HttpResponse(new_string)
    hash_object = hashlib.md5(new_string.encode())
    my_secret_key = hash_object.hexdigest()
    if my_secret_key == secret_key:
        recent_articles = Articles.objects.raw(
            "SELECT A.*, AU.*, AI.image_url, AI.photopath, AV.video_embed_code FROM articles A LEFT JOIN article_author A_A ON A.article_id = A_A.article_id LEFT JOIN author AU ON A_A.author_id = AU.author_id JOIN article_images AI ON AI.article_id = A.article_id LEFT JOIN article_video AV ON A.article_id = AV.article_id WHERE A.display_to_homepage = '1' AND AI.photopath !='' GROUP BY A.article_id ORDER BY A.article_published_date DESC LIMIT 8")
        recent_articles_json = []
        for article in recent_articles:
            art_elem = {}
            art_elem['article_title'] = article.article_title
            art_elem['article_published_date'] = str(article.article_published_date)
            art_elem['article_id'] = article.article_id
            art_elem['photopath'] = article.photopath
            art_elem['video_type'] = article.video_type
            art_elem['important_article'] = article.important_article
            art_elem['absolute_url'] = article.get_absolute_url()
            recent_articles_json.append(art_elem)
        bwtv_articles = VideoMaster.objects.raw(
            "SELECT * FROM video_master ORDER BY video_id DESC LIMIT 6")
        bwtv_articles_json = []
        for article in bwtv_articles:
            art_elem = {}
            art_elem['video_title'] = article.video_title
            art_elem['video_thumb_name'] = article.video_thumb_name
            art_elem['absolute_url'] = article.get_absolute_url()
            bwtv_articles_json.append(art_elem)
        recent_important_article = Articles.objects.raw(
            "SELECT A.*, AU.*, AI.image_url, AI.photopath FROM articles A LEFT JOIN article_author A_A ON A.article_id = A_A.article_id LEFT JOIN author AU ON A_A.author_id = AU.author_id LEFT JOIN article_images AI ON A.article_id = AI.article_id WHERE A.display_to_homepage = '1' AND A.important_article = '1' GROUP BY A.article_id ORDER BY A.article_published_date DESC LIMIT 1")
        recent_important_article_json = []
        for article in recent_important_article:
            art_elem = {}
            art_elem['article_title'] = article.article_title
            art_elem['article_published_date'] = str(article.article_published_date)
            art_elem['article_id'] = article.article_id
            art_elem['photopath'] = article.photopath
            art_elem['video_type'] = article.video_type
            art_elem['article_summary'] = article.article_summary
            art_elem['author_name'] = article.author_name
            author_url = article.get_article_author_url()
            art_elem['author_url'] = author_url
            art_elem['absolute_url'] = article.get_absolute_url()
            recent_important_article_json.append(art_elem)
        recent_exclusive_article = Articles.objects.raw(
            "SELECT A.*, AI.image_url, AI.photopath FROM articles A LEFT JOIN article_images AI ON A.article_id = AI.article_id WHERE A.is_exclusive = '1' GROUP BY A.article_id ORDER BY A.article_published_date DESC LIMIT 4")
        recent_exclusive_article_json = []
        for article in recent_exclusive_article:
            art_elem = {}
            art_elem['article_title'] = article.article_title
            art_elem['article_published_date'] = str(article.article_published_date)
            art_elem['article_id'] = article.article_id
            art_elem['photopath'] = article.photopath
            art_elem['video_type'] = article.video_type
            art_elem['article_summary'] = article.article_summary
            author_url = article.get_article_author_url()
            art_elem['author_url'] = author_url
            art_elem['absolute_url'] = article.get_absolute_url()
            recent_exclusive_article_json.append(art_elem)
        column_articles = Articles.objects.raw(
            "SELECT A.*, AU.*, AI.image_url, AI.photopath FROM articles A LEFT JOIN article_author A_A ON A.article_id = A_A.article_id LEFT JOIN author AU ON A_A.author_id = AU.author_id LEFT JOIN article_images AI ON A.article_id = AI.article_id WHERE A_A.author_type='4' GROUP BY A.article_id ORDER BY A.article_published_date DESC LIMIT 4")
        column_articles_json = []
        for article in column_articles:
            art_elem = {}
            art_elem['article_title'] = article.article_title
            art_elem['article_published_date'] = str(article.article_published_date)
            art_elem['article_id'] = article.article_id
            art_elem['photopath'] = article.photopath
            art_elem['video_type'] = article.video_type
            art_elem['article_summary'] = article.article_summary
            art_elem['author_name'] = article.author_name
            author_url = article.get_article_author_url()
            art_elem['author_url'] = author_url
            art_elem['absolute_url'] = article.get_absolute_url()
            column_articles_json.append(art_elem)
        columnist = Author.objects.raw(
            "SELECT * FROM (SELECT AU.*, AR.article_published_date FROM author AU INNER JOIN article_author ARU ON AU.author_id = ARU.author_id INNER JOIN articles AR ON ARU.article_id = AR.article_id WHERE AU.author_type='4' ORDER BY AR.article_published_date DESC) AS tem GROUP BY tem.author_id ORDER BY article_published_date DESC LIMIT 9")
        columnist_json = []
        for article in columnist:
            art_elem = {}
            art_elem['author_photo'] = article.author_photo
            art_elem['author_name'] = article.author_name
            art_elem['absolute_url'] = article.get_absolute_url()
            columnist_json.append(art_elem)
        sidebar_recent_articles = Articles.objects.raw(
            "SELECT A.*, AU.*, AI.image_url, AI.photopath, AV.video_embed_code FROM articles A LEFT JOIN article_author A_A ON A.article_id = A_A.article_id LEFT JOIN author AU ON A_A.author_id = AU.author_id JOIN article_images AI ON AI.article_id = A.article_id LEFT JOIN article_video AV ON A.article_id = AV.article_id WHERE A.display_to_homepage = '1' AND AI.photopath !='' GROUP BY A.article_id ORDER BY A.article_published_date DESC LIMIT 10")
        sidebar_recent_articles_json = []
        for article in sidebar_recent_articles:
            art_elem = {}
            art_elem['article_title'] = article.article_title
            art_elem['article_published_date'] = str(article.article_published_date)
            art_elem['article_id'] = article.article_id
            art_elem['photopath'] = article.photopath
            art_elem['absolute_url'] = article.get_absolute_url()
            sidebar_recent_articles_json.append(art_elem)
        recent_articles_interview = Articles.objects.raw(
            "SELECT A.*, AU.*, AI.image_url, AI.photopath FROM articles A LEFT JOIN article_author A_A ON A.article_id = A_A.article_id LEFT JOIN author AU ON A_A.author_id = AU.author_id LEFT JOIN article_images AI ON A.article_id = AI.article_id WHERE A.display_to_homepage = '1' AND A.article_type = 3 AND AI.photopath !='' GROUP BY A.article_id ORDER BY A.article_published_date DESC LIMIT 1, 6")
        recent_articles_interview_json = []
        for article in recent_articles_interview:
            art_elem = {}
            art_elem['article_title'] = article.article_title
            art_elem['article_published_date'] = str(article.article_published_date)
            art_elem['article_id'] = article.article_id
            art_elem['photopath'] = article.photopath
            art_elem['video_type'] = article.video_type
            art_elem['absolute_url'] = article.get_absolute_url()
            recent_articles_interview_json.append(art_elem)
        recent_important_article_interview = Articles.objects.raw(
            "SELECT A.*, AU.*, AI.image_url, AI.photopath FROM articles A LEFT JOIN article_author A_A ON A.article_id = A_A.article_id LEFT JOIN author AU ON A_A.author_id = AU.author_id LEFT JOIN article_images AI ON A.article_id = AI.article_id WHERE A.display_to_homepage = '1' AND A.article_type = 3 AND AI.photopath !='' GROUP BY A.article_id ORDER BY A.article_published_date DESC LIMIT 1")
        recent_important_article_interview_json = []
        for article in recent_important_article_interview:
            art_elem = {}
            art_elem['article_title'] = article.article_title
            art_elem['article_published_date'] = str(article.article_published_date)
            art_elem['article_id'] = article.article_id
            art_elem['photopath'] = article.photopath
            art_elem['video_type'] = article.video_type
            art_elem['article_summary'] = article.article_summary
            art_elem['author_name'] = article.author_name
            author_url = article.get_article_author_url()
            art_elem['author_url'] = author_url
            art_elem['absolute_url'] = article.get_absolute_url()
            recent_important_article_interview_json.append(art_elem)
        category_jumlist = ChannelCategory.objects.filter(category_parent='0')
        category_jumlist_json = []
        for article in category_jumlist:
            art_elem = {}
            art_elem['category_name'] = article.category_name
            art_elem['absolute_url'] = article.category_self_url()
            category_jumlist_json.append(art_elem)
        author_list_on_home_page = Author.objects.raw(
            "SELECT * FROM (SELECT AU.*, AR.article_published_date, NWS.newsletter_counts FROM author AU INNER JOIN article_author ARU ON AU.author_id = ARU.author_id INNER JOIN articles AR ON ARU.article_id = AR.article_id LEFT JOIN (SELECT author_newsletter_type_id, COUNT(author_newsletter_type_id) AS newsletter_counts FROM author_newsletter_Subscriber GROUP BY author_newsletter_type_id)NWS ON AU.author_id = NWS.author_newsletter_type_id WHERE AU.author_type='4' OR AU.author_type='3' ORDER BY AR.article_published_date DESC) AS tem GROUP BY tem.author_id ORDER BY article_published_date DESC LIMIT 6")
        author_list_on_home_page_json = []
        for article in author_list_on_home_page:
            art_elem = {}
            art_elem['author_name'] = article.author_name
            art_elem['newsletter_counts'] = str(article.newsletter_counts)
            art_elem['author_photo'] = article.author_photo
            art_elem['absolute_url'] = article.get_absolute_url()
            author_list_on_home_page_json.append(art_elem)
        bwtv_articles = Articles.objects.raw(
            "SELECT A.*, AC.*, AI.image_url, AI.photopath FROM articles A LEFT JOIN article_category AC ON A.article_id = AC.article_id LEFT JOIN article_images AI ON A.article_id = AI.article_id WHERE AC.category_id = '156' ORDER BY A.article_published_date DESC LIMIT 10")
        bwtv_articles_json = []
        for article in bwtv_articles:
            art_elem = {}
            art_elem['video_title'] = article.video_title
            art_elem['article_published_date'] = str(article.article_published_date)
            art_elem['video_thumb_name'] = article.video_thumb_name
            art_elem['absolute_url'] = article.get_absolute_url()
            bwtv_articles_json.append(art_elem)
        magazine_image = Magazine.objects.raw(
            "SELECT *, YEAR(publish_date_m) AS years FROM magazine ORDER BY publish_date_m DESC LIMIT 1")
        magazine_image_json = []
        for article in magazine_image:
            art_elem = {}
            art_elem['description'] = article.description
            art_elem['imagepath'] = article.imagepath
            art_elem['story1_url'] = article.story1_url
            art_elem['story1_title'] = article.story1_title
            art_elem['story2_url'] = article.story2_url
            art_elem['story2_title'] = article.story2_title
            art_elem['story3_url'] = article.story3_url
            art_elem['story3_title'] = article.story3_title
            art_elem['story4_url'] = article.story4_url
            art_elem['story4_title'] = article.story4_title
            art_elem['story5_url'] = article.story5_url
            art_elem['story5_title'] = article.story5_title
            art_elem['flipbook_url'] = article.flipbook_url
            art_elem['absolute_url'] = article.get_absolute_url()
            art_elem['years'] = article.years
            magazine_image_json.append(art_elem)
        photoshoot_listing = PhotoShoot.objects.raw(
            "SELECT count(*) AS counts, ps.*, psp.photo_shoot_photo_url, psp.photo_shoot_image_id, psp.photo_shoot_photo_name FROM photo_shoot_photos psp JOIN photo_shoot ps ON psp.photo_shoot_id = ps.photo_shoot_id GROUP BY psp.photo_shoot_id ORDER BY ps.photo_shoot_id DESC LIMIT 0,5")
        photoshoot_listing_json = []
        for article in photoshoot_listing:
            art_elem = {}
            art_elem['photo_shoot_title'] = article.photo_shoot_title
            art_elem['photo_shoot_photo_name'] = article.photo_shoot_photo_name
            art_elem['counts'] = article.counts
            art_elem['absolute_url'] = article.get_absolute_url()
            photoshoot_listing_json.append(art_elem)
        # Rest of the site Sidebar data
        client = storage.Client()
        bucket = client.get_bucket('bwmedia')
        blob_homepage = bucket.get_blob('json-files/bwdiff/homepage_site_data.json')
        #print(blob_homepage.download_as_string())
        blob_homepage.upload_from_string(json.dumps({
            'recent_articles': recent_articles_json,
            'sidebar_recent_articles': sidebar_recent_articles_json,
            'bwtv_articles': bwtv_articles_json,
            'recent_important_article': recent_important_article_json,
            'column_articles': column_articles_json,
            'columnist': columnist_json,
            'recent_articles_interview': recent_articles_interview_json,
            'recent_important_article_interview': recent_important_article_interview_json,
            'category_jumlist': category_jumlist_json,
            'author_list_on_home_page': author_list_on_home_page_json,
            'bw_corporate_movement': bw_corporate_movement_json,
            'magazine_image': magazine_image_json,
            'featured_boxs': featured_boxs_json,
            'recent_exclusive_article': recent_exclusive_article_json,
            'photoshoot_listing': photoshoot_listing_json,
        }))
        blob_sidebar = bucket.get_blob('json-files/bw-bwdiff/sidebar_site_data.json')
        blob_sidebar.upload_from_string(json.dumps({
            'sidebar_recent_articles': sidebar_recent_articles_json[:6],
            'bwtv_articles': bwtv_articles_json[:6],
            'category_jumlist': category_jumlist_json,
        }))
        feeds_bwcio = feedparser.parse(
            'http://bwcio.businessworld.in/rss/all-article.xml')
        feeds_bws = feedparser.parse(
            'http://bwsmartcities.businessworld.in/rss/channel-feed-articles.xml')
        feeds_bwh = feedparser.parse(
            'http://businessworld.in/rss/latest-article.xml')
        feeds_bwd = feedparser.parse(
            'http://bwdisrupt.businessworld.in/rss/channel-feed-articles.xml')
        feeds_ever = feedparser.parse(
            'http://everythingexperiential.businessworld.in/rss/channel-feed-articles.xml')
        feeds_bwwh = feedparser.parse(
            'http://bwwealth.businessworld.in/rss/all-article.xml')
        feeds_bma = feedparser.parse('http://www.digitalmarket.asia/feed/')
        feeds_bwcio_json = []
        feeds_bws_json = []
        feeds_bwh_json = []
        feeds_bwd_json = []
        feeds_ever_json = []
        feeds_bwwh_json = []
        feeds_bma_json = []
        for entry in feeds_bwcio.entries:
            #return HttpResponse(entry.link)
            json_feed = {}
            json_feed['link'] = entry.link
            json_feed['title'] = entry.title
            feeds_bwcio_json.append(json_feed)
        for entry in feeds_bws.entries:
            json_feed = {}
            json_feed['link'] = entry.link
            json_feed['title'] = entry.title
            feeds_bws_json.append(json_feed)
        for entry in feeds_bwh.entries:
            json_feed = {}
            json_feed['link'] = entry.link
            json_feed['title'] = entry.title
            feeds_bwh_json.append(json_feed)
        for entry in feeds_bwd.entries:
            json_feed = {}
            json_feed['link'] = entry.link
            json_feed['title'] = entry.title
            feeds_bwd_json.append(json_feed)
        for entry in feeds_ever.entries:
            json_feed = {}
            json_feed['link'] = entry.link
            json_feed['title'] = entry.title
            feeds_ever_json.append(json_feed)
        for entry in feeds_bwwh.entries:
            json_feed = {}
            json_feed['link'] = entry.link
            json_feed['title'] = entry.title
            feeds_bwwh_json.append(json_feed)
        for entry in feeds_bma.entries:
            json_feed = {}
            json_feed['link'] = entry.link
            json_feed['title'] = entry.title
            feeds_bma_json.append(json_feed)
        blob_footer_community = bucket.get_blob(
            'json-files/bw-bwdiff/footer_community_site_data.json')
        blob_footer_community.upload_from_string(json.dumps({
            'feeds_bwcio': feeds_bwcio_json,
            'feeds_bws': feeds_bws_json,
            'feeds_bwh': feeds_bwh_json,
            'feeds_bwd': feeds_bwd_json,
            'feeds_ever': feeds_ever_json,
            'feeds_bwwh': feeds_bwwh_json,
            'feeds_bma': feeds_bma_json,
            'recent_exclusive_article': recent_exclusive_article_json,
            'column_articles': column_articles_json,
        }))
        # Dow Jones XML generation
        #urlset = ET.Element('xml', version="1.0", encoding="UTF-8")
        nodes = ET.Element('nodes')
        html_parser = HTMLParser.HTMLParser()
        # 50 article list without BW Online for yahoo and dow jones
        article_list = Articles.objects.raw(
            "SELECT A.* FROM articles A LEFT JOIN article_author A_A ON A.article_id = A_A.article_id WHERE A_A.author_type != 1 GROUP BY A.article_id ORDER BY A.article_published_date DESC LIMIT 50")
        for article in article_list:
            node = ET.SubElement(nodes, "node")
            ET.SubElement(node, "link").text = (
                'http://bwhotelier.businessworld.in/'
                + article.get_absolute_url())
            ET.SubElement(node, "title").text = html_parser.unescape(
                article.article_title)
            categories = article.get_article_category_listing()
            if len(list(categories)) > 0:
                categories_list = ET.SubElement(node, "categories")
                for category in categories:
                    ET.SubElement(categories_list, "category").text = category
            ET.SubElement(node, "description").text = html_parser.unescape(
                article.article_summary)
            ET.SubElement(node, "author").text = html_parser.unescape(
                article.get_article_author_name())
            article_desc = remove_tags(article.article_description, "p")
            ET.SubElement(node, "body").text = article_desc
            ET.SubElement(node, "post-date").text = str(
                article.article_published_date.strftime(
                    '%a, %Y %b %d %H:%M:%S %Z'))
        tree = ET.tostring(nodes)
        # unescape the entity-encoded angle brackets so the embedded HTML
        # body survives in the serialized XML
        tree = re.sub(r'&lt;', '<', tree)
        tree = re.sub(r'&gt;', '>', tree)
        out = open('static/xml/dow_jones_article.xml', 'w+')
        out.write(tree)
        out.close()
        return HttpResponse(json.dumps({'result': 'completed'}))
    else:
        return HttpResponseRedirect("/")
"""Default variable filters."""
def xt(cls, response):
    status = remove_tags(
        response.xpath(cls.XPATH).extract()[0], 'em img p')
    status = status.replace('Status: ', '')
    return status
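# Hypothetical input for the status extractor above:
#
#   remove_tags(u'<p><em>Status: </em>offen</p>', 'em img p')
#   # -> u'Status: offen'; the replace() then yields u'offen'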
def xt(cls, response):
    mandates_raw = response.xpath(cls.XPATH).extract()
    mandates = []
    for mandate in mandates_raw:
        mandate = _clean(remove_tags(mandate, 'li'))
        if "<div" in mandate and "</div>" in mandate:
            mandate = _clean(remove_tags(
                Selector(text=mandate).xpath("//div").extract()[0],
                'div'))
        function = mandate.split(u'<br>')[0].split(',')[0]
        party = (mandate.split(u'<br>')[0].split(',')[1]
                 if ',' in mandate.split(u'<br />')[0] else '')
        llp_raw = re.match('^.*\((.*)\. GP\).*$', function)
        function = re.sub('\((.*)\. GP\)', '', function).strip()
        m_llp_roman_begin = m_llp_roman_end = \
            llp_raw.group(1) if llp_raw else ''
        if u'–' in m_llp_roman_begin:
            m_llp_roman_begin, m_llp_roman_end = \
                m_llp_roman_begin.split(u'–')
        for llp in (range(roman.fromRoman(m_llp_roman_begin.strip('. ')),
                          roman.fromRoman(m_llp_roman_end.strip('. ')) + 1)
                    if m_llp_roman_begin else [None]):
            llp_roman = roman.toRoman(llp) if llp else None
            # Start Date (pre-initialized so the error log cannot hit an
            # unbound name if the split itself fails)
            start_date = None
            try:
                start_date = _clean(
                    mandate.split('<br>')[1].split(u'\u2013')[0])
                start_date = datetime.datetime.strptime(
                    start_date, "%d.%m.%Y").date()
            except:
                logger.error(
                    u"Failed to parse mandate start date: {}".format(
                        start_date))
                start_date = None
            # End Date
            end_date = None
            try:
                end_date = mandate.split('<br>')[1].split(u'\u2013')
                if len(end_date) > 1 and end_date[1]:
                    end_date = datetime.datetime.strptime(
                        _clean(end_date[1]), "%d.%m.%Y").date()
                else:
                    end_date = None
            except:
                logger.error(
                    u"Failed to parse mandate end date: {}".format(end_date))
                end_date = None
            mandates.append({
                'function': function,
                'party': _clean(party),
                'start_date': start_date,
                'end_date': end_date,
                'llp': llp,
                'llp_roman': llp_roman,
            })
    return mandates
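# Note on the llp expansion above: the roman package converts legislative-
# period numerals both ways, e.g. roman.fromRoman('XXIV') == 24 and
# roman.toRoman(24) == 'XXIV', so a raw range like u'XX–XXII' expands to
# the llp values 20, 21 and 22.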
def stripy(node):
    cleaner(node)
    return remove_tags(
        tostring(node, encoding='utf-8', method="html",
                 pretty_print=True).decode('utf-8'),
        'div')
def save(self, *args, **kwargs):
    if self.text is not None:
        self.text = remove_tags(self.text, "font span")
    super(Recipe, self).save(*args, **kwargs)
def remove_part_tags(html, tags):
    return remove_tags(html, tags)
def __unicode__(self):
    if self.title:
        return remove_tags(self.title, 'a')
    else:
        return self.title
def getContent(self, obj):
    # First, we have to remove the preview tag.
    return remove_tags(obj.content, 'preview')
def save(self, *args, **kwargs):
    self.body = remove_tags(self.body, "font span")
    super(Post, self).save(*args, **kwargs)
def admin_thumbnail(self):
    thumbnail = remove_tags(self.content, "p br")
    return u"%s" % thumbnail