class TreeBuilder(_base.TreeBuilder): def documentClass(self): self.soup = BeautifulSoup("") return Element(self.soup, self.soup) def insertDoctype(self, name, publicId, systemId): self.soup.insert(0, Declaration(name)) def elementClass(self, name): return Element(Tag(self.soup, name), self.soup) def commentClass(self, data): return TextNode(Comment(data), self.soup) def fragmentClass(self): self.soup = BeautifulSoup("") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup) def appendChild(self, node): self.soup.insert(len(self.soup.contents), node.element) def testSerializer(self, element): return testSerializer(element) def getDocument(self): return self.soup def getFragment(self): return _base.TreeBuilder.getFragment(self).element
def extractLinks(postSoup):
    linkSoup = BeautifulSoup()
    for tag in postSoup.findAll("a"):
        # Tag.__contains__ checks child nodes, not attributes, so test the
        # attribute explicitly instead of `"href" in tag`
        if tag.get("href") is not None:
            linkSoup.insert(len(linkSoup), tag["href"])
    return linkSoup.renderContents()
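A minimal usage sketch for the extractor above, assuming the BeautifulSoup 3 API used throughout these snippets; the sample markup and the printed result are illustrative only.

from BeautifulSoup import BeautifulSoup

post_html = '<p>See <a href="http://example.com/a">this</a> and <a name="x">that</a>.</p>'
postSoup = BeautifulSoup(post_html)
# Only the first anchor carries an href, so only that URL ends up in the result.
print extractLinks(postSoup)   # http://example.com/a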
def setup_source(self): source_path = vfs.join('special://profile/', 'sources.xml') try: soup = vfs.read_file(source_path, soup=True) except: soup = BeautifulSoup() sources_tag = Tag(soup, "sources") soup.insert(0, sources_tag) if soup.find("video") == None: sources = soup.find("sources") if not sources: return video_tag = Tag(soup, "video") sources.insert(0, video_tag) video = soup.find("video") if len(soup.findAll(text="PVR Recordings")) < 1: pvr_source_tag = Tag(soup, "source") pvr_name_tag = Tag(soup, "name") pvr_name_tag.insert(0, "PVR Recordings") PVR_PATH_tag = Tag(soup, "path") PVR_PATH_tag['pathversion'] = 1 PVR_PATH_tag.insert(0, "pvr://recordings/active/Default/") pvr_source_tag.insert(0, pvr_name_tag) pvr_source_tag.insert(1, PVR_PATH_tag) video.insert(2, pvr_source_tag) string = "" for i in soup: string = string + str(i) vfs.write_file(source_path, string)
def merge_related_elems(self): """ search through sibling for related contents """ article = Soup('<div></div>') index = 0 threshold = max(10, self.top_candidate[READABILITY] * 0.2) siblings = [elem for elem in self.top_candidate.parent.contents] for elem in siblings: append = False if elem is self.top_candidate: append = True elif _has_attr(elem, READABILITY) and elem[READABILITY] >= threshold: append = True elif is_navigable_string(elem) or elem.name == 'p': text = _inner_text(elem) text_length = len(text) link_density = get_link_density(elem) if text_length >= 80 and link_density < 0.25: append = True elif text_length < 80 and link_density < 1e-5 and re.search(r'\.( |$)', text): append = True if append: _debug("sibling found: ", _attr(elem, 'id'), ' ', _attr(elem, 'class')) article.insert(index, elem) index += 1 self.article = article
def __init__(self, hl=None): soup = BeautifulSoup() doc = Tag(soup, 'DOC') docid = Tag(soup, 'DOCID') doctype = Tag(soup, 'DOCTYPE') datetime = Tag(soup, 'DATETIME') body = Tag(soup, 'BODY') headline = Tag(soup, 'HEADLINE') text = Tag(soup, 'TEXT') soup.insert(0, doc) doc.insert(0, docid) doc.insert(1, doctype) doc.insert(2, datetime) doc.insert(3, body) body.insert(0, headline) body.insert(1, text) doctype.insert(0, NavigableString(" BLOG TEXT ")) doctype['SOURCE'] = "blog" self.soup = soup self.docid = docid self.datetime = datetime self.headline = headline self.text = text self.initialPost = True if hl: self.setHeadline(hl)
def wikimarkdown(text, include_toc=True, target=None): from r2.lib.cssfilter import legacy_s3_url def img_swap(tag): name = tag.get('src') name = custom_img_url.search(name) name = name and name.group(1) if name and c.site.images.has_key(name): url = c.site.images[name] url = legacy_s3_url(url, c.site) tag['src'] = url else: tag.extract() nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, g.domain, renderer=snudown.RENDERER_WIKI) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) images = soup.findAll('img') if images: [img_swap(image) for image in images] if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def content_absolute_links(content, image=None): from django.contrib.sites.models import Site current_site = Site.objects.get(pk=settings.SITE_ID) def abs_url(url): parsed = urlparse.urlparse(url) if parsed.netloc == parsed.scheme == '': url = urlparse.urljoin('http://{0}'.format(current_site.domain), url) return url soup = BeautifulSoup(content) if image: img = Tag(soup, 'img', [('src', image)]) soup.insert(0, img) for link in soup.findAll('a'): link['href'] = abs_url(link['href']) for link in soup.findAll('img'): link['src'] = abs_url(link['src']) return unicode(soup)
def __init__(self, hl=None): soup = BeautifulSoup() doc = Tag(soup, "DOC") docid = Tag(soup, "DOCID") doctype = Tag(soup, "DOCTYPE") datetime = Tag(soup, "DATETIME") body = Tag(soup, "BODY") headline = Tag(soup, "HEADLINE") text = Tag(soup, "TEXT") soup.insert(0, doc) doc.insert(0, docid) doc.insert(1, doctype) doc.insert(2, datetime) doc.insert(3, body) body.insert(0, headline) body.insert(1, text) doctype.insert(0, NavigableString(" USENET TEXT ")) doctype["SOURCE"] = "usenet" self.soup = soup self.docid = docid self.datetime = datetime self.headline = headline self.text = text self.initialPost = True if hl: self.setHeadline(hl)
def AllCategories(request):
    print 'allcat'
    x = BeautifulSoup()
    #root = Tag(x,'ul', [('class', "tree"), ( 'id', "tree")])
    #x.insert(0,root)
    AllCategories = RECategory.objects.filter(parent__isnull=True).order_by('-number')
    AllAnswered = {}
    # keep only the most recent RELog (by date) for each category and log type
    for log in RELog.objects.filter(user=request.user).order_by('-date'):
        if not log.category_id in AllAnswered:
            AllAnswered[log.category_id] = {}
        if not log.type_log in AllAnswered[log.category_id]:
            AllAnswered[log.category_id][log.type_log] = log
    for category in AllCategories:
        print category.id
        nt = Tag(x, 'li', [("id", str(category.id))])
        log = AllAnswered.get(category.id)
        rating = ''
        if log:
            log = log.get(5)
            if log:
                rating = 'Оценка: ' + str(log.rating)
        div = Tag(x, 'div')
        div.string = rating
        div["class"] = "rating"
        #div["style"] = "width: 150px; float: right;"
        nt.insert(0, div)
        if category.is_3d:
            isDDD = "Есть"
        else:
            isDDD = "Нет"
        div = Tag(x, 'div')
        div.string = isDDD
        div["class"] = "is3d"
        #div["style"] = "margin-right: 0px;width: 110px; float: right;"
        nt.insert(0, div)
        div = Tag(x, 'div')
        div["class"] = "demo"
        #div["style"] = "margin-right: 0px;width: 110px; float: right;"
        div.string = str(category.type_category)
        nt.insert(0, div)
        div = Tag(x, 'div')
        div.string = category.name
        nt.insert(0, div)
        x.insert(0, nt)
        recurseCategories(category, nt, x, AllAnswered)
    res = x.prettify()
    #print res
    print 'endallcat'
    return res
def extractCode(postSoup):
    """ extract and clean up the code from a soup-ed post string, return a set of tokens"""
    codes = BeautifulSoup()
    for tag in postSoup.findAll("code"):
        codes.insert(len(codes), tag)
        tag.hidden = True
        if tag.string:
            tag.string = tag.string + u"\n"
    return codes.renderContents()
def save(self): soup = BeautifulSoup() root_tag = Tag(soup, 'Task') soup.insert(0, root_tag) i = 0 try: job_tag = Tag(soup, 'Job') job_tag.insert(0, NavigableString('%s' % self.owner_id)) root_tag.insert(i, job_tag) i = i+1 except AttributeError: raise ValueError("You must provide job id.") try: id_tag = Tag(soup, 'TaskID') id_tag.insert(0, NavigableString('%d' % self.id)) root_tag.insert(i, id_tag) i = i+1 except AttributeError: raise ValueError("You must provide task id.") try: if self.name: label_tag = Tag(soup, 'Label') label_tag.insert(0, NavigableString(self.name)) root_tag.insert(i, label_tag) i = i+1 except AttributeError: pass try: if self.description: description_tag = Tag(soup, 'Description') description_tag.insert(0, NavigableString(self.description)) root_tag.insert(i, description_tag) i = i+1 except AttributeError: pass try: if self.estimated_minutes: estimated_minutes_tag = Tag(soup, 'EstimatedMinutes') estimated_minutes_tag.insert(0, NavigableString('%d' % self.estimated_minutes)) root_tag.insert(i, estimated_minutes_tag) i = i+1 except AttributeError: pass print soup response = rest_client.Client("").POST(self.post, str(soup)) return Task(xml=response.content)
def wikimarkdown(text, include_toc=True, target=None): from v1.lib.template_helpers import make_url_protocol_relative # this hard codes the stylesheet page for now, but should be parameterized # in the future to allow per-page images. from v1.models.wiki import ImagesByWikiPage from v1.lib.utils import UrlParser from v1.lib.template_helpers import add_sr page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet") def img_swap(tag): name = tag.get('src') name = custom_img_url.search(name) name = name and name.group(1) if name and name in page_images: url = page_images[name] url = make_url_protocol_relative(url) tag['src'] = url else: tag.extract() nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, renderer=snudown.RENDERER_WIKI) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) images = soup.findAll('img') if images: [img_swap(image) for image in images] def add_ext_to_link(link): url = UrlParser(link.get('href')) if url.is_verbify_url(): link['href'] = add_sr(link.get('href'), sr_path=False) if c.render_style == 'compact': links = soup.findAll('a') [add_ext_to_link(a) for a in links] if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def save(self): soup = BeautifulSoup() root_tag = Tag(soup, 'Note') soup.insert(0, root_tag) i = 0 try: job_tag = Tag(soup, 'Job') job_tag.insert(0, NavigableString('%s' % self.owner_id)) root_tag.insert(i, job_tag) i = i+1 except AttributeError: raise ValueError("You must provide job id.") try: title_tag = Tag(soup, 'Title') title_tag.insert(0, NavigableString(self.title)) root_tag.insert(i, title_tag) i = i+1 except AttributeError: raise ValueError("You must provide note's title.") try: text_tag = Tag(soup, 'Text') text_tag.insert(0, NavigableString(self.text)) root_tag.insert(i, text_tag) i = i+1 except AttributeError: raise ValueError("You must provide note's text.") try: if self.folder: folder_tag = Tag(soup, 'Folder') folder_tag.insert(0, NavigableString(self.folder)) root_tag.insert(i, folder_tag) i = i+1 except AttributeError: pass try: if self.public: public_tag = Tag(soup, 'Public') public_tag.insert(0, NavigableString(str(self.public).lower())) root_tag.insert(i, public_tag) i = i+1 except AttributeError: pass response = rest_client.Client("").POST(self.post, str(soup)) return Note(xml=response.content)
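The save() and delete() snippets in this collection (Task, Note, Client, Contact) all build their XML request bodies the same way: create an empty soup, insert a root Tag, then insert child Tags whose only content is a NavigableString. A stripped-down sketch of that pattern using BeautifulSoup 3; the helper name and field list are illustrative, not part of the original API:

from BeautifulSoup import BeautifulSoup, Tag, NavigableString

def build_request_body(root_name, fields):
    # Builds a flat payload like <Root><Key>value</Key>...</Root>
    soup = BeautifulSoup()
    root = Tag(soup, root_name)
    soup.insert(0, root)
    for position, (key, value) in enumerate(fields):
        child = Tag(soup, key)
        child.insert(0, NavigableString(unicode(value)))
        root.insert(position, child)
    return str(soup)

# build_request_body('Note', [('Title', 'hello'), ('Text', 'world')])
# -> '<Note><Title>hello</Title><Text>world</Text></Note>'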
def userlist(request):
    x = BeautifulSoup()
    root = Tag(x, 'root')
    x.insert(0, root)
    for u in models.Group.objects.get(name='Курсанты').user_set.all():
        root.insert(0, '\n')
        root.insert(0, Tag(x, 'user', [
            ('uid', str(u.id)),
            ('username', u.username),
            ('first_name', u.first_name),
            ('last_name', u.last_name),
        ]))
    return HttpResponse(x)
def wikimarkdown(text, include_toc=True, target=None): from r2.lib.template_helpers import make_url_protocol_relative # this hard codes the stylesheet page for now, but should be parameterized # in the future to allow per-page images. from r2.models.wiki import ImagesByWikiPage from r2.lib.utils import UrlParser from r2.lib.template_helpers import add_sr page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet") def img_swap(tag): name = tag.get('src') name = custom_img_url.search(name) name = name and name.group(1) if name and name in page_images: url = page_images[name] url = make_url_protocol_relative(url) tag['src'] = url else: tag.extract() nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, renderer=snudown.RENDERER_WIKI) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) images = soup.findAll('img') if images: [img_swap(image) for image in images] def add_ext_to_link(link): url = UrlParser(link.get('href')) if url.is_reddit_url(): link['href'] = add_sr(link.get('href'), sr_path=False) if c.render_style == 'compact': links = soup.findAll('a') [add_ext_to_link(a) for a in links] if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def delete(self): soup = BeautifulSoup() client_tag = Tag(soup, 'Client') soup.insert(0, client_tag) try: id_tag = Tag(soup, 'ID') id_tag.insert(0, NavigableString('%d' % self.id)) client_tag.insert(0, id_tag) except AttributeError: raise ValueError("You must have id for delete operation.") response = rest_client.Client("").POST(self.delete_url, str(soup)) soup = BeautifulStoneSoup(response.content) if soup.status and soup.status.contents[0].lower() == 'error': raise ResponseStatusError(soup.errordescription.contents[0])
def find_wanted_content(self, soup):
    """ Finds wanted elements. """
    assert isinstance(soup, BeautifulSoup)
    new_soup = BeautifulSoup()
    for selector in self.wanted_tags_selector:
        tag = soup.find(**selector.soup)
        self.log.info('Looking for element %s...' % selector.out())
        if tag:
            self.log.info('found')
            new_soup.insert(0, tag)
        else:
            self.log.info('NOT FOUND')
    return new_soup
def reddit(post): print post.title print post.content soup = BeautifulSoup(post.content) imgur = soup.find('a', href=re.compile('imgur')) if imgur: src = imgur['href'] else: print "No imgur" qkme = soup.find('a', href=re.compile('qkme')) if qkme: src = qkme['href'] if urlparse.urlparse(src).hostname == "qkme.me": src = 'http://i.qkme.me/'+src[15:].split('?')[0] else: print "No meme neither" tumblr = soup.find('a', href=re.compile('tumblr')) if tumblr: src = tumblr['href'] else: print "No tumblr neither" src = None if src: url = urllib.urlopen(src).getcode() if url == 404: url = urllib.urlopen(src+'.jpg').getcode() if url == 200: src += '.jpg' else: url = urllib.urlopen(src+'.gif').getcode() if url == 200: src += '.gif' if url == 200: print "Embedding..." img = Tag(soup, "img", [("src", src)]) soup.insert(0, img) thumb = soup.find('img', src=re.compile('thumbs.redditmedia.com/')) if thumb: thumb.extract() print 'remove thumbs' post.summary = soup.renderContents() post.save() print post.summary #
def body_insertion(content, insertion, end=False):
    """Insert an HTML content into the body HTML node"""
    insertion = BeautifulSoup(insertion)
    soup = BeautifulSoup(content)

    if soup.body and end:
        soup.body.append(insertion)
    elif soup.body:
        soup.body.insert(0, insertion)
    elif not soup.body and end:
        soup.append(insertion)
    elif not soup.body:
        soup.insert(0, insertion)

    if USE_PRETTIFY:
        return soup.prettify()
    else:
        return soup.renderContents()
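A quick usage sketch for body_insertion above; the markup is illustrative and the exact output form depends on the module-level USE_PRETTIFY flag:

html = '<html><body><p>existing</p></body></html>'
# Prepends inside <body>: roughly <body><div id="banner">hi</div><p>existing</p></body>
print body_insertion(html, '<div id="banner">hi</div>')
# With end=True the insertion is appended instead:
# roughly <body><p>existing</p><div id="footer">bye</div></body>
print body_insertion(html, '<div id="footer">bye</div>', end=True)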
def wikimarkdown(text, include_toc=True, target=None): from r2.lib.cssfilter import legacy_s3_url nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, g.domain ) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
class TreeBuilder(_base.TreeBuilder): def __init__(self, namespaceHTMLElements): if namespaceHTMLElements: warnings.warn( "BeautifulSoup cannot represent elements in any namespace", DataLossWarning) _base.TreeBuilder.__init__(self, namespaceHTMLElements) def documentClass(self): self.soup = BeautifulSoup("") return Element(self.soup, self.soup, None) def insertDoctype(self, token): name = token["name"] publicId = token["publicId"] systemId = token["systemId"] if publicId: self.soup.insert( 0, Declaration("DOCTYPE %s PUBLIC \"%s\" \"%s\"" % (name, publicId, systemId or ""))) elif systemId: self.soup.insert( 0, Declaration("DOCTYPE %s SYSTEM \"%s\"" % (name, systemId))) else: self.soup.insert(0, Declaration("DOCTYPE %s" % name)) def elementClass(self, name, namespace): if namespace is not None: warnings.warn( "BeautifulSoup cannot represent elements in any namespace", DataLossWarning) return Element(Tag(self.soup, name), self.soup, namespace) def commentClass(self, data): return TextNode(Comment(data), self.soup) def fragmentClass(self): self.soup = BeautifulSoup("") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup, None) def appendChild(self, node): self.soup.insert(len(self.soup.contents), node.element) def testSerializer(self, element): return testSerializer(element) def getDocument(self): return self.soup def getFragment(self): return _base.TreeBuilder.getFragment(self).element
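For context, a tree builder like the one above is not used directly; it is handed to html5lib's parser. In the older html5lib releases that still shipped a BeautifulSoup-based builder, the wiring looked roughly like this (the "beautifulsoup" builder name is an assumption about that era's API, not taken from the snippet):

import html5lib
from html5lib import treebuilders

# "beautifulsoup" was the registered name of this builder in old html5lib releases
parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
soup = parser.parse("<p>Hello <b>world</b></p>")   # returns a BeautifulSoup document
print soup.prettify()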
def wikimarkdown(text, include_toc=True, target=None): from r2.lib.template_helpers import s3_https_if_secure # this hard codes the stylesheet page for now, but should be parameterized # in the future to allow per-page images. from r2.models.wiki import ImagesByWikiPage page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet") def img_swap(tag): name = tag.get('src') name = custom_img_url.search(name) name = name and name.group(1) if name and name in page_images: url = page_images[name] url = s3_https_if_secure(url) tag['src'] = url else: tag.extract() nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, renderer=snudown.RENDERER_WIKI) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) images = soup.findAll('img') if images: [img_swap(image) for image in images] if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def ConvertToTestHtml(quest): types = quest.type titles = quest.text quests_ids = [quest.id] answers = RETestAnswer.objects.filter(question__id__in=quests_ids) newbs = BeautifulSoup() pNode = Tag(newbs, 'p') newbs.insert(0,pNode) if quest.img: print 'Image!!!' print quest.img.url imageNode = Tag(newbs, 'image', [('src', quest.img.url)]) newbs.insert(0,imageNode) TitleNode = Tag(newbs, 'p') TitleNode.string = titles newbs.insert(0,TitleNode) i = 0 if types != 1: for answer in answers: radioname = 'ans' + str(i) nt = Tag(newbs,'input', [('type', 'radio'), ('type', radioname), ('name', 'answerradio'), ('value', str(answer.is_correct))]) nt.string = answer.name pNode.insert(len(pNode.contents), nt) pNode.insert(len(pNode.contents), Tag(newbs, 'br')) else: for answer in answers: radioname = 'ans' + str(i) nt = Tag(newbs,'input', [('type', 'text'), ('name', 'answertext'),('ans', answer.name)]) pNode.insert(len(pNode.contents), nt) pNode.insert(len(pNode.contents), Tag(newbs, 'br')) return newbs.prettify()
def wikimarkdown(text, include_toc=True, target=None): from r2.lib.template_helpers import media_https_if_secure # this hard codes the stylesheet page for now, but should be parameterized # in the future to allow per-page images. from r2.models.wiki import ImagesByWikiPage page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet") def img_swap(tag): name = tag.get('src') name = custom_img_url.search(name) name = name and name.group(1) if name and name in page_images: url = page_images[name] url = media_https_if_secure(url) tag['src'] = url else: tag.extract() nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, renderer=snudown.RENDERER_WIKI) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) images = soup.findAll('img') if images: [img_swap(image) for image in images] if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def get_malott_menu(today): url = "http://www.scrippscollege.edu/students/dining-services/index.php" resp = requests.get(url) soup = BeautifulSoup(resp.content) head = BeautifulSoup("<thead><tr><td colspan=3>Malott Commons</td></tr></thead>") target = soup.find("div", {"id": "right_column_content"}) target.extract() meals = [] for meal in target.findAll("ul"): meal.extract() meals += [meal] labels = [] for title in target.findAll("p"): title.extract() labels += [title] final_table = BeautifulSoup() table = Tag(final_table, "table") final_table.insert(0, table) table.insert(0, head) table["class"] = "mealtable" for meal in meals: tr = Tag(final_table, "tr") td = Tag(final_table, "td") tr.insert(0, td) td["class"] = "mealtime" td.contents = labels[1].contents table.insert(len(table.contents) - 1, tr) labels = labels[1:] for food in meal.findAll("li"): tr = Tag(final_table, "tr") td = Tag(final_table, "td") tr.insert(0, td) td.contents = food.contents table.insert(len(table.contents) - 1, tr) return final_table.prettify()
def reddit(value): soup = BeautifulSoup(value) imgur = soup.find("a", href=re.compile("imgur")) if imgur: src = imgur["href"] else: qkme = soup.find("a", href=re.compile("qkme")) if qkme: src = qkme["href"] if urlparse.urlparse(src).hostname == "qkme.me": src = "http://i.qkme.me/" + src[15:].split("?")[0] else: tumblr = soup.find("a", href=re.compile("tumblr")) if tumblr: src = tumblr["href"] else: src = None if src: url = urllib.urlopen(src).getcode() if url == 404: url = urllib.urlopen(src + ".jpg").getcode() if url == 200: src += ".jpg" else: url = urllib.urlopen(src + ".gif").getcode() if url == 200: src += ".gif" if url == 200: img = Tag(soup, "img", [("src", src)]) soup.insert(0, img) thumb = soup.find("img", src=re.compile("thumbs.redditmedia.com/")) if thumb: thumb.extract() print "remove thumbs" return soup.renderContents()
def wikimarkdown(text, include_toc=True, target=None): from r2.lib.cssfilter import legacy_s3_url def img_swap(tag): name = tag.get('src') name = custom_img_url.search(name) name = name and name.group(1) if name and c.site.images.has_key(name): url = c.site.images[name] url = legacy_s3_url(url, c.site) tag['src'] = url else: tag.extract() nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, renderer=snudown.RENDERER_WIKI) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) images = soup.findAll('img') if images: [img_swap(image) for image in images] if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def geo_term_extract(self, desc): data = values = { 'maxRows': '1', 'fuzzy': '1', 'country': 'EE', 'featureClass': 'P', 'operator': 'OR', 'username': self.geonames_user, 'q': desc.encode('utf-8') } data = urllib.urlencode(values) link = u"http://api.geonames.org/search" xmldata = urllib.urlopen(link, data) soup = BeautifulSoup(xmldata) # print soup.prettify() lng = '0' lat = '0' if len(soup.findAll("lat")) > 0: lng = soup.findAll("lng")[0].text lat = soup.findAll("lat")[0].text lat_f = float(lat) lng_f = float(lng) lat = '%.5f' % ((lat_f * 10000 + random.uniform(1, 80)) / 10000) lng = '%.5f' % ((lng_f * 10000 + random.uniform(1, 80)) / 10000) soup2 = BeautifulSoup() tag1 = Tag(soup2, "Point") tag2 = Tag(soup2, "coordinates") soup2.insert(0, tag1) tag1.insert(0, tag2) text = NavigableString(lng + "," + lat) tag2.insert(0, text) # print soup2 result = (soup2.__str__()).encode("utf-8") return [result, lat, lng]
def process(self, items): text = "\n".join(self.publish(item, level) for (item, level) in items) soup = BeautifulSoup(text) normalizer = getUtility(IURLNormalizer).normalize stack = [{'children': [], 'level': 0}] headings = soup.findAll(('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) for index, heading in enumerate(headings): level = int(heading.name[1]) hid = 'section-' + normalizer(heading.string) + '-%d' % (index + 1) title = u'' for string in heading.recursiveChildGenerator(): if isinstance(string, unicode): title += string.lstrip('123456789. ').strip() # Remove trivial headings if not title: heading.extract() continue entry = { 'title': title, 'id': hid, 'children': [], 'level': level, } i = 0 while level <= stack[-1]['level']: stack.pop() i += 1 stack[-1]['children'].append(entry) stack.append(entry) heading['id'] = hid if level == 1: heading.name = 'h2' heading['class'] = 'documentFirstHeading' # Make sure we start with a heading (default to 'own'). for child in soup.recursiveChildGenerator(): if isinstance(child, unicode): if child.strip('\n '): hid = 'section-0' title = self.context.Title().decode('utf-8') soup.insert(0, '<h2 id="%s">%s</h2>' % (hid, title)) # stack[0]['children'].insert( # 0, {'title': title, # 'id': hid, # 'children': [], # 'level': 2, # }) break elif child.name.startswith('h'): break while len(stack[0]['children']) == 1: stack[0] = stack[0]['children'].pop() return soup, stack[0]['children']
class Toc: def __init__(self, title, options): # defaults self.infile = sys.stdin self.outfile = sys.stdout self.tag_names = ['h2', 'h3'] self.toc_id = 'auto_toc' self.name_prefix = 'section' self.title = title # options for option, value in options: if option in ('-h', '--help'): usage() sys.exit() elif option in ('-t', '--tags'): self.tag_names = value.split(',') elif option in ('-i', '--infile'): self.infile = open(value, 'r') elif option in ('-o', '--outfile'): self.outfile = open(value, 'w') # process the html, create toc and print self.get_tags() if self.tag_list: self.id_tags() self.create_toc() self.output() # clean up self.infile.close() self.outfile.close() def get_tags(self): self.soup = BeautifulSoup(self.infile.read()) # check if there is an existing toc toc = self.soup.findAll(id=self.toc_id) for tag in toc: tag.extract() # check which of the mentioned tags are present tag_names = [] for tag_name in self.tag_names: tag_list = self.soup.findAll(tag_name) if tag_list: tag_names.append(tag_name) if len(tag_names) >= 2: break self.tag_names = tag_names # get tags self.tag_list = self.soup.findAll(self.tag_names) if self.tag_names else [] def id_tags(self): counts = [] self.toc_list = [] for item in self.tag_names: counts.append(0) for tag in self.tag_list: reset = False depth = 0 for index, tag_name in enumerate(self.tag_names): if reset == True: counts[index] = 0 if tag.name == tag_name: depth = index counts[index] += 1 reset = True name = self.name_prefix for count in counts: if count == 0: break name = '%(name)s_%(count)i' % { 'name': name, 'count': count } tag['id'] = name self.toc_list.append({ 'depth': depth, 'id': name, 'title': tag.text }) def create_toc(self): # lists will hold the last ol/ul elements at each depth lists = [] # setup the toc container toc = Tag(self.soup, 'div') toc['id'] = self.toc_id last_li = toc header = Tag(self.soup, 'h2') header_title = NavigableString('Contents') header.append(header_title) toc.append(header) for toc_item in self.toc_list: depth = toc_item['depth'] if len(lists) == depth: # this is the first time we're at this depth list_el = Tag(self.soup, 'ol') lists.append(list_el) last_li.append(list_el) elif depth > old_depth: # this is a new sub-tree list_el = Tag(self.soup, 'ol') lists[depth] = list_el last_li.append(list_el) old_depth = depth # set up the new item li = Tag(self.soup, 'li') a = Tag(self.soup, 'a') title = NavigableString(toc_item['title']) a.append(title) a['href'] = '#%(id)s' % { 'id': toc_item['id'] } li.append(a) lists[depth].append(li) last_li = li # insert the toc at the top of the html self.soup.insert(0, toc) def output(self): prefix = """<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <head> <title>%(title)s</title> </head> <body> """ % { 'title': self.title } suffix = """ </body> </html> """ self.outfile.write(prefix) self.outfile.write(self.soup.prettify()) self.outfile.write(suffix)
class XMLSettings(object): """Saves Settings in XML file""" # ------------------------------------------------------------------------------------- # Attributes # Default Settings Header HEADER = """<?xml version="1.0" encoding="UTF-8"?>\n""" HEADER_SETTINGS = """<{0}> </{0}>""" PAT_FORMAT = re.compile(r">\s*<") # White spaces PAT_FORMAT_2 = re.compile( r"<([a-z0-9]*)>\s*<(/\1)>") # New lines on elements # ------------------------------------------------------------------------------------- def filepath(): doc = "The filepath property to the path" def fget(self): return self._filepath def fset(self, value): self._filepath = value return locals() filepath = property(**filepath()) # ------------------------------------------------------------------------------------- def cache(): doc = "Cache property - do not reload file on access if true" def fget(self): return self._cache def fset(self, value): self._cache = value return locals() cache = property(**cache()) # ------------------------------------------------------------------------------------- # Private # ------------------------------------------------------------------------------------- def __init__(self, filepath, root=None, cache=False): super(XMLSettings, self).__init__() self.root = root self.filepath = filepath self.cache = cache if os.path.isfile(self.filepath): self._soup = BeautifulSoup(open(self.filepath)) if self.root is None: self.root = self._soup.first().name else: if self.root is None: self.root = "settings" self.HEADER_SETTINGS = self.HEADER_SETTINGS.format(self.root) if not os.path.isfile(self.filepath): self._soup = BeautifulSoup(self.HEADER + self.HEADER_SETTINGS) # ------------------------------------------------------------------------------------- def __len__(self): """docstring for __len__""" root = self._soup.find(self.root) if root is None: return 0 return len(root.findAll(recursive=False)) # ------------------------------------------------------------------------------------- def __iter__(self): """docstring for __len__""" root = self._soup.find(self.root) if root is None: raise StopIteration for element in root.findAll(recursive=False): yield (element.name, dict(element.attrs)) # ------------------------------------------------------------------------------------- def _set_element(self, root, tagname, text=None, attr=None): """Creates if not available an element at the soup root element :return: tag object or None :rtype: Tag """ # Add Topic if not available if attr is None: if root.find(re.compile(tagname + "$", re.I)) is None: new_tag = Tag(self._soup, tagname) root.insert(0, new_tag) else: if root.find(re.compile(tagname + "$", re.I), attr) is None: new_tag = Tag(self._soup, tagname, attr.items()) root.insert(0, new_tag) settings = self._soup.find(self.root) tag = settings.find(re.compile(tagname + "$", re.I)) # Something to insert if tag is not None and text is not None: if tag.text.strip() == "": tag.insert(0, NavigableString(text)) else: tag.contents[0].replaceWith(text) return tag # ------------------------------------------------------------------------------------- def _set(self, topic, key, value, topic_attr=None): """Set key and value at topic :return: success status :rtype: bool""" # In case it is an empty document if not unicode(self._soup).strip().startswith("<?xml"): self._soup.insert(0, NavigableString(self.HEADER)) # In case settings root is not defined settings = self._soup.find(self.root) if settings is None: self._soup.insert(1, Tag(self._soup, self.root)) settings = 
self._soup.find(self.root) # Add Topic topic_tag = self._set_element(settings, topic.lower(), attr=topic_attr) if topic_tag is None: return False # Add key and value key_tag = self._set_element(topic_tag, key.lower(), escape(value)) # Add "" since XML may introduce whitespaces. #key_tag = self._set_element(topic_tag, key, '"{0}"'.format(value)) return key_tag is not None # ------------------------------------------------------------------------------------- def _get(self, topic, key, topic_attr=None): """Get key at topic :return: success status :rtype: bool""" # In case settings root is not defined settings = self._soup.find(self.root) if settings is None: return None if topic_attr is None: topic_tag = settings.find(re.compile(topic + "$", re.I)) else: topic_tag = settings.find(re.compile(topic + "$", re.I), topic_attr) if topic_tag is None: return None key_tag = topic_tag.find(re.compile(key + "$", re.I)) if key_tag is None or len(key_tag.contents) < 1: return None value = unescape(key_tag.contents[0]).strip() #if value.startswith('"') and value.endswith('"'): # value = value.strip('"') return value # ------------------------------------------------------------------------------------- def _save(self, filepath=None): """Save the File""" if filepath is None: filepath = self.filepath with open(filepath, 'w') as f: # For the newline make sure that content is escaped pretty_content = self._soup.renderContents() pretty_content = self.PAT_FORMAT.sub(">\\n<", pretty_content) pretty_content = self.PAT_FORMAT_2.sub("<\\1><\\2>", pretty_content) f.write(pretty_content) # ------------------------------------------------------------------------------------- # Public # ------------------------------------------------------------------------------------- def set(self, topic, key, value, topic_attr=None): """Set key and value at topic :return: success status :rtype: bool""" # Won't even bother if "<" in topic or ">" in topic: return False if "<" in key or ">" in key: return False ret = self._set(topic, key, value, topic_attr=topic_attr) if ret == True: self._save() return ret # ------------------------------------------------------------------------------------- def get(self, topic, key, default_value=None, create=False, topic_attr=None): """Get key at topic :return: success status :rtype: bool""" if not os.path.isfile(self.filepath): return default_value # Won't even bother if "<" in topic or ">" in topic: return default_value if "<" in key or ">" in key: return default_value # Only reload if not cached if not self.cache: self._soup = BeautifulSoup(open(self.filepath)) ret = self._get(topic, key, topic_attr=topic_attr) if ret is None: if create: self.set(topic, key, default_value, topic_attr=topic_attr) return default_value return ret # ------------------------------------------------------------------------------------- def remove(self, topic, key=None): """Remove a complete topic or key from topic""" if not os.path.isfile(self.filepath): return False if "<" in topic or ">" in topic: return False if key is not None and ("<" in key or ">" in key): return False # Only reload if not cached if not self.cache: self._soup = BeautifulSoup(open(self.filepath)) # In case settings root is not defined settings = self._soup.find(self.root) if settings is None: return False topic_tag = settings.find(re.compile(topic + "$", re.I)) if topic_tag is None: return False # Delete the whole topic if key is None: topic_tag.extract() else: # Delete only key key_tag = topic_tag.find(re.compile(key + "$", re.I)) if key_tag is 
None: return False key_tag.extract() self._save() return True # ------------------------------------------------------------------------------------- def findall(self, topic, key, attr="name"): """docstring for finall""" entries = {} for name, attrs in sorted(self): if name.lower() == topic and attr in attrs: entries[attrs[attr]] = self.get(topic, key, topic_attr={attr: attrs[attr]}) return entries
def writeFile(self, cate): count = 0 for cm in self.commentInfo: commentData = "" soupComment = BeautifulSoup(commentData) isOk = False productName = cm['title'].encode('utf-8') linkProduct = "http://www.thegioididong.com" + cm['href'] print linkProduct productData = urllib2.urlopen(linkProduct) soup = BeautifulSoup(productData.read()) dt = soup.find('div',attrs={'id':'tgddComment'}) s = BeautifulSoup(str(dt).lower()) info = s.find('div',attrs={'id':'tgddcomment'}) url = self.getUrlComment(info['cateid'], info['detailid']) rattingUrl = self.getUrlRatting(info['cateid'], info['detailid']) rattingData = urllib2.urlopen(rattingUrl).read() jsonData = urllib2.urlopen(url) data = jsonData.read() if len(str(data)) > 0: rattingData = rattingData[2:] rattingData = rattingData[:-2] data = data[1:] data = data[:-1] commentData = json.loads(data) count += 1 isOk = True else: print productName + " No Comment\n" if(isOk): path = cate + "\\" + no_accent_vietnamese(productName).replace('/','-') + ".txt" print path fileData = open(path,"w") fileData.write("<name>" + productName + "</name>") fileData.write("\n") fileData.write("<link>" + linkProduct + "</link>") fileData.write("\n") fileData.write("<ratting>" + rattingData + "</ratting>") fileData.write("\n") fileData.write("<!-- Comments -->") fileData.write("\n") pos = -1 for cmData in commentData: pos += 1 id = cmData['Id'].encode('utf-8') try: parentId = cmData['ParentId'].encode('utf-8') except Exception: parentId = "null"; content = cmData['Content'].encode('utf-8') author = cmData['UserId'].encode('utf-8') date = cmData['CreatedDate'].encode('utf-8').replace("/Date(","") date = date.replace(")/", "") try: timeComment = self.convertTime(date) except Exception: timeComment = 'null' newcomment = Comment("null",content,author, timeComment, str(cmData['LikeCounts']),"null", id, parentId); if parentId == "0" or parentId == "null": comment = newcomment.makeAComment() soupComment.insert(pos,comment) else: soupComment.insert(pos,newcomment.makeASubComment()) fucksoup = BeautifulSoup(str(soupComment)) maintag = fucksoup.findAll('tag') subtag= fucksoup.findAll('subtag') for t in maintag: t.properties['reply'] = 0 for ts in subtag: if t.properties['id'] == ts.properties['parentid']: if t.properties['reply'] == 0: subcm = "\n\t\t" + str(ts) + "\n" + "::" else: subcm = "\t" + str(ts) + "\n" + "::" t.properties['reply'] += 1 t.comment.insert(t.properties['reply'],subcm) del(ts) ff = str(t).replace('::',(' '*4)) + "\n" fileData.write(ff) fileData.close() print no_accent_vietnamese(productName) + " Done \n" print count
def sunset_embed(body, request=False): # Moved the import down here to avoid a circular import from sunset.models import image self_closing = [ 'sunset', ] if body and "<sunset" in body: body_raw = BeautifulSoup(body, selfClosingTags=self_closing) imglist = body_raw.findAll('sunset') for imgtag in imglist: err = 'Unknown error parsing Sunset embed tag' new_tag = '' img_pk = imgtag.get('id', False) cur_type = imgtag.get('type', 'icon') if img_pk: img_check = image.objects.filter(pk=int(img_pk)).filter( access_query(request)).select_related('cat') if img_check: cur_img = img_check.first() asset_check = cur_img.assets.filter(type=cur_type) if asset_check: cur_asset = asset_check.first() new_tag = BeautifulSoup(selfClosingTags=self_closing) new_a = Tag(new_tag, 'a') new_img = Tag(new_tag, 'img') new_a['class'] = 'sunset_embed sunset_%s' % cur_type new_a['href'] = cur_img.get_absolute_url() new_a['title'] = cur_img new_img['alt'] = cur_img new_img['title'] = cur_img new_img['src'] = cur_asset.get_url() new_tag.insert(0, new_a) new_a.insert(0, new_img) err = False else: err = 'Sunset image asset type specified in embed tag was not found' else: err = 'Sunset image specified in embed tag was not found' else: err = 'Invalid or missing image ID in Sunset embed tag' if err: imgtag.replaceWith( Comment('%s. Original was: %s' % (err, imgtag))) else: imgtag.replaceWith(new_tag) return unicode(body_raw) else: # Nothing to do. return body
response = br.follow_link(text_regex=r"UK") data = br.response().read() # pick out anchors that are tagged with the story class soup = BeautifulSoup(data) # tags = soup.findAll("a", "story") tags = soup.findAll("a") newSoup = BeautifulSoup() base = "http://www.bbc.co.uk" for tag in tags: # add base url if it is missing from href if tag[u'href'][0] == "/": tag[u'href'] = base + tag[u'href'] # add tag to new soup followed by a <br> newSoup.insert(0, tag) newSoup.insert(0, Tag(soup, "br")) # convert soup into a string data = str(newSoup) # save scraped info to a file try: f = open("out.html", "w") f.write(data) f.close() except IOError, e: print e # display local file in browser try:
+ str(int(value_killer)) + '%')) divtag_t4 = Tag(htmldata, "div") divtag_t4.insert( 0, NavigableString('Total percentage of NOT USEFUL data: ' + str(int(value_unc)) + '%')) divtag_t5 = Tag(htmldata, "div") divtag_t5.insert( 0, NavigableString( 'NOTE: The chart takes into account also the simple affidability criteria' )) htmldata.insert(0, htmltag) htmltag.insert(0, headtag) headtag.insert(0, titletag) htmltag.insert(1, bodytag) bodytag.insert(0, divtag_wrap) divtag_wrap.insert(0, imgtag) divtag_wrap.insert(1, divtag_t1) divtag_wrap.insert(2, divtag_t2) divtag_wrap.insert(3, divtag_t3) divtag_wrap.insert(4, divtag_t4) divtag_wrap.insert(5, divtag_t5) #print(htmldata)
def LIST_MOVIES(): if (common.addon.getSetting('enablelibraryfolder') == 'true'): SetupAmazonLibrary() elif (common.addon.getSetting('customlibraryfolder') <> ''): CreateDirectory(MOVIE_PATH) CreateDirectory(TV_SHOWS_PATH) import movies as moviesDB movies = moviesDB.loadMoviedb(favorfilter=True) for asin, movietitle, url, poster, plot, director, writer, runtime, year, premiered, studio, mpaa, actors, genres, stars, votes, TMDBbanner, TMDBposter, TMDBfanart, isprime, watched, favor, TMDB_ID in movies: CreateStreamFile(movietitle, url, MOVIE_PATH) soup = BeautifulSoup() movie = Tag(soup, "movie") soup.insert(0, movie) movie.insert(0, createElement('title', movietitle + ' (Amazon)')) if year: movie.insert(1, createElement('year', str(year))) if premiered: movie.insert(1, createElement('premiered', premiered)) if plot: movie.insert(2, createElement('plot', plot)) if runtime: movie.insert(2, createElement('runtime', runtime)) if votes: movie.insert(3, createElement('votes', str(votes))) if stars: movie.insert(4, createElement('rating', str(stars))) if director: movie.insert(5, createElement('director', director)) if studio: movie.insert(6, createElement('studio', studio)) if poster: movie.insert(7, createElement('thumb', poster)) if mpaa: movie.insert(8, createElement('mpaa', mpaa)) u = sys.argv[0] u += '?url="' + urllib.quote_plus(url) + '"' u += '&mode="play"' u += '&name="' + urllib.quote_plus(movietitle) + '"' utrailer = u + '&sitemode="PLAYTRAILER"' movie.insert(9, createElement('trailer', utrailer)) fileinfo = createElement('fileinfo', '') streamdetails = createElement('streamdetails', '') audio = createElement('audio', '') audio.insert(0, createElement('channels', '2')) audio.insert(1, createElement('codec', 'aac')) streamdetails.insert(0, audio) video = createElement('video', '') video.insert(0, createElement('codec', 'h264')) video.insert(1, createElement('height', '400')) video.insert(2, createElement('width', '720')) video.insert(4, createElement('scantype', 'Progressive')) streamdetails.insert(1, video) fileinfo.insert(0, streamdetails) movie.insert(10, fileinfo) index = 10 if genres: for genre in genres.split(','): index += 1 movie.insert(index, createElement('genre', genre)) if actors: for actor in actors.split(','): if actor <> None: index += 1 actortag = createElement('actor', '') actorname = createElement('name', actor) actortag.insert(0, actorname) movie.insert(index, actortag) movieNFO = os.path.join(MOVIE_PATH, movietitle + '.nfo') file = open(movieNFO, 'w') file.write(str(soup)) file.close()
from BeautifulSoup import BeautifulSoup, Tag, NavigableString

soup = BeautifulSoup()
tag1 = Tag(soup, "person")
tag2 = Tag(soup, "name", [("first", "John"), ("last", "Smith")])
tag3 = Tag(soup, "location", [("country", "uk")])
soup.insert(0, tag1)
tag1.insert(0, tag2)
tag1.insert(1, tag3)
print soup

text = NavigableString("John Gary Smith")
tag2.insert(0, text)
print soup.prettify()
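For reference, after the NavigableString is inserted the final print should produce roughly the following (BeautifulSoup 3's prettify() indents one space per nesting level; exact whitespace may vary between versions):

<person>
 <name first="John" last="Smith">
  John Gary Smith
 </name>
 <location country="uk">
 </location>
</person>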
def mexhelpextract(mexnames): #print 'processing mex files: ' + mexnames.__repr__() from ConfigParser import RawConfigParser as ConfigParser, Error as error for mexname in mexnames: # ConfigParser for the three elements per subfunctions written to tmpdir # [SubFunction] # usage: 'xyz' # help: 'xyz' # seealso: 'xyz' config = ConfigParser({'usage': [], 'help': [], 'seealso': []}) # assemble command line for matlab matlabcmd = 'addpath(\'%s\');%s(\'%s\',\'%s\'); exit' % \ (_tmpdir, \ os.path.splitext(os.path.basename(_mexscript))[0], \ mexname, \ _tmpdir) cmd = 'matlab -nojvm -nodisplay -r "%s" > /dev/null' % matlabcmd # and execute matlab w/ the temporary script we wrote earlier try: print 'running MATLAB for %s in %s' % (mexname, _tmpdir) stdin, stderr = os.popen4(cmd) print stderr.read() stdin.close() stderr.close() except: print 'could not dump help for %s into %s' % (mexname, _tmpdir) cfgfile = config.read(os.path.join(_tmpdir, mexname)) if cfgfile == []: print "skipping " + mexname + " (no output)" continue subfunctions = config.sections() print 'processing subfunctions: ' + subfunctions.__repr__() for subfunction in subfunctions: # read in the strings for this subfunction usage = config.get(subfunction, 'usage') help = config.get(subfunction, 'help') seealso = config.get(subfunction, 'seealso') headline = '===[[' + subfunction + ' ' + mexname + '(\'' + subfunction + '\')]]===\n' breadcrumb = "==[[Psychtoolbox]] › [[" \ + mexname + "]].{mex*,dll} subfunction==\n\n" # scrub the text for main text only body = beackern(help) docstring = '' \ + '%%(matlab;Usage)' \ + usage \ + '%%\n' \ + body \ + '\n\n' if seealso: docstring = docstring + '<<=====See also:=====\n' + seealso + '<<' text = '""' + headline \ + breadcrumb \ + docstring + '""' # retrieve old body text, to update or concatenate with synonymous subfunctions # # browse the page title = re.sub("[^\w]|_", "", subfunction) try: resp = mech.open(baseurl + title + "/edit") except HTTPError, e: sys.exit( "retrieving old text during posting of this mex function failed: %d: %s" % (e.code, e.msg)) # get text from the edit form mech.select_form(nr=1) try: oldbody = mech["body"] except: print 'No id="body" form. Figure this out first. cf. page text above.' for form in mech.forms(): print form sys.exit( "retrieving old body text failed while processing page: " + baseurl + title + '/edit') # parse embedded structuring HTML tags in the wiki text soup = BeautifulSoup(oldbody) # check if the subfunction is already present, by CSS 'class' and 'id' subfct = soup.find('div', {'class': "subfct", 'id': mexname}) if subfct: # replace the text of the container DIV subfct.contents[0].replaceWith(text) else: # contruct new DIV to hold the text subfctDIV = Tag(soup, "div") subfctDIV['class'] = 'subfct' subfctDIV['id'] = mexname subfctDIV.insert(0, NavigableString(text)) # insert the new div soup.insert(len(soup), subfctDIV) # Now scoop the good well-formed divs out of the soup divs = soup('div', {'class': "subfct"}) # and drop them into fresh yummy cheese soup cheesesoup = BeautifulSoup() # drop good divs into the soup, one by one for div in divs: # remove the unneeded style attribute, we finally # have this stuff defined in the ptbdocs.css now. del (div['style']) # escape the HTML tags for wiki parser cheesesoup.append(NavigableString('\n""')) cheesesoup.append(div) cheesesoup.append(NavigableString('""\n')) post(subfunction, cheesesoup.renderContents())
def save(self): soup = BeautifulSoup() client_tag = Tag(soup, 'Client') soup.insert(0, client_tag) i = 0 method = "POST" try: id_tag = Tag(soup, 'ID') id_tag.insert(0, NavigableString('%d' % self.id)) client_tag.insert(i, id_tag) i = i+1 method = "PUT" except AttributeError: pass try: name_tag = Tag(soup, 'Name') name_tag.insert(0, NavigableString(self.name)) client_tag.insert(i, name_tag) i = i+1 except AttributeError: raise ValueError("You must provide client's name.") try: if self.address: address_tag = Tag(soup, 'Address') address_tag.insert(0, NavigableString(self.address)) client_tag.insert(i, address_tag) i = i+1 except AttributeError: pass try: if self.postal_address: postal_address_tag = Tag(soup, 'PostalAddress') postal_address_tag.insert(0, NavigableString(self.postal_address)) client_tag.insert(i, postal_address_tag) i = i+1 except AttributeError: pass try: if self.phone: phone_tag = Tag(soup, 'Phone') phone_tag.insert(0, NavigableString(self.phone)) client_tag.insert(i, phone_tag) i = i+1 except AttributeError: pass try: if self.fax: fax_tag = Tag(soup, 'Fax') fax_tag.insert(0, NavigableString(self.fax)) client_tag.insert(i, fax_tag) i = i+1 except AttributeError: pass try: if self.website: website_tag = Tag(soup, 'WebSite') website_tag.insert(0, NavigableString(self.website)) client_tag.insert(i, website_tag) i = i+1 except AttributeError: pass try: if self.referral_source: referral_source_tag = Tag(soup, 'ReferralSource') referral_source_tag.insert(0, NavigableString(self.referral_source)) client_tag.insert(i, referral_source_tag) except AttributeError: pass if method == "PUT": response = rest_client.Client("").PUT(self.put, str(soup)) else: response = rest_client.Client("").POST(self.post, str(soup)) return Client(xml=response.content)
def save(self): soup = BeautifulSoup() contact_tag = Tag(soup, 'Contact') soup.insert(0, contact_tag) i = 0 method = "PUT" try: id_tag = Tag(soup, 'ID') id_tag.insert(0, NavigableString('%d' % self.id)) contact_tag.insert(i, id_tag) i = i+1 except AttributeError: pass try: client_tag = Tag(soup, 'Client') client_id_tag = Tag(soup, 'ID') client_id_tag.insert(0, NavigableString('%d' % self.owner_id)) client_tag.insert(0, client_id_tag) contact_tag.insert(i, client_tag) i = i+1 method = "POST" except AttributeError: pass try: name_tag = Tag(soup, 'Name') name_tag.insert(0, NavigableString(self.name)) contact_tag.insert(i, name_tag) i = i+1 except AttributeError: raise ValueError("You must provide client's name.") try: if self.mobile: mobile_tag = Tag(soup, 'Mobile') mobile_tag.insert(0, NavigableString(self.mobile)) contact_tag.insert(i, mobile_tag) i = i+1 except AttributeError: pass try: if self.email: email_tag = Tag(soup, 'Email') email_tag.insert(0, NavigableString(self.email)) contact_tag.insert(i, email_tag) i = i+1 except AttributeError: pass try: if self.phone: phone_tag = Tag(soup, 'Phone') phone_tag.insert(0, NavigableString(self.phone)) contact_tag.insert(i, phone_tag) i = i+1 except AttributeError: pass try: if self.position: position_tag = Tag(soup, 'Position') position_tag.insert(0, NavigableString(self.position)) contact_tag.insert(i, position_tag) i = i+1 except AttributeError: pass if method == "PUT": response = rest_client.Client("").PUT(self.put % self.id, str(soup)) else: response = rest_client.Client("").POST(self.post, str(soup)) return Contact(xml=response.content)