def add_desc(soup, incr_version=0.01):
    """Update the FB2 description metadata of *soup* in place.

    Ensures /description/document-info/version and
    /description/document-info/program-used exist, then bumps the version
    number by *incr_version* and appends ``_program_name`` to the
    program-used list.
    """
    # /description/document-info/program-used
    # /description/document-info/version
    add_if_not_exists(soup, soup.FictionBook.description,
                      ['document-info', 'version'])
    add_if_not_exists(soup, soup.FictionBook.description,
                      ['document-info', 'program-used'])
    di = soup.FictionBook.description.find('document-info', recursive=False)

    # Increase the version number; on unparseable text, log and leave it.
    version = di.version
    text = version.string
    if text:
        try:
            text = float(text)
        except ValueError:
            # Fix: was a bare `except:` — only a non-numeric version string
            # is expected here; anything else should propagate.
            traceback.print_exc()
        else:
            text += incr_version
            text = '%.2f' % text
            version.string.replaceWith(BeautifulSoup.NavigableString(text))
    else:
        # No version recorded yet: start at the conventional 0.01.
        version.insert(0, BeautifulSoup.NavigableString('0.01'))

    # Append this program's name to program-used (comma-separated list).
    program_used = di.find('program-used', recursive=False)
    text = program_used.string
    if text:
        text = '%s, %s' % (text, _program_name)
        program_used.string.replaceWith(BeautifulSoup.NavigableString(text))
    else:
        program_used.insert(0, BeautifulSoup.NavigableString(_program_name))
def linkify_text_node(node):
    """Replace URL substrings inside a text *node* with <a href=...> tags.

    The original text node is replaced in its parent by an alternating
    sequence of plain-text nodes and anchor tags.  Nodes containing no URL
    match are left untouched.
    """
    index = node.parent.contents.index(node)
    parent = node.parent
    string = unicode(node)
    matches = URL_RE.finditer(string)
    # Fix: use a raw string for the regex — '\W' without the r-prefix is an
    # invalid escape sequence in modern Python.
    end_re = re.compile(r'\W')
    new_content = []
    o = 0
    for m in matches:
        s, e = m.span()
        # Only linkify if there are no more characters after the link,
        # or the character after the link is not a 'word character'
        # (avoids splitting a longer token in half).
        if e >= len(string) or end_re.match(string[e]):
            # NOTE(review): `self` is not defined in this function's scope;
            # this code appears to have been lifted from a method. Confirm
            # `self._soup` is reachable where this actually runs.
            link = BeautifulSoup.Tag(self._soup, 'a',
                                     attrs=[('href', m.group())])
            link_text = BeautifulSoup.NavigableString(m.group())
            link.insert(0, link_text)
            if o < s:
                # BeautifulSoup can't cope when we insert an empty text node
                previous_text = BeautifulSoup.NavigableString(string[o:s])
                new_content.append(previous_text)
            new_content.append(link)
            o = e
    # Only do actual replacement if at least one link was produced.
    if o > 0:
        if o < len(string):
            final_text = BeautifulSoup.NavigableString(string[o:])
            new_content.append(final_text)
        # Replace the text node with the new nodes, preserving position.
        node.extract()
        for x in new_content:
            parent.insert(index, x)
            index += 1
def separate_strings(current, next):
    """Normalize a pair of adjacent nodes.

    If both are text nodes, merge them into one and return the merged
    node.  If only *current* is text, split any leading whitespace off
    into its own single-space node.  Otherwise (or after splitting)
    return *next* unchanged.
    """
    if not is_text(current):
        return next

    if is_text(next):
        # Two strings are beside each other: fuse them into one node.
        next.extract()
        merged = BeautifulSoup.NavigableString(unicode(current) + unicode(next))
        current.replaceWith(merged)
        return merged

    # `current` is as big as it is going to get; try to peel a leading
    # whitespace run off the front into a dedicated " " node.
    pieces = start_space.split(unicode(current))
    if len(pieces) > 1 and pieces[1]:
        # BeautifulSoup can't cope with empty text nodes, hence the
        # pieces[1] truthiness guard above.
        holder = current.parent
        pos = holder.contents.index(current)
        current.extract()
        # Insert the body first, then the space node in front of it.
        holder.insert(pos, BeautifulSoup.NavigableString(pieces[1]))
        holder.insert(pos, BeautifulSoup.NavigableString(" "))
    return next
def setter(self, text):
    """Set this element's text content.

    Replaces the existing text child in place when one exists, otherwise
    appends *text* as a new child; then re-points ``self.tag.string`` at
    the first child.
    """
    if self.tag.string:
        # A text child already exists: swap it out in place.
        self.tag.contents[0] = BeautifulSoup.NavigableString(text)
    else:
        # No text child yet; append the raw text.
        self.tag.append(text)
    # NOTE(review): assigning to .string shadows BeautifulSoup 3's `string`
    # lookup on the tag instance — confirm this is intentional.
    self.tag.string = self.tag.contents[0]
def setter(self, value):
    """Walk `parts` (an XPath-like path, from enclosing scope) down from
    self.doc, creating missing child tags, and set the final value.

    A 'text()' component sets the text of the current tag and stops;
    otherwise *value* is appended to the deepest tag reached.
    """
    tag = self.doc
    for part in parts:
        if part == '':
            # Empty path component (e.g. from a leading '/'): skip.
            continue
        elif part == 'text()':
            if tag.string:
                # Replace the existing text child in place.
                tag.contents[0] = BeautifulSoup.NavigableString(value)
            else:
                tag.append(value)
            # NOTE(review): assigning to .string shadows BS3's `string`
            # lookup on the tag — confirm this is intentional.
            tag.string = tag.contents[0]
            return
        else:
            # Descend into the named child, creating it if missing.
            child = tag.find(part)
            if not child:
                child = BeautifulSoup.Tag(self.doc, part)
                tag.append(child)
            tag = child
    # Path had no 'text()' terminator: append value to the final tag.
    tag.append(value)
def condense_whitespace():
    """Collapse every whitespace run in each text node to a single space."""
    for text_node in self.root.findAll(text=True):
        collapsed = any_space.sub(" ", unicode(text_node))
        text_node.replaceWith(BeautifulSoup.NavigableString(collapsed))
def makeTag(name, string=None, attrs=None):
    """Build a Tag called *name*.

    *attrs* (a mapping, optional) supplies attributes; *string*
    (optional) is appended as a text child.  Returns the new tag.
    """
    attr_items = None if attrs is None else attrs.items()
    tag = bs.Tag(writer, name, attr_items)
    if string is not None:
        tag.append(bs.NavigableString(string))
    return tag
def GenerateHTML(self, controller, minify=False):
    """Flatten self._soup into standalone HTML.

    Strips the doctype declaration, HTML imports and all scripts;
    inlines or rewrites stylesheets via *controller*; optionally drops
    comments when *minify* is set.  Returns the serialized HTML string.
    """
    soup = polymer_soup.PolymerSoup(str(self._soup))

    # Drop the <!DOCTYPE ...> declaration, if present.
    for node in soup.contents:
        if isinstance(node, BeautifulSoup.Declaration) and _IsDoctype(node):
            node.extract()

    # Imports, external scripts and inline scripts are all removed.
    for import_link in soup.findAll('link', rel='import'):
        import_link.extract()
    for external_script in soup.findAll('script', src=True):
        external_script.extract()
    for inline_script in soup.findAll('script', src=None):
        inline_script.extract()

    # Inline styles are run through the controller; empty results vanish.
    for style_tag in soup.findAll('style'):
        html = controller.GetHTMLForInlineStylesheet(str(style_tag.string))
        if not html:
            style_tag.extract()
            continue
        replacement = BeautifulSoup.Tag(soup, 'style')
        replacement.append(BeautifulSoup.NavigableString(html))
        style_tag.replaceWith(replacement)

    # External stylesheet hrefs are rewritten, or removed when the
    # controller yields nothing for them.
    for stylesheet_link in soup.findAll('link', rel='stylesheet'):
        html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
        if not html:
            stylesheet_link.extract()
            continue
        replacements = polymer_soup.PolymerSoup(html).findChildren()
        assert len(replacements) == 1
        stylesheet_link.replaceWith(replacements[0])

    # Comments only survive non-minified output.
    if minify:
        for comment in soup.findAll(
                text=lambda text: isinstance(text, BeautifulSoup.Comment)):
            comment.extract()

    return str(soup)
def encode_xml_specials(self):
    """Re-encode XML special characters left loose in text nodes.

    BeautifulSoup will let some dangerous xml entities hang around in
    the navigable strings. destroy all monsters.

    >>> c = Cleaner(auto_clean=True, encode_xml_specials=True)
    >>> c('<<<<<')
    u'<<<<'
    """
    for text_node in self.root.findAll(text=True):
        encoded = encode_xhtml_entities(unicode(text_node))
        text_node.replaceWith(BeautifulSoup.NavigableString(encoded))
def reassign_whitespace():
    """Walk text nodes from last to first, folding each whitespace-only
    node into the node that precedes it in document order."""
    follower = None  # the node that comes just after `node` in the document
    for node in reversed(self.root.findAll(text=True)):
        if is_text(follower) and not follower.strip():
            # `follower` holds only whitespace: append its text to `node`
            # and remove it from the tree.
            merged = BeautifulSoup.NavigableString(
                unicode(node) + unicode(follower))
            node.replaceWith(merged)
            follower.extract()
            node = merged
        follower = node
def createTextNode(self, data):
    """Wrap *data* in a NavigableString and return it as a Text node."""
    nav_string = BeautifulSoup.NavigableString(data)
    return Text(self, nav_string)
def _inject_na_span(page, style):
    """Work around blank fields in the scanned pages: insert an
    absolutely-positioned <div><span>N/A</span></div> at the top of the
    body so the downstream parser sees an explicit value."""
    new_div = BeautifulSoup.Tag(page, "div")
    new_div["style"] = style
    new_span = BeautifulSoup.Tag(page, "span")
    new_span.insert(0, BeautifulSoup.NavigableString("N/A"))
    new_div.insert(0, new_span)
    page.html.body.insert(0, new_div)


def main():
    """Parse pageNNN.html pairs (17..599) into home records and write
    them to homes.json.

    Each home spans two consecutive pages; several known typos/blanks in
    specific pages are patched before processing.
    """
    global homes
    global page_num
    print("reading pages...")
    page_num = 17
    while page_num < 600:
        curr_home = {}
        print("page " + str(page_num))
        page_one = BeautifulSoup.BeautifulSoup(open("page%03i.html" % page_num))
        # ** special case fix for page 329, a typo for the 'Beds and Rooms' **
        if page_num == 329:
            bad_span = page_one.find(text=re.compile("951"))
            bad_span.string.replaceWith("Beds and Rooms")
        process_page_one(page_one, curr_home)

        print("page " + str(page_num + 1))
        page_two = BeautifulSoup.BeautifulSoup(
            open("page%03i.html" % (page_num + 1)))
        # ** special case fixes for pages 98, 190 and 350: these pages have a
        # blank space rather than a value of 'N/A' in the field. We "fix"
        # this by injecting a value into the parsed page because it's cleaner
        # than putting a special case check in the processing code.
        # (Was four near-identical copies of the same snippet — now a helper.)
        if page_num + 1 == 98:
            _inject_na_span(page_two,
                            "top: 228px; left:481px; height:10px; width:10px;")
        elif page_num + 1 == 190:
            _inject_na_span(page_two,
                            "top: 232px; left:481px; height:10px; width:10px;")
        elif page_num + 1 == 350:
            _inject_na_span(page_two,
                            "top: 111px; left:249px; height:10px; width:10px;")
            _inject_na_span(page_two,
                            "top: 111px; left:518px; height:10px; width:10px;")
        if page_num + 1 == 292:
            # handle a typo for Little Mountain Place where the Care Services
            # has BC instead of BC Avg in a subheader
            bad_span = page_two.find(text=re.compile("BC$"))
            bad_span.string.replaceWith("BC Avg")
        process_page_two(page_two, curr_home)

        homes.append(curr_home)
        page_num += 2

    # Save as a json file.  Fix: use a context manager so the file handle
    # is closed even if serialization fails.
    print("writing...")
    with open("homes.json", "w") as outfile:
        outfile.write(json.dumps(
            homes,
            indent=4,
            sort_keys=True,
        ))
"xcodebuild -exportArchive -exportFormat APP -archivePath ~/Desktop/Flashlight.xcarchive -exportPath ~/Desktop/Flashlight.app" ) os.system("""pushd ~/Desktop zip -r Flashlight.zip Flashlight.app popd""") signature = subprocess.check_output([ "sh", "../Flashlight signing/sign_update.sh", os.path.expanduser("~/Desktop/Flashlight.zip"), "../Flashlight signing/dsa_priv.pem" ]).strip() import BeautifulSoup as bs soup = bs.BeautifulSoup(open("Appcast.xml").read()) c = soup.find("channel") item = c.find("item") new_item = bs.BeautifulSoup(str(item)) new_item.find("title").contents = [bs.NavigableString("Version " + v)] new_item.find("sparkle:releasenoteslink").contents = [ bs.NavigableString("http://flashlightupdates.42pag.es/" + v) ] enc = new_item.find("enclosure") enc['sparkle:version'] = vn enc['sparkle:dsasignature'] = signature enc['url'] = "https://github.com/nate-parrott/Flashlight/releases/download/v{0}/Flashlight.zip".format( v) enc['sparkle:shortversionstring'] = v c.insert(c.contents.index(item), new_item) open("Appcast.xml", "w").write(str(soup))
def clean(soup, toc, ref):
    """Post-process a rendered documentation page.

    Rewrites fragment links through the ToC, converts id attributes to
    classes, injects the page badge, breadcrumb, ToC and license blocks,
    then returns the page body re-tagged as a <div>.

    soup -- BeautifulSoup tree of the page
    toc  -- table of contents; used to resolve fragment hrefs (walk_id)
    ref  -- current page entry; supplies link targets, ancestry and ToC HTML
    """
    # Rewrite links: fragment links that don't resolve inside this page
    # are redirected to their ToC target.
    for link in soup.findAll('a'):
        href = link.get('href')
        if href is None:
            print >> sys.stderr, "WARNING: Link with no href:", link
            continue
        if href.startswith('#') and href != '#':
            href = href[1:]
            if soup.find(attrs={'id': href}) is not None or soup.find(
                    "a", attrs={'name': href}) is not None:
                # Link to an element in the page
                continue
            target = toc.walk_id(href)
            if target is None:
                print >> sys.stderr, "WARNING: Link to an unknown ToC entry \"%s\"" % href
                continue
            link['href'] = target.link(ref)
    # Access elements by id to keep a reference before removing their id
    # attribute (the loop below deletes the ids).
    headerElmt = soup.find("div", attrs={'id': 'header'})
    tocElmt = soup.find("div", attrs={'id': 'toc'})
    footerElmt = soup.find("div", attrs={'id': 'footer'})
    # Prefer class to id
    for id in [
            'header', 'toc', 'toctitle', 'preamble', 'content', 'footer',
            'footer-text'
    ]:
        elmt = soup.find(attrs={'id': id})
        if elmt is not None:
            elmt['class'] = (elmt.get('class', '') + ' ' + elmt['id']).strip()
            del elmt['id']
    # Add icon in header
    if headerElmt is not None:
        iconElmt = BeautifulSoup.Tag(soup, 'div', attrs={'class': 'page-badge'})
        headerElmt.insert(0, iconElmt)
    # Add breadcrumb in header (ancestors of this page, » -separated,
    # excluding the page itself).
    if headerElmt is not None:
        breadcrumbElmt = BeautifulSoup.Tag(soup, 'div',
                                           attrs={'class': 'breadcrumb'})
        for i, entry in enumerate(ref.get_ancestry()[:-1]):
            if i > 0:
                breadcrumbElmt.append(BeautifulSoup.NavigableString(u' » '))
            linkElmt = BeautifulSoup.Tag(soup, 'a',
                                         attrs={'href': entry.link(ref)})
            linkElmt.append(
                BeautifulSoup.NavigableString(
                    entry.title if not entry.is_root() else 'Docs'))
            breadcrumbElmt.append(linkElmt)
        headerElmt.insert(1, breadcrumbElmt)
    # Add ToC in header
    if tocElmt is not None:
        # Remove toc's noscript
        noscript = tocElmt.find("noscript")
        if noscript is not None:
            noscript.decompose()  # causes problems with subsequent soup.find()
        # Inject ToC: render this entry's ToC HTML into a buffer first.
        tocHTMLBuffer = cStringIO.StringIO()
        ref.write_html(tocHTMLBuffer, open=ref)
        tocHTML = tocHTMLBuffer.getvalue().decode('utf-8')
        tocHTMLBuffer.close()
        if tocHTML == u'':
            # Remove ToC if empty
            tocElmt.decompose()
        else:
            tocTags = BeautifulSoup.BeautifulSoup(tocHTML)
            tocElmt.append(tocTags)
            # Use a wrapper div
            wrapper = BeautifulSoup.Tag(soup, 'div',
                                        attrs={'class': 'tocwrapper'})
            tocElmt.replaceWith(wrapper)
            wrapper.append(tocElmt)
    # Add license in footer
    if footerElmt is not None:
        footerLicense = BeautifulSoup.BeautifulSoup("""
<div class="footer-license">
Except as otherwise noted, <span xmlns:dct="http://purl.org/dc/terms/" property="dct:title">WonderPush Documentation</span> by <a xmlns:cc="http://creativecommons.org/ns#" href="http://www.wonderpush.com/docs" property="cc:attributionName" rel="cc:attributionURL">WonderPush</a> is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>, and code samples are licensed under the <a rel="license" href="http://www.apache.org/licenses/LICENSE-2.0">Apache 2.0 License</a>.
</div>""")
        footerElmt.insert(0, footerLicense)
    # Return just the interesting html, not the boilerplate
    rtn = soup.body.extract()
    rtn.name = 'div'
    return rtn
documentContents = weoSoup.kml.document.contents for tag in documentContents: try: if tag.name == 'name': nameString = tag.string break except: pass overlayString = str(weoSoup.find('groundoverlay')) overlaySoup = BeautifulSoup.BeautifulSoup(overlayString) overlayContents = overlaySoup.contents for tag in overlayContents: if tag.name == 'name': tag.replaceWith('<name>' + nameString + '</name>') overlaySoup.groundoverlay['id'] = fileName visibilityTag = BeautifulSoup.NavigableString( '<visibility>0</visibility>') overlaySoup.groundoverlay.insert(0, visibilityTag) kmlOverlayString += str(overlaySoup).replace( 'groundoverlay', 'GroundOverlay').replace('latlonbox', 'LatLonBox').replace('icon', 'Icon') Folders = weoSoup.kml.document.findAll('folder') for folder in Folders: try: Coords = folder.placemark.point.coordinates.contents[0].split( ',') except: pass try: linRingCoords = folder.placemark.polygon.outerboundaryis.linearring.coordinates.contents[ 0].split(' ')[0]
def render_hashed(request, key, user, extracontext={}): ###Need to get all of the rendered html ###and integrate via Beautiful if 'TYPE' in extracontext: htmlrender = extracontext['TYPE'] == 'HTML' else: htmlrender = 'JS' if key is None: key = request.META['PATH_INFO'] empty = True if user is None: user = request.user retdict = get_cache_or_render(user, key, empty, forcerender=True, request=request, extracontext=extracontext) rendered_list = retdict['rendered_list'] ret = defaultdict(list) for i in rendered_list: for k, v in i.items(): if k != 'html': print k + ' ' + str(v) if type(i['html']) == ListType: for v, k in i['html']: soup = BeautifulSoup.BeautifulSoup(v) if i['type'] == 'html': ret[i['div']] = [soup] elif i['type'] == 'append': ret[i['div']].append(soup) else: #print i['div'] + ' : ' + i['type'] soup = BeautifulSoup.BeautifulSoup(i['html']) if '#pages' in ret and i['div'] != '#tab_ruler' and i[ 'type'] != 'html': text = ret['#pages'][0].find('div', {'id': i['div'][1:]}) #print text if text is not None: text.insert(0, BeautifulSoup.NavigableString(i['html'])) elif i['type'] == 'html': ret[i['div']] = [soup] elif i['type'] == 'append': ret[i['div']].append(soup) elif i['type'] == 'prepend': ret[i['div']].insert(0, soup) else: print i['type'] text = ret[i['type']][len(ret[i['type']]) - 1].find( 'div', {'id': i['div'][1:]}) #print text if text is not None: text.insert(0, BeautifulSoup.NavigableString(i['html'])) rendertype = retdict['rendertype'] final = {} for k, v in ret.items(): r = '' for val in v: r += val.prettify() if htmlrender: final[k[1:]] = r else: final[k] = r return { 'renders': final, 'object': retdict['object'], 'rendertype': rendertype, 'counts': retdict['counts'] }