def anchorArticles(txt):
    # find all text nodes starting with "Article", wrap each in a named <a>
    # and prepend a hoverable link to that anchor
    aregex = re.compile('^\s*Article\s+[0-9][0-9.,]*', re.I)
    nsoup = BeautifulSoup(txt)
    node = nsoup.find(text=aregex)
    while node:
        nodeidx = node.parent.contents.index(node)
        match = str(re.match(aregex, node).group())
        # create named <a>
        name = match.replace(' ', '_')
        a = Tag(nsoup, 'a', [('name', name)])
        a.insert(0, match)
        # create a link that is displayed if the <a> is hovered
        link = Tag(nsoup, 'a', [('class', "anchorLink"), ('href', '#' + name)])
        link.insert(0, "#")
        # create a container for the a and the link
        hover = Tag(nsoup, 'span', [('class', 'hover')])
        hover.insert(0, a)
        hover.insert(0, link)
        node.parent.insert(nodeidx, hover)
        # cut the newly wrapped text from the original node
        newNode = NavigableString(node[len(match):])
        node.replaceWith(newNode)
        node = newNode.findNext(text=aregex)
    return str(nsoup)
def load_scripts(pkg):
    """
    Given a package expand ul#scripts to include the contents of any scripts
    """
    script_ul = SOUP.find("ul", {"id": "scripts"})
    script_ul.contents = []
    for f in os.listdir(pkg):
        if splitext(f)[1] != '.pkg':
            continue
        script_dir = join(pkg, f, 'Scripts')
        script_list = Tag(SOUP, 'ul')
        for script in os.listdir(script_dir):
            if script == "Tools":
                continue
            script_li = Tag(SOUP, 'li')
            script_li['class'] = 'code'
            script_path = join(script_dir, script)
            if isfile(script_path):
                script_li.append(join(f, 'Scripts', script))
                script_li.append(anchor_for_name(script_path))
                script_pre = Tag(SOUP, 'pre')
                script_pre.append(NavigableString(open(script_path).read()))
                script_li.append(script_pre)
            elif isdir(script_path):
                subscript_files = os.listdir(script_path)
                if not subscript_files:
                    continue
                script_li.append("%s Scripts" % join(f, 'Scripts', script))
                subscripts = Tag(SOUP, 'ul')
                for subscript in subscript_files:
                    subscript_path = join(script_path, subscript)
                    subscript_li = Tag(SOUP, 'li')
                    subscript_li.append(subscript)
                    subscript_li.append(anchor_for_name(subscript_path))
                    subscript_pre = Tag(SOUP, 'pre')
                    subscript_pre.append(
                        NavigableString(open(subscript_path).read()))
                    subscript_li.append(subscript_pre)
                    subscripts.append(subscript_li)
                script_li.append(subscripts)
            script_list.append(script_li)
        if script_list.contents:
            new_scripts = Tag(SOUP, 'li')
            new_scripts.append(NavigableString("%s Scripts" % f))
            new_scripts.append(script_list)
            script_ul.append(new_scripts)
def generateContentDivTag(baseDir, h3text):
    import __main__
    contentDivTag = Tag(formatSoup, 'div', attrs={'class': 'content band-content'})
    # embed the heading
    h3tag = Tag(formatSoup, 'h3')
    h3tag.append(NavigableString(h3text))
    contentDivTag.append(h3tag)
    # generate the HTML
    for file in os.listdir(PARENT_DIR + baseDir):
        if file.endswith(SHTML_EXT):
            # generate the band-name ul tag
            progreUlTag = generateUlTag('/' + baseDir, file, 'column')
            albumLiTag = Tag(formatSoup, 'li')
            progreUlTag.append(albumLiTag)
            # generate the album-name ul tags
            fileSoup = BeautifulSoup(open('/'.join([PARENT_DIR, '/' + baseDir, file])))
            albumList = []
            for albumClassTag in fileSoup.findAll('a', {'class': 'album-name'}):
                albumList.append(albumClassTag['href'].split('/')[-1])
                __main__.contentCount += 1
            albumDir = '/'.join([baseDir, file.split('.')[0]])
            for album in albumList:
                albumUlTag = generateUlTag('/' + albumDir, album, 'child-column')
                albumLiTag.append(albumUlTag)
            contentDivTag.append(progreUlTag)
    return contentDivTag
def _set(self, topic, key, value, topic_attr=None):
    """Set key and value at topic

    :return: success status
    :rtype: bool
    """
    # In case it is an empty document
    if not unicode(self._soup).strip().startswith("<?xml"):
        self._soup.insert(0, NavigableString(self.HEADER))
    # In case settings root is not defined
    settings = self._soup.find(self.root)
    if settings is None:
        self._soup.insert(1, Tag(self._soup, self.root))
        settings = self._soup.find(self.root)
    # Add topic
    topic_tag = self._set_element(settings, topic.lower(), attr=topic_attr)
    if topic_tag is None:
        return False
    # Add key and value
    key_tag = self._set_element(topic_tag, key.lower(), escape(value))
    # Add "" since XML may introduce whitespaces.
    #key_tag = self._set_element(topic_tag, key, '"{0}"'.format(value))
    return key_tag is not None
def _set_element(self, root, tagname, text=None, attr=None):
    """Creates an element at the soup root element if it is not available

    :return: tag object or None
    :rtype: Tag
    """
    # Add topic if not available
    if attr is None:
        if root.find(re.compile(tagname + "$", re.I)) is None:
            new_tag = Tag(self._soup, tagname)
            root.insert(0, new_tag)
    else:
        if root.find(re.compile(tagname + "$", re.I), attr) is None:
            new_tag = Tag(self._soup, tagname, attr.items())
            root.insert(0, new_tag)
    settings = self._soup.find(self.root)
    tag = settings.find(re.compile(tagname + "$", re.I))
    # Something to insert
    if tag is not None and text is not None:
        if tag.text.strip() == "":
            tag.insert(0, NavigableString(text))
        else:
            tag.contents[0].replaceWith(text)
    return tag
def select_calendar(month=None, year=None):
    now = datetime.now()
    day = now.day
    cal = calendar.HTMLCalendar()
    cal.setfirstweekday(6)
    month_table = cal.formatmonth(year, month)
    soup = BeautifulSoup(month_table)
    outfile = open("myHTML.html", 'w')
    for data in soup.findAll('td'):
        if data['class'] != "noday":
            days = data.findAll(text=True)
            for oneday in days:
                day = NavigableString(oneday)
                oneday.extract()
                addatag = Tag(soup, 'input')
                addatag['type'] = "submit"
                addatag['name'] = "meetingday"
                addatag['value'] = day
                data.insert(0, addatag)
    outfile.write(soup.prettify())
    outfile.close()
    infile = open("myHTML.html", 'r')
    calfile = ""
    for line in infile:
        calfile = calfile + line
    infile.close()
    return calfile
def replaceJavascript(base_url, soup):
    for js in soup.findAll('script', {'src': re.compile('.+')}):
        try:
            real_js = get_content(resolve_path(base_url, js['src']))
            # escape '</' so the inlined code cannot close the <script> tag early
            real_js = real_js.replace('</', '\\u003c/')
            js_tag = Tag(soup, 'script')
            js_tag.insert(0, NavigableString(real_js))
            js.replaceWith(js_tag)
        except Exception, e:
            print 'failed to load javascript from %s' % js['src']
            print e
def appendChild(self, node):
    if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
        newNode = TextNode(
            NavigableString(self.element.contents[-1] + node.element),
            self.soup)
        self.element.contents[-1].extract()
        self.appendChild(newNode)
    else:
        self.element.insert(len(self.element.contents), node.element)
        node.parent = self
def createParentUlTag(targetSoup):
    parentUlTag = Tag(targetSoup, 'ul', attrs={
        'class': 'xbreadcrumbs',
        'id': 'breadcrumbs'
    })
    topListTag = Tag(targetSoup, 'li')
    topAnchorTag = Tag(targetSoup, 'a', attrs={'href': SITE_DOMAIN})
    topAnchorTag.append(NavigableString('TOP'))
    topListTag.append(topAnchorTag)
    parentUlTag.append(topListTag)
    return parentUlTag
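# Usage sketch, not part of the original source: assumes BeautifulSoup 3 and the
# Tag/NavigableString imports used by createParentUlTag above; SITE_DOMAIN is a
# constant defined elsewhere in the original module, the value below is a
# hypothetical placeholder for illustration only.
from BeautifulSoup import BeautifulSoup

SITE_DOMAIN = 'http://example.com/'  # hypothetical value
page = BeautifulSoup('<div id="nav"></div>')
page.find('div', {'id': 'nav'}).append(createParentUlTag(page))
print page.prettify()
# the <div> now contains <ul class="xbreadcrumbs" id="breadcrumbs"> with the TOP link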
def generateUlTag(path, file, ulClass):
    # generate the band-name tag
    fileSoup = BeautifulSoup(open('/'.join([PARENT_DIR, path, file])))
    text = fileSoup.find('h1').renderContents()
    ulTag = Tag(formatSoup, 'ul', attrs={'class': ulClass})
    liTag = Tag(formatSoup, 'li')
    link = '/'.join([path, file])
    aTag = Tag(formatSoup, 'a', attrs={'href': link})
    aTag.append(NavigableString(text))
    liTag.append(aTag)
    ulTag.append(liTag)
    return ulTag
def insertBefore(self, node, refNode):
    index = self.element.contents.index(refNode.element)
    if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index - 1].__class__ == NavigableString):
        newNode = TextNode(
            NavigableString(self.element.contents[index - 1] + node.element),
            self.soup)
        self.element.contents[index - 1].extract()
        self.insertBefore(newNode, refNode)
    else:
        self.element.insert(index, node.element)
        node.parent = self
def findEntries(self):
    self.entries = []
    headers = map(lambda x: x.string, self.soup.findAll('h1')[2:])
    table = self.soup.findAll('div', id="bodyContent")[0]
    for table in table.findAll('table'):
        header = True
        for tr in table.findAll('tr'):
            if header:
                header = False
                continue
            i = 0
            for th in tr.findAll('td'):
                description = ''
                if i == 0:
                    name = ''.join(th.b.findAll(text=True)).replace(' ', '')
                    anchor = string.capitalize(urllib.quote(name.split('.')[0])) + "."
                    if anchor in headers:
                        url = self.baseURL + 'About:config_entries#' + anchor
                    else:
                        url = self.baseURL + 'About:config_entries'
                elif i == 1:
                    value = th.text
                elif i == 2:
                    if value:
                        article = 'a'
                        if value[0] == 'I':
                            article += 'n'
                        optionType = "it accepts " + article + " " + value.lower() + "."
                    synopsis = '"' + name + '"' + ' is a configuration option ' \
                        'for the Firefox web browser; ' + optionType + "<br>"
                    for tag in th.findAll('br'):
                        tag.insert(0, NavigableString("\n"))
                    description = ''.join(th.findAll(text=True))
                    description = description.rstrip().replace('\n', '<br>').strip()
                    expandedURL = 'href="' + self.baseURL
                    description = description.replace('href="/', expandedURL)
                    description = re.sub('<\s*b\s*>', '<i>', description)
                    description = re.sub('<\s*/\s*b\s*>', '</i>', description)
                    description = '<blockquote>' + description + '</blockquote>'
                    description = synopsis + description
                    i = -1
                    self.entries.append(
                        Entry(name, value, description.strip(), url))
                i += 1
def insertBefore(self, node, refNode):
    index = self._nodeIndex(node, refNode)
    if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index - 1].__class__ == NavigableString):
        # (See comments in appendChild)
        newStr = NavigableString(self.element.contents[index - 1] + node.element)
        oldNode = self.element.contents[index - 1]
        del self.element.contents[index - 1]
        oldNode.parent = None
        oldNode.extract()
        self.element.insert(index - 1, newStr)
    else:
        self.element.insert(index, node.element)
        node.parent = self
def getNotes(self, target_language=None):
    notes = self.getTranslatedReferences(fieldname='notesToEditors',
                                         target_language=target_language)
    texts = [note.getText() for note in notes]
    # insert number here...
    for i in range(len(texts)):
        soup = BeautifulSoup(texts[i])
        pTag = soup.p
        if not pTag:
            # can't do anything, continue
            continue
        substr = pTag.contents[0].string
        pTag.contents[0] = NavigableString(
            u'<span class="numbering">%d.</span> ' % (i + 1) + substr)
        texts[i] = str(soup)
    return texts
def get_list_for_key(name, children):
    """
    Takes a key and a dictionary containing its children and recursively
    generates HTML list items. Each item will contain the name and, if it
    has children, an unordered list containing those child items.
    """
    li = Tag(SOUP, "li")
    li.append(NavigableString(name))
    if children:
        ul = Tag(SOUP, "ul")
        for k, v in children.items():
            ul.append(get_list_for_key(k, v))
        li.append(ul)
    return li
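# Usage sketch, not part of the original source: assumes the BeautifulSoup 3
# imports used by get_list_for_key (Tag, NavigableString) and defines the
# module-level SOUP parser object the function expects. The nested dict is a
# hypothetical example input.
from BeautifulSoup import BeautifulSoup

SOUP = BeautifulSoup()
tree = {'usr': {'local': {}, 'share': {}}, 'etc': {}}
print get_list_for_key('filesystem', tree).prettify()
# renders one <li> whose nested <ul> elements mirror the dict structure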
def backupOriginal():
    fh = open(sourceFile, 'r')
    html = fh.read()
    fh.close()
    soap = BeautifulSoup(html)
    div = soap.find("div", {"class": "tabs2"})
    # Adding menu of letters at the end of navigation bar
    text = NavigableString(createMenu("All"))
    div.append(text)
    # div.insert(div.__len__(), createMenu("All"))
    html = soap.renderContents()
    output = open(PROJECT_LOCATION + "/doc/html/" + PREFIX + "All.html", "w")
    output.write(html)
    output.close()
def appendChild(self, node):
    if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
        # Concatenate new text onto old text node
        # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
        newStr = NavigableString(self.element.contents[-1] + node.element)

        # Remove the old text node
        # (Can't simply use .extract() by itself, because it fails if
        # an equal text node exists within the parent node)
        oldElement = self.element.contents[-1]
        del self.element.contents[-1]
        oldElement.parent = None
        oldElement.extract()

        self.element.insert(len(self.element.contents), newStr)
    else:
        self.element.insert(len(self.element.contents), node.element)
        node.parent = self
def main(pkg_file_name, html_file_name):
    global SOUP
    print "Generating %s from %s" % (html_file_name, pkg_file_name)
    pkg = expand_pkg(pkg_file_name)
    SOUP = BeautifulSoup(open("wtfUpdate.html").read())
    SOUP.find('title').contents = [
        NavigableString("wtfUpdate: %s" % basename(pkg_file_name))
    ]
    try:
        generate_package_report(pkg)
        html_file = open(html_file_name, 'w')
        html_file.write(str(SOUP))
        html_file.close()
    except RuntimeError, exc:
        print >> sys.stderr, "ERROR: %s" % exc
        sys.exit(1)
def replaceCss(base_url, soup):
    for css in soup.findAll('link', {'rel': 'stylesheet', 'href': re.compile('.+')}):
        try:
            real_css = get_content(resolve_path(base_url, css['href']))

            def replacer(result):
                try:
                    path = resolve_path(resolve_path(base_url, css['href']),
                                        result.groups()[0])
                    path = path.replace('"', '').replace("'", "")
                    print path
                    return u'url(%s)' % data_encode_image(path, get_content(path, True))
                except Exception, e:
                    print e
                    return u''

            style_tag = Tag(soup, 'style')
            style_tag.insert(0, NavigableString(re.sub(css_url, replacer, real_css)))
            css.replaceWith(style_tag)
        except Exception, e:
            print 'failed to load css from %s' % css['href']
            print e
def get_converted_html(self, soup):
    # remove notes for Chinese characters: tag <rt>
    #for nstring in soup.findAll({'rt': True}):
    #    nstring.extract()
    # remove tag <ruby> but keep the characters inside
    for pTag in soup.findAll({'p': True}):
        # only support calibre-converted ebooks, for NOW
        if pTag['class'] != 'calibre':
            continue
        new_content = u''
        for content in pTag.contents:
            if type(content) is NavigableString:
                new_content = new_content + content
            elif content.name == "ruby":
                for ruby_char in content.contents:
                    # ignore <rt>
                    if type(ruby_char) is NavigableString:
                        new_content = new_content + ruby_char
        #print(new_content)
        #continue
        # new_content is a sentence. send it to Google translate
        try:
            en_text = self._conversion_engine.convert(new_content)
        except Exception, err:
            en_text = "TRANSLATE ERROR"
            print(err)
            print(new_content)
        if len(new_content) > 0:
            br1 = Tag(soup, "br")
            br2 = Tag(soup, "br")
            idx = len(pTag.contents)
            pTag.insert(idx, br1)
            pTag.insert(idx + 1, NavigableString(" " + en_text))
            pTag.insert(idx + 2, br2)
def geo_term_extract(self, desc):
    data = values = {
        'maxRows': '1',
        'fuzzy': '1',
        'country': 'EE',
        'featureClass': 'P',
        'operator': 'OR',
        'username': self.geonames_user,
        'q': desc.encode('utf-8')
    }
    data = urllib.urlencode(values)
    link = u"http://api.geonames.org/search"
    xmldata = urllib.urlopen(link, data)
    soup = BeautifulSoup(xmldata)
    # print soup.prettify()
    lng = '0'
    lat = '0'
    if len(soup.findAll("lat")) > 0:
        lng = soup.findAll("lng")[0].text
        lat = soup.findAll("lat")[0].text
        lat_f = float(lat)
        lng_f = float(lng)
        lat = '%.5f' % ((lat_f * 10000 + random.uniform(1, 80)) / 10000)
        lng = '%.5f' % ((lng_f * 10000 + random.uniform(1, 80)) / 10000)
    soup2 = BeautifulSoup()
    tag1 = Tag(soup2, "Point")
    tag2 = Tag(soup2, "coordinates")
    soup2.insert(0, tag1)
    tag1.insert(0, tag2)
    text = NavigableString(lng + "," + lat)
    tag2.insert(0, text)
    # print soup2
    result = (soup2.__str__()).encode("utf-8")
    return [result, lat, lng]
def createHtmlPages():
    HTMLHeader = getHeader()
    HTMLFooter = getFooter()
    for list in LINKS:
        letter = list[0]
        html = HTMLHeader
        for item in list[1:]:
            html += item + "\n"
        html += HTMLFooter
        soap = BeautifulSoup(html)
        div = soap.find("div", {"class": "tabs2"})
        text = NavigableString(createMenu(letter))
        div.append(text)
        # div.insert(div.__len__(), createMenu(letter))
        html = soap.renderContents()
        path = PROJECT_LOCATION + "/doc/html/" + PREFIX + letter + ".html"
        output = open(path, "w")
        output.write(html)
        output.close()
        if letter == "A":
            output = open(sourceFile, "w")
            output.write(html)
            output.close()
        print PROJECT_LOCATION + "/doc/html/" + PREFIX + letter + ".html Done!"
res = subprocess.check_output(['curl', '-s', url])
soup = BeautifulStoneSoup(res)
stations = soup.findAll('station')

def station_distance_key(station):
    dist = station.find('distance_mi')
    # Airports don't have a distance_mi, calc via lat/long
    if dist == None:
        slat = float(station.find('lat').string)
        slong = float(station.find('lon').string)
        dist = math.sqrt((slat - lat)**2 + (slong - lon)**2)
    else:
        dist = float(dist.string)
    return dist

stations = sorted(stations, key=station_distance_key)
station = stations[0]
if station.parent.name == 'pws':
    api_url = "http://api.wunderground.com/weatherstation/WXCurrentObXML.asp?ID=%s" % urllib.quote(
        NavigableString.__str__(station.find('id').contents[0].string))
else:
    api_url = "http://api.wunderground.com/weatherstation/WXCurrentObXML/index.xml?query=%s" % urllib.quote(
        NavigableString.__str__(station.find('icao').contents[0].string))
res = subprocess.check_output(['curl', '-s', api_url])
soup = BeautifulStoneSoup(res)
res = "%s <fc=#AAAAFF>%iF</fc>" % (soup.find('city').string,
                                   int(float(soup.find('temp_f').string)))
print res
cache_file.write(res)
cache_file.close()
# the matching try: for this handler opens earlier in the script, before this excerpt
except:
    print "err"
def update_testCase_result(src, soup):
    #print src
    localtime = time.localtime()
    updateTime = "%s_%s_%s_%s_%s" % (localtime[0], localtime[1], localtime[2],
                                     localtime[3], localtime[4])
    head = soup.h1
    # update head
    head.contents[0].replaceWith("BU test report %s" % updateTime)
    table_map = {
        "BU sanity test result. URL:": [
            "U6_BU_CI",
        ],
    }
    if not re.search("fp_version", src):
        tc_name = re.search("name=(.*?) ", src).group(1).strip("HZ-").strip()
        verdict = re.search("verdict=(.*?) ", src).group(1).strip()
        assc = re.search("assc=(.*?) ", src).group(1).strip()
        tw = re.search("tw=(.*?) ", src).group(1).strip()
        mgw = re.search("mgw=(.*?) ", src).group(1).strip()
        script = re.search("script=(.*?) ", src).group(1).strip()
        boa = re.search("boa=(.*?) ", src).group(1).strip()
        nelmon = re.search("nelmon=(.*?) ", src).group(1).strip()
        link = re.search("link=(.*)", src).group(1).strip()
        try:
            tc = soup.find(text=tc_name)  # node of text: test case name in soup
            #print tc
            tc.previous['href'] = link  # update link
            verdict_list = tc.parent.parent.findNextSiblings(
                'td', limit=7)  # verdict, tw, nelmon, assc, script, mgw, boa
            #print verdict_list
            # update verdict
            if "PASS" == verdict:
                tc.parent.parent['bgcolor'] = "green"
                verdict_list[0]['bgcolor'] = "green"
                verdict_list[0].contents[0].replaceWith("PASS")
            elif "FAIL" == verdict:
                tc.parent.parent['bgcolor'] = "red"
                verdict_list[0]['bgcolor'] = "red"
                verdict_list[0].contents[0].replaceWith("FAIL")
            elif "WARNING" == verdict:
                tc.parent.parent['bgcolor'] = 'yellow'
                verdict_list[0]['bgcolor'] = 'yellow'
                verdict_list[0].contents[0].replaceWith('WARNING')
            # update TW
            if "PASS" == tw:
                verdict_list[1]['bgcolor'] = "green"
                verdict_list[1].contents[0].replaceWith("PASS")
            elif "FAIL" == tw:
                verdict_list[1]['bgcolor'] = "red"
                verdict_list[1].contents[0].replaceWith("FAIL")
            # update Nelmon
            if "PASS" == nelmon:
                verdict_list[2]['bgcolor'] = "green"
                verdict_list[2].contents[0].replaceWith("PASS")
            elif "FAIL" == nelmon:
                verdict_list[2]['bgcolor'] = "red"
                verdict_list[2].contents[0].replaceWith("FAIL")
            # update assc
            if "PASS" == assc:
                verdict_list[3]['bgcolor'] = "green"
                verdict_list[3].contents[0].replaceWith("PASS")
            elif "FAIL" == assc:
                verdict_list[3]['bgcolor'] = "red"
                verdict_list[3].contents[0].replaceWith("FAIL")
            # update script
            if "PASS" == script:
                verdict_list[4]['bgcolor'] = "green"
                verdict_list[4].contents[0].replaceWith("PASS")
            elif "FAIL" == script:
                verdict_list[4]['bgcolor'] = "red"
                verdict_list[4].contents[0].replaceWith("FAIL")
            # update mgw
            if re.search("PASS", mgw):
                verdict_list[5]['bgcolor'] = "green"
                verdict_list[5].contents[0].replaceWith("PASS")
            elif re.search("FAIL", mgw):
                verdict_list[5]['bgcolor'] = "red"
                verdict_list[5].contents[0].replaceWith("FAIL")
            elif re.search("ALERT|CRITICAL", mgw):
                verdict_list[5]['bgcolor'] = "#800000"
                verdict_list[5].contents[0].replaceWith("CRITICAL")
                tc.parent.parent['bgcolor'] = "#800000"
            # update boa
            if "PASS" == boa:
                verdict_list[6]['bgcolor'] = "green"
                verdict_list[6].contents[0].replaceWith("PASS")
            elif "FAIL" == boa:
                verdict_list[6]['bgcolor'] = "red"
                verdict_list[6].contents[0].replaceWith("FAIL")
        except:
            print "%s hasn't been included in BU test cases, please contact the BU team" % tc_name
    else:
        execution_name = re.search("execution=(.*?) ", src).group(1).strip()
        mgw_version = re.search("mgw_version=(.*?)il", src).group(1).strip()
        #il_version = re.search("il_version=(.*?) ", src).group(1).strip()
        #fp_version = re.search("fp_version=(.*?) ", src).group(1).strip()
        #prbs_version = re.search("prbs_version=(.*?) ", src).group(1).strip()
        url = re.search("url=(.*)", src).group(1).strip()
        # since there is "\n" at the end of every line, nextSibling is needed twice
        # if mgw_version, il_version, fp_version and prbs are NA or empty, then
        # update the info. Otherwise, skip.
        # update mgw_version
        MGW = soup.find(text="release lable:")
        if MGW.parent.nextSibling.nextSibling.contents[0] == "NA" or \
           MGW.parent.nextSibling.nextSibling.contents[0] == "":
            MGW.parent.nextSibling.nextSibling.contents[0].replaceWith(mgw_version)
        # update il_version
        #IL = soup.find(text="IL version:")
        #if IL.parent.nextSibling.nextSibling.contents[0] == "NA" or \
        #   IL.parent.nextSibling.nextSibling.contents[0] == "":
        #    IL.parent.nextSibling.nextSibling.contents[0].replaceWith(il_version)
        # update fp_version
        #FP = soup.find(text="FP version:")
        #if FP.parent.nextSibling.nextSibling.contents[0] == "NA" or \
        #   FP.parent.nextSibling.nextSibling.contents[0] == "":
        #    FP.parent.nextSibling.nextSibling.contents[0].replaceWith(fp_version)
        # update prbs_version
        #PRBS = soup.find(text="PRBs version:")
        #if PRBS.parent.nextSibling.nextSibling.a['href'] == "NA":
        #    PRBS.parent.nextSibling.nextSibling.a['href'] = prbs_version
        #    PRBS.parent.nextSibling.nextSibling.contents[0].contents[0].replaceWith(prbs_version)
        # update urls for executions
        for k in table_map.keys():
            n = 1
            for i in table_map[k]:
                #if re.search(i, prbs_version):  # Use in Open MGW
                if True:  # use in IPA
                    #print k
                    if soup.find(text=re.compile("%s.*" % k)) == None:
                        # if updating sanity test case results, go to the next execution
                        break
                    node = soup.find(text=re.compile("%s.*" % k)).parent
                    temp_soup = BeautifulSoup()
                    tag = Tag(temp_soup, 'a')
                    text = NavigableString("%s" % url)
                    tag.insert(0, text)
                    tag['href'] = "%s" % url
                    node.insert(n, tag)
                    #print node
                    n = n + 1
def mexhelpextract(mexnames):
    #print 'processing mex files: ' + mexnames.__repr__()
    from ConfigParser import RawConfigParser as ConfigParser, Error as error
    for mexname in mexnames:
        # ConfigParser for the three elements per subfunction written to tmpdir
        # [SubFunction]
        # usage: 'xyz'
        # help: 'xyz'
        # seealso: 'xyz'
        config = ConfigParser({'usage': [], 'help': [], 'seealso': []})
        # assemble command line for matlab
        matlabcmd = 'addpath(\'%s\');%s(\'%s\',\'%s\'); exit' % \
            (_tmpdir,
             os.path.splitext(os.path.basename(_mexscript))[0],
             mexname,
             _tmpdir)
        cmd = 'matlab -nojvm -nodisplay -r "%s" > /dev/null' % matlabcmd
        # and execute matlab w/ the temporary script we wrote earlier
        try:
            print 'running MATLAB for %s in %s' % (mexname, _tmpdir)
            stdin, stderr = os.popen4(cmd)
            print stderr.read()
            stdin.close()
            stderr.close()
        except:
            print 'could not dump help for %s into %s' % (mexname, _tmpdir)
        cfgfile = config.read(os.path.join(_tmpdir, mexname))
        if cfgfile == []:
            print "skipping " + mexname + " (no output)"
            continue
        subfunctions = config.sections()
        print 'processing subfunctions: ' + subfunctions.__repr__()
        for subfunction in subfunctions:
            # read in the strings for this subfunction
            usage = config.get(subfunction, 'usage')
            help = config.get(subfunction, 'help')
            seealso = config.get(subfunction, 'seealso')
            headline = '===[[' + subfunction + ' ' + mexname + '(\'' + subfunction + '\')]]===\n'
            breadcrumb = "==[[Psychtoolbox]] › [[" \
                + mexname + "]].{mex*,dll} subfunction==\n\n"
            # scrub the text for main text only
            body = beackern(help)
            docstring = '' \
                + '%%(matlab;Usage)' \
                + usage \
                + '%%\n' \
                + body \
                + '\n\n'
            if seealso:
                docstring = docstring + '<<=====See also:=====\n' + seealso + '<<'
            text = '""' + headline \
                + breadcrumb \
                + docstring + '""'
            # retrieve old body text, to update or concatenate with synonymous subfunctions
            #
            # browse the page
            title = re.sub("[^\w]|_", "", subfunction)
            try:
                resp = mech.open(baseurl + title + "/edit")
            except HTTPError, e:
                sys.exit(
                    "retrieving old text during posting of this mex function failed: %d: %s"
                    % (e.code, e.msg))
            # get text from the edit form
            mech.select_form(nr=1)
            try:
                oldbody = mech["body"]
            except:
                print 'No id="body" form. Figure this out first. cf. page text above.'
                for form in mech.forms():
                    print form
                sys.exit(
                    "retrieving old body text failed while processing page: "
                    + baseurl + title + '/edit')
            # parse embedded structuring HTML tags in the wiki text
            soup = BeautifulSoup(oldbody)
            # check if the subfunction is already present, by CSS 'class' and 'id'
            subfct = soup.find('div', {'class': "subfct", 'id': mexname})
            if subfct:
                # replace the text of the container DIV
                subfct.contents[0].replaceWith(text)
            else:
                # construct a new DIV to hold the text
                subfctDIV = Tag(soup, "div")
                subfctDIV['class'] = 'subfct'
                subfctDIV['id'] = mexname
                subfctDIV.insert(0, NavigableString(text))
                # insert the new div
                soup.insert(len(soup), subfctDIV)
            # Now scoop the good well-formed divs out of the soup
            divs = soup('div', {'class': "subfct"})
            # and drop them into fresh yummy cheese soup
            cheesesoup = BeautifulSoup()
            # drop good divs into the soup, one by one
            for div in divs:
                # remove the unneeded style attribute, we finally
                # have this stuff defined in the ptbdocs.css now.
                del div['style']
                # escape the HTML tags for wiki parser
                cheesesoup.append(NavigableString('\n""'))
                cheesesoup.append(div)
                cheesesoup.append(NavigableString('""\n'))
            post(subfunction, cheesesoup.renderContents())
def createElement(tagname, contents):
    soup = BeautifulSoup()
    element = Tag(soup, tagname)
    text = NavigableString(contents)
    element.insert(0, text)
    return element
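# Usage sketch, not part of the original source: createElement returns a detached
# BeautifulSoup 3 Tag that can be rendered directly or inserted into another soup.
from BeautifulSoup import BeautifulSoup

heading = createElement('h2', 'Release notes')
print str(heading)  # roughly '<h2>Release notes</h2>'
page = BeautifulSoup('<body></body>')
page.body.insert(0, heading)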
def main(argv):
    parser = argparse.ArgumentParser(
        description='Downloads a website into a format suitable for use with phishing frenzy')
    parser.add_argument("site_addr", action="store", help="Site address")
    args = parser.parse_args()
    site_addr = args.site_addr

    #########################################
    # Get stuff from config file
    #########################################
    config_file = "config/website_cloner.config"
    if os.path.exists(config_file):
        pass
    else:
        try:
            print "Specified config file not found. Copying example config file..."
            shutil.copyfile("config/website_cloner.default", config_file)
        except:
            print "Error copying default config file...quitting execution..."
            sys.exit()
    config = ConfigParser.SafeConfigParser()
    config.read(config_file)
    try:
        working_dir = config.get("general", "working_dir")
        header_text = config.get("html", "header_text")
        body_text = config.get("html", "body_text")
    except:
        print "Missing required config file sections. Check running config file against provided example\n"
        sys.exit()
    site_path = site_addr.replace("http://", "")
    site_path = site_path.replace("https://", "")
    working_dir = os.path.join(working_dir, site_path, '')
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)
    os.chdir(os.path.dirname(working_dir))

    #########################################
    # Get the site we are cloning
    #########################################
    if not site_addr[:4] == "http":
        site_addr = "http://" + site_addr
    try:
        site_text = urllib2.urlopen(site_addr).read()
    except:
        print "Could not open site...quitting..."
        sys.exit()
    #soup = BeautifulSoup(header_text + site_text)
    soup = BeautifulSoup(site_text)
    head = soup.find('head')
    head.insert(0, NavigableString(header_text))
    body = soup.find('body')
    body.insert(0, NavigableString(body_text))

    ###############################################
    # Detect hyperlinked images and download locally
    ###############################################
    imageList = []
    for tag in soup.findAll('img', src=True):
        imageList.append(tag['src'])
    if not imageList:
        pass
    else:
        for url in imageList:
            try:
                filename = url.split('/')[-1].split('#')[0].split('?')[0]
                soup = BeautifulSoup(str(soup).decode("UTF-8").replace(url, filename).encode("UTF-8"))
                if not url.startswith('http'):
                    url = urllib2.urlparse.urljoin(site_addr, url)
                print "getting " + url + "..."
                open(filename, "wb").write(urllib2.urlopen(url, timeout=5).read())
            except:
                pass
    cssList = []
    for tag in soup.findAll('link', {'rel': 'stylesheet'}):
        cssList.append(tag['href'])
    if not cssList:
        pass
    else:
        for url in cssList:
            try:
                filename = url.split('/')[-1].split('#')[0].split('?')[0]
                soup = BeautifulSoup(str(soup).decode("UTF-8").replace(url, filename).encode("UTF-8"))
                if not url.startswith('http'):
                    url = urllib2.urlparse.urljoin(site_addr, url)
                print "getting " + url + "..."
                open(filename, "wb").write(urllib2.urlopen(url, timeout=5).read())
            except:
                pass
    scriptList = []
    for tag in soup.findAll('script', src=True):
        scriptList.append(tag['src'])
    if not scriptList:
        pass
    else:
        for url in scriptList:
            try:
                filename = url.split('/')[-1].split('#')[0].split('?')[0]
                soup = BeautifulSoup(str(soup).decode("UTF-8").replace(url, filename).encode("UTF-8"))
                if not url.startswith('http'):
                    url = urllib2.urlparse.urljoin(site_addr, url)
                print "getting " + url + "..."
                open(filename, "wb").write(urllib2.urlopen(url, timeout=5).read())
            except:
                pass

    ##########################################
    # Clean up html output and make it readable
    ##########################################
    mainpage = soup.prettify()
    # un-escape the entities that prettify introduced for the injected text
    mainpage = mainpage.replace('&lt;', '<')
    mainpage = mainpage.replace('&gt;', '>')
    open("index.php", "wb").write(mainpage)
chart_url += str(value_killed)
chart_url += ','
chart_url += str(value_killer)
chart_url += '&chtt=Twitter+Analysis+Chart'
#http://chart.apis.google.com/chart?chxl=0:|Policeman+Killed|Killed+by+police&chxs=0,676767,11.5,0,lt,676767&chxt=x&chbh=a,100&chs=300x200&cht=bvg&chco=FF0000&chd=t:30,70&chtt=Twitter+Analysis+Chart

# Now, create an HTML page with the information.
# The page is simple: head with title, body with a big div holding an image
# (the chart) and 5 additional divs with text
htmldata = BeautifulSoup()
htmltag = Tag(htmldata, "html")
headtag = Tag(htmldata, "head")
titletag = Tag(htmldata, "title")
titletag.insert(0, NavigableString('Twitter Stream Analysis Example'))
bodytag = Tag(htmldata, "body")
imgtag = Tag(htmldata, "img")
imgtag['src'] = chart_url
divtag_wrap = Tag(htmldata, "div")
divtag_t1 = Tag(htmldata, "div")
divtag_t1.insert(
    0, NavigableString('Total sentences analyzed: ' + str(total_sentences) +
                       ' taken from 400 public tweets'))
divtag_t2 = Tag(htmldata, "div")
divtag_t2.insert(
def makeImagesLocal(soup, params):
    """ deal with internal and external image references """
    for img in soup.findAll('img'):
        # 'internal' images are marked with class="internal resource"
        # in order to prevent image fetching later on
        if 'internal-resource' in (img.get('class') or ''):
            continue
        src = img['src']
        if params['request'] and src.startswith(params['request'].BASE0) \
           and '++resource++' not in src:
            src = src.replace(params['request'].BASE0 + '/', '')
        if src.startswith('http'):
            try:
                img_data = urllib2.urlopen(str(src)).read()
            except urllib2.URLError:
                LOG.warn('No image found: %s - removed from output' % src)
                img.extract()
                continue
            tmpname = tempfile.mktemp(dir=params['destdir'])
            file(tmpname, 'wb').write(img_data)
            img['src'] = os.path.basename(tmpname)
        else:
            # image with relative URL
            # first lookup image by direct traversal
            img_path = urllib.unquote(str(src))
            img_obj = params['context'].restrictedTraverse(img_path, None)
            if img_obj is None:
                img_path2 = getToolByName(
                    params['context'], 'portal_url').getPortalPath() + img_path
                img_obj = params['context'].restrictedTraverse(img_path2, None)
            if img_obj is None and 'resolveuid' in src:
                mo = uid_reg.search(src)
                if mo:
                    uid = mo.group(0)
                    img_obj = params['context'].reference_catalog.lookupObject(uid)
            # For scaled images ('_preview', '_large' etc.) use the original
            # image always (which is stored as acquisition parent)
            if img_obj:
                has_portal_type = hasattr(aq_base(img_obj.aq_inner), 'portal_type')
                if has_portal_type and img_obj.portal_type == img_obj.aq_parent.portal_type:
                    img_obj = img_obj.aq_parent
            if img_obj is None:
                # nothing found, check the next parent node with a 'path' parameter
                # referring to the origin document
                parent_container_path = pathFromParent(soup, img)
                if parent_container_path is not None:
                    img_obj = params['context'].restrictedTraverse(
                        '%s/%s' % (parent_container_path, img_path), None)
            # still nothing found
            if img_obj is None:
                img_split = img_path.split('/')
                if img_split[-1].startswith('image_') or img_split[-1].startswith('image-'):
                    img_path = '/'.join(img_split[:-1])
                for image_path in params['images']:
                    if image_path.endswith(img_path):
                        img_obj = params['context'].restrictedTraverse(image_path, None)
                        break
            # get hold of the image in original size
            if img_obj:
                # thumbnails have an Image as aq_parent
                if img_obj.aq_parent.portal_type == 'Image':
                    img_obj = img_obj.aq_parent
            if img_obj:
                img_data = None
                for attr in ['data', '_data']:
                    try:
                        img_data = str(getattr(img_obj, attr))
                        continue
                    except AttributeError:
                        pass
                if img_data == None:
                    LOG.warn('No image found: %s - removed from output' % img_path)
                    img.extract()
                    continue
                tmpname = tempfile.mktemp(dir=params['destdir'])
                file(tmpname, 'wb').write(img_data)
                img['src'] = os.path.basename(tmpname)
                # image scaling
                try:
                    scale = img_obj.getField('pdfScale').get(img_obj)
                except AttributeError:
                    scale = 100
                # add content-info debug information
                # don't add scale as style since the outer image-container
                # has the style set
                img['scale'] = str(scale)
                # now move <img> tag into a dedicated <div>
                div = Tag(soup, 'div')
                div['class'] = 'image-container'
                # div['style'] = 'width: %d%%' % scale
                div['scale'] = str(scale)
                div.insert(0, copy.copy(img))
                # image caption
                img_description = img_obj.Description()
                img_caption = Tag(soup, 'div')
                img_caption['class'] = 'image-caption'
                # exclude from image enumeration
                context = params['context']
                exclude_field = img_obj.getField('excludeFromImageEnumeration')
                if exclude_field and not exclude_field.get(img_obj):
                    span = Tag(soup, 'span')
                    classes = ['image-caption-text']
                    description = img_obj.Description()
                    if description:
                        classes.append('image-caption-text-with-text')
                    else:
                        classes.append('image-caption-text-without-text')
                    span['class'] = ' '.join(classes)
                    if description:
                        span.insert(0, NavigableString(description))
                    img_caption.insert(0, span)
                div.append(img_caption)
                img.replaceWith(div)
            else:
                LOG.warn('No image found: %s - not removed, keeping it' % img_path)
def insertText(self, data, insertBefore=None):
    text = TextNode(NavigableString(data), self.soup)
    if insertBefore:
        self.insertBefore(text, insertBefore)
    else:
        self.appendChild(text)
def main():
    """Create an XML database containing a word from the GNT, its PROIEL ID #
    and other PROIEL info."""
    aligned = codecs.open("aligned-gospels.wds", "rU", "utf-8")
    xml = codecs.open("proiel-GNT.xml", "rU", "utf-8")

    print "Parsing the PROIEL XML with BeautifulStoneSoup..."
    print

    proiel = BeautifulStoneSoup(xml)
    tokens = proiel.findAll('token')

    tok_dict = {}
    # creating a dictionary keyed by PROIEL IDs to speed up searching
    for token in tokens:
        tok_dict[token['id']] = token

    output = open("gospels-database.xml", "w")
    print >> output, "<div>"
    print >> output, "<title>Gospels</title>"

    count = 100001
    soup = BeautifulStoneSoup()
    word = Tag(soup, "word")

    print "Iterating through the alignment file..."
    print

    for line in aligned:
        stuff = line.split("\t")
        word = Tag(soup, "word")
        form = NavigableString(stuff[0])
        word.insert(0, form)
        # make it so that the IDs count up from 000000, not 100000
        word['id'] = str(count).replace("1", "0", 1)
        word['proiel-id'] = stuff[1]
        # adding attributes from the PROIEL XML
        if stuff[1] != "000000" and stuff[1] != "999999" and stuff[1] != "111111":
            token = tok_dict[stuff[1]]
            morph = token['morph-features'].split(",")
            word['lemma'] = morph[0]
            word['proiel-pos'] = morph[1]
            word['lang'] = morph[2]
            word['morph'] = morph[3]
            word['deprel'] = token['relation']
            try:
                word['proiel-head-id'] = token['head-id']
            except KeyError:
                word['proiel-head-id'] = "root"
            word['proiel-form'] = stuff[2].rstrip()
        count += 1
        print >> output, word
    print >> output, "</div>"

    print "Done!"
    print