def __init__(self, path):
    """
    Build this Chapter from the Excel-exported html file at ``path``.

    Attributes set here:
      self.unit    - the unit to which this Chapter belongs, taken from the
                     name of the directory holding ``path``
                     (e.g., 'Pathways & Advance Engineering')
      self.num,
      self.chapter - the pair returned by self.getChapterInfo() for the
                     base name of ``path``
      self.data    - TabData instances for each topic
                     NOTE(review): tabs are added with self.append() and
                     counted with len(self), so this is presumably the
                     backing list of a UserList-style base class - confirm

    Every worksheet except the one named 'Cover sheet' (case-insensitive)
    is wrapped in a TabData, numbered from 1 in the order kept, and
    appended to this object.

    Raises Exception when the file holds no x:ExcelWorkbook element.
    """
    self.data = []
    html = utils.getHtml(path)
    self.unit = os.path.basename(os.path.dirname(path))
    self.num, self.chapter = self.getChapterInfo(os.path.basename(path))

    workbook = RegExUtils.getTagPattern('x:ExcelWorkbook').search(html)
    if workbook is None:
        raise Exception("could not get TABS data from file (%s)" % path)
    print('found data')

    # strip x prefix from all elements so plain element names can be used
    # in the path below
    rec = XmlRecord(xml=workbook.group(0).replace('x:', ''))
    rec.xpath_delimiter = '/'
    sheets = rec.selectNodes(
        rec.dom, "ExcelWorkbook/ExcelWorksheets/ExcelWorksheet")

    print('creating %d tabs' % len(sheets))
    for sheet in sheets:
        tab = TabData(sheet, self.unit)
        if tab.name.lower() == 'cover sheet':
            # we ignore the 'Cover sheet'
            continue
        tab.num = len(self) + 1
        self.append(tab)
def pp(self):
    """
    Return the text of self.rec with its leaf elements rewritten,
    prefixed with an xml declaration.

    The text (self.rec.__repr__()) is scanned left to right with the
    pattern from self.getTagPat(); group 1 of each hit is used as the tag
    name. For every hit:
      - if the full-element pattern for that tag does not match at the
        hit, it is treated as an empty tag (e.g., <children/>) and
        copied unchanged;
      - if self.isLeaf() accepts the matched element, the output of
        self.fixLeaf() is emitted in its place;
      - otherwise only the first character of the tag is emitted and
        scanning resumes one character later, so elements nested inside
        are visited in turn.
    Text between hits is copied unchanged.

    A '.' is written to sys.__stderr__ every 10 iterations to show
    progress. After more than 50000 iterations a message is printed and
    sys.exit() is called.

    NOTE(review): the iteration counter is assumed to advance once per
    tag handled (loop level) - confirm against the original layout.
    """
    opener = self.getTagPat()
    source = self.rec.__repr__()
    print("pp processing %d lines" % len(source.split('\n')))

    total = len(source)
    pos = 0  # index into the original xml
    passes = 0
    out = ""
    while pos <= total:
        hit = opener.search(source, pos)
        if hit is None:
            # no further tags: keep the remainder as it is and finish
            out += source[pos:]
            break

        start = hit.start()
        out += source[pos:start]
        element = RegExUtils.getTagPattern(hit.group(1)).match(source, start)
        if not element:
            # we have an empty tag (e.g., <children/>), just add it and
            # continue after it
            out += hit.group()
            pos = start + len(hit.group())
        elif self.isLeaf(element):
            out += self.fixLeaf(element)
            pos = element.end()
        else:
            # not a leaf: emit a single character so the next search
            # starts inside this element
            out += source[start]
            pos = start + 1

        passes += 1
        if passes % 10 == 0:
            # print dot every once in a while to show progress
            sys.__stderr__.write('.')
        if passes > 50000:
            print("breaking because limit has been reached!")
            sys.exit()
    return '<?xml version="1.0" encoding="UTF-8" ?>\n%s' % out.strip()
def removeFontTags(content):
    """
    Return ``content`` with every FONT element replaced by group 1 of its
    match (the text captured between the opening and closing tag).

    The pattern comes from RegExUtils.getTagPattern("FONT"). Text outside
    FONT elements is copied unchanged, and the captured text is not
    scanned again, so a FONT element captured inside another one is left
    as it is.

    The previous implementation tried a match at every character
    position on a fresh slice of ``content`` and grew the result one
    character at a time, which is quadratic in the input length. A single
    left-to-right finditer() pass visits the same non-overlapping matches
    and the pieces are joined once at the end.
    """
    tagPat = RegExUtils.getTagPattern("FONT")
    pieces = []
    pos = 0  # end of the last FONT element consumed
    for m in tagPat.finditer(content):
        pieces.append(content[pos:m.start()])  # text before this element
        pieces.append(m.group(1))              # the element's inner text
        pos = m.end()
    pieces.append(content[pos:])               # text after the last element
    return "".join(pieces)
def getTopicData(self, path):
    """
    Read the html file at ``path`` and return its topic data.

    The first match of the 'table' tag pattern in the html is converted
    with utils.xcelHtml2Xml(); the result is handed to TopicRecord, and
    that record's ``parsed_data`` is returned.

    Raises Exception when no table is found in the html.
    """
    html = utils.getHtml(path)
    table = RegExUtils.getTagPattern('table').search(html)
    if table is None:
        raise Exception("Data Table not found!")
    return TopicRecord(utils.xcelHtml2Xml(table.group())).parsed_data
import sys, re from serviceclient import SimpleClient from JloXml import RegExUtils as rexml h2Tag_pat = rexml.getTagPattern ('h2') attr_pat = rexml.attrPattern('name') a_pat = rexml.getTagPattern('a') def get_modules(url): client = SimpleClient (url) html = client.getData() m = h2Tag_pat.findall(html) modules = [] if 0 and m: print 'm is TRUE' print ' there are %d in m' % len(m) print 'the first element is a %s' % type (m[0]) if not m: return [] for s in m: m2 = attr_pat.match (s) if not m2:
print 'getHtml() encoding: %s' % myEncoding print ' path:', path html = codecs.open(path, 'r', myEncoding).read() # html = open(path,'r').read() lines = html.split(linesep) if verbose: print '%d lines read' % len(lines) if linesep == '\r': print 'spliting on mac NL' html = u'\n'.join(lines) return html condPat = re.compile('<!\[if[\s]+.*?<!\[endif\]>', re.DOTALL) hrefPat = RegExUtils.attrPattern("href") tagPat = re.compile('<([^\s^/^<]+)[\s]*?[^<]*?>', re.DOTALL) # matches opening tag (e.g., '<table ...>' htmlCommentPat = re.compile('<!--.*?-->', re.DOTALL) def xcelHtml2Xml(html): """ clean up the html so it can be processed as XML. this involves stripping attributes, which often are not quoted """ clean = "" i = 0 while i < len(html): ch = html[i]