Exemplo n.º 1
    def __init__(self, path):
        """
        Initialize the chapter from the HTML file at ``path``.

        self.unit - the unit to which this Chapter belongs (e.g., 'Pathways & Advance Engineering')
        self.data - TabData instances for each topic

        path - path of the HTML file; its base name is handed to
               self.getChapterInfo and the name of its parent directory
               becomes self.unit.

        Raises Exception when no 'x:ExcelWorkbook' element is found in the
        file contents.
        """
        self.data = []
        s = utils.getHtml(path)

        filename = os.path.basename(path)
        self.unit = os.path.basename(os.path.dirname(path))
        self.num, self.chapter = self.getChapterInfo(filename)
        tagPat = RegExUtils.getTagPattern('x:ExcelWorkbook')
        m = tagPat.search(s)
        if not m:
            # call form of raise is valid in both Python 2 and Python 3
            raise Exception("could not get TABS data from file (%s)" % path)
        print('found data')
        xml = m.group(0).replace('x:', '')  # strip x prefix from all elements

        rec = XmlRecord(xml=xml)
        rec.xpath_delimiter = '/'
        tabNodes = rec.selectNodes(
            rec.dom, "ExcelWorkbook/ExcelWorksheets/ExcelWorksheet")

        # we ignore the 'Cover sheet'
        print('creating %d tabs' % len(tabNodes))
        for tabElement in tabNodes:
            tabData = TabData(tabElement, self.unit)
            if tabData.name.lower() != 'cover sheet':
                # tabs are numbered from 1 in the order they are kept
                tabData.num = len(self) + 1
                # NOTE(review): nothing visible here stores tabData (e.g. in
                # self.data), so len(self) would never grow; the remainder of
                # this loop body may be missing from this listing - confirm.
Exemplo n.º 2
	def pp (self):
		"""
		Rebuild the string form of self.rec piece by piece and return it
		behind an XML declaration line.

		The text returned by self.rec.__repr__() is scanned with the pattern
		from self.getTagPat(); pieces are accumulated in a buffer, and the
		stripped buffer is appended to '<?xml version="1.0" encoding="UTF-8" ?>'
		to form the return value.

		NOTE(review): several branches in this listing look incomplete (see the
		inline notes); the code is left untouched until it can be checked
		against the original source.
		"""
		tagPat = self.getTagPat()
		i = 0 # index into the original xml
		s = self.rec.__repr__()
		print "pp processing %d lines" % len (s.split('\n'))
		reps = 0 # number of loop iterations so far
		buff = "" # output accumulated so far
		while i <= len(s):
			m = tagPat.search (s,i)
			if m is None:
				# no further tag: copy the rest of the text
				buff += s[i:]
				i = len(s)
			#	print 'pattern not found'

				# NOTE(review): the statements below are still inside
				# "if m is None", so m.start() would fail on None; an "else:"
				# appears to be missing here - confirm.
				j = m.start()
				buff += s[i:j]

				# print 'pattern found (%d): "%s"' % (j, m.group(1))
				tag = m.group(1)
				# try to match the complete element for this tag name, starting at j
				tagSet = RegExUtils.getTagPattern (tag).match(s, j)
				if tagSet:
					# print tagSet.group(1)  # this is the content of this tagSet
					if self.isLeaf (tagSet):
					#	print "%s is a leaf" % tag
						fixedLeaf = self.fixLeaf  (tagSet)
						# print 'fixed leaf: "%s"' % fixedLeaf
						buff += fixedLeaf 

##						print "%s is NOT a leaf" % tag
						# NOTE(review): judging by the disabled print above, the
						# next two lines look like the non-leaf branch (copy one
						# character and advance); as written they run in the
						# leaf branch - an "else:" appears to be missing.
						buff += s[j]
						i = j + 1
					# we have an empty tag (e.g., <children/>), just add it to buff and continue
					# NOTE(review): as written this runs whenever tagSet matched,
					# which contradicts the comment above - presumably this was
					# the "else" of "if tagSet"; confirm.
					buff += m.group()
					i = j + len(m.group())
##				print 'set i to %d' % i

			reps += 1
			# NOTE(review): the "if" below has only comments in its body, which
			# is a syntax error as written; a statement appears to be missing.
			if reps % 10 == 0:
				# print dot every once in a while to show progress
				# print "%d/%d" % (i, len(s))
			if reps > 50000:
				# NOTE(review): the message says "breaking" but no break
				# statement follows in this listing.
				print "breaking because limit has been reached!"
		return '<?xml version="1.0" encoding="UTF-8" ?>\n%s' % buff.strip()
Exemplo n.º 3
def removeFontTags(content):
    """
    Return ``content`` with every FONT element replaced by group(1) of its match.

    The text is walked left to right. Wherever the pattern returned by
    RegExUtils.getTagPattern("FONT") matches at the current position, only
    group(1) of the match is kept (presumably the element's inner content -
    the pattern itself is defined by the project helper) and the scan jumps
    past the whole match. Everywhere else the current character is copied
    unchanged.

    content - the text to clean up
    """
    tagPat = RegExUtils.getTagPattern("FONT")
    pieces = []
    i = 0
    while i < len(content):
        # match against the remaining text so m.end() is relative to i
        m = tagPat.match(content[i:])
        if m:
            pieces.append(m.group(1))
            i += m.end()
        else:
            # no FONT element starts here: keep the character and move on
            # (previously this case fell through to m.group(1) on None)
            pieces.append(content[i])
            i += 1
    return "".join(pieces)
Exemplo n.º 4
    def getTopicData(self, path):
        """
        Read an html file, extract the table containing the topic data, and
        return the parsed data of a TopicRecord built from that table.

        path - path of the html file (read via utils.getHtml)

        Returns the ``parsed_data`` attribute of the TopicRecord.
        Raises Exception when the html contains no table element.
        """
        html = utils.getHtml(path)
        tablePat = RegExUtils.getTagPattern('table')
        m = tablePat.search(html)

        if not m:
            # call form of raise is valid in both Python 2 and Python 3
            raise Exception("Data Table not found!")

        # the matched table markup is converted by utils.xcelHtml2Xml before
        # it is handed to TopicRecord
        xml = utils.xcelHtml2Xml(m.group())
        return TopicRecord(xml).parsed_data
Exemplo n.º 5
import sys, re
from serviceclient import SimpleClient
from JloXml import RegExUtils as rexml

# Module-level patterns, built once at import time with the RegExUtils
# helpers (imported above as rexml).
# h2Tag_pat is applied with findall() to the fetched html in get_modules.
h2Tag_pat = rexml.getTagPattern ('h2')
# attr_pat is matched against each h2 hit in get_modules.
attr_pat = rexml.attrPattern('name')
# Pattern for 'a' elements (its use is not visible in this listing).
a_pat = rexml.getTagPattern('a')

def get_modules(url):

	client = SimpleClient (url)

	html = client.getData()

	m = h2Tag_pat.findall(html)

	modules = []
	if 0 and m:
		print 'm is TRUE'
		print ' there are %d in m' % len(m)
		print 'the first element is a %s' % type (m[0])

	if not m:
		return []
	for s in m:
		m2 = attr_pat.match (s)
		if not m2:
Exemplo n.º 6
    print 'getHtml() encoding: %s' % myEncoding
    print '  path:', path
    html = codecs.open(path, 'r', myEncoding).read()
    # html = open(path,'r').read()

    lines = html.split(linesep)
    if verbose:
        print '%d lines read' % len(lines)
        if linesep == '\r':
            print 'spliting on mac NL'
    html = u'\n'.join(lines)
    return html

# Matches a conditional section from '<![if ...' through the next '<![endif]>'
# (non-greedy; DOTALL lets it span line breaks). Written as a raw string so the
# backslashes reach the regex engine without relying on invalid string escapes.
condPat = re.compile(r'<!\[if[\s]+.*?<!\[endif\]>', re.DOTALL)
# Pattern for the "href" attribute, built by the project helper
# RegExUtils.attrPattern (NOTE(review): its exact match semantics are defined
# by that helper and are not visible here).
hrefPat = RegExUtils.attrPattern("href")
# Matches an opening tag (e.g., '<table ...>'); group 1 is the tag name.
# A closing tag such as '</table>' does not match because '/' is excluded
# from the name. Raw string: the pattern text is unchanged.
tagPat = re.compile(r'<([^\s^/^<]+)[\s]*?[^<]*?>', re.DOTALL)
# Matches an HTML comment '<!-- ... -->' (non-greedy; may span line breaks).
htmlCommentPat = re.compile('<!--.*?-->', re.DOTALL)

def xcelHtml2Xml(html):
	clean up the html so it can be processed as XML.
	this involves stripping attributes, which often are not quoted
    clean = ""
    i = 0
    while i < len(html):
        ch = html[i]