def test_new_tag(self):
    """new_tag builds a tag from attrs, string children and tag children."""
    from souplib import new_tag

    # Keyword arguments become HTML attributes.
    # NOTE: assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(
        text_type(new_tag(self.soup, 'a', t='s')), '<a t="s"></a>')
    # A positional string argument becomes the tag's text content.
    self.assertEqual(
        text_type(new_tag(self.soup, 'a', 't')), '<a>t</a>')
    # A positional tag argument is appended as a child element.
    self.assertEqual(
        text_type(new_tag(self.soup, 'a', new_tag(self.soup, 'b'))),
        '<a><b></b></a>')
def refactor(self, soup, md):
    """Rebuild the parsed document into a fresh body, linking examples.

    :param soup: BeautifulSoup tree of the source document.
    :param md: metadata dict; ``md['refs']`` is filled from any references \
    paragraph encountered.
    :return: a new BeautifulSoup document containing only the refactored body.
    """
    d = BeautifulSoup('<body></body>')
    body = d.find('body')
    # Diagnostics counters; only consumed by the commented-out summary below.
    linked = 0
    notlinked = 0
    multiple = 0
    for p in self._chunks(soup):
        # _chunks may yield a single paragraph or a list of them; normalize.
        if not isinstance(p, list):
            p = [p]
        for pp in p:
            if pp.is_header:
                # Header chunks are dropped from the refactored output.
                continue
            elif pp.is_refs:
                # A references paragraph: parse each line into md['refs'].
                md['refs'] = [self.get_ref(line[0]) for line in pp.lines]
            else:
                ex = None  # id of a matched example, if any
                if pp.is_example:
                    # Examples are wrapped in a styled blockquote.
                    container = d.new_tag(
                        'blockquote',
                        **{
                            'class': 'example',
                            'style': 'font-size:100%;padding-left:1.8em;margin-left:0.3em'})
                    #body.append(Tag(name='hr'))
                else:
                    container = body
                for e, line, t in pp.lines:
                    body.append(e)
                    if pp.is_example:
                        # Hanging indent for numbered example lines like "(1)".
                        if re.match('\([0-9]+\)', line):
                            e.attrs['style'] = 'text-indent:-2em'
                        # Strip surrounding curly single quotes from the
                        # translation line before lookup (Python 2 str.decode).
                        equo = "’".decode('utf8')
                        if line.startswith("‘".decode('utf8')) and equo in line:
                            line = equo.join(line[1:].split(equo)[:-1]).strip()
                        # NOTE(review): indentation reconstructed — the lookup
                        # is assumed to run for every example line; confirm.
                        examples = self.examples.get(slug(line))
                        if examples:
                            if len(examples) > 1:
                                # Ambiguous match: do not link.
                                #print '~~~', line
                                multiple += 1
                            else:
                                ex = examples.values()[0]
                                #print '+++'
                                linked += 1
                        else:
                            #print '---', line
                            notlinked += 1
                    container.append(e)
                if pp.is_example:
                    if ex:
                        # Anchor the blockquote and add a link to the example.
                        container.attrs['id'] = 'ex-' + ex
                        container.append(new_tag(d, 'small', new_tag(
                            d, 'a', 'See example ' + ex, href='/sentences/' + ex)))
                    body.append(container)
    #print 'examples:', linked, 'linked,', notlinked, 'not linked,', multiple, 'multiple choices'
    # <font> carries no semantics: keep its children, drop the tag itself.
    for e in body.find_all('font'):
        e.unwrap()
    return d
def _paragraphs(self, soup):
    """Yield ``Paragraph`` chunks from the document body.

    First collapses recognized value tables (either real <table> elements or
    numbered runs of <p> elements) into a single 'value-table' marker, then
    groups the remaining top-level elements into paragraphs separated by BR
    lines. The trailing paragraph is flagged with ``refs=True`` when a
    'References' heading was seen.

    :param soup: BeautifulSoup tree; mutated in place.
    """
    lines = []
    refs = False
    # Pass 1: replace an embedded value table with a 'value-table' marker.
    # Regexes use raw strings: '\(', '\s', '\-', '\.' are invalid escapes
    # in plain string literals.
    for e in soup.find_all(['p', 'table']):
        t = text(e)
        if e.name == 'table':
            if re.match(r'[\-\s]+excl\s+', t) \
                    or re.match(r'[\-\s]*1\.[^0-9]+[0-9]+\s+2\.\s+', t):
                e.replace_with(new_tag(soup, 'p', 'value-table'))
                break
        if e.name == 'p':
            # A value table rendered as numbered paragraphs: "1. ... 23".
            if re.match(r'1\.\s+(.+?)\s+[0-9]+$', t):
                ex = []
                for p in next_siblings(e):
                    tt = text(p)
                    if p.name != 'p' or not re.match(
                            r'[0-9]\.\s+(.+?)\s+[0-9]+$', tt):
                        break
                    ex.append(p)
                if ex:
                    # Remove the run and substitute a single marker element.
                    for ee in ex:
                        ee.extract()
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break
    # Pass 2: group top-level elements into paragraphs, splitting on BR.
    for e, t in tag_and_text(soup.find_all(['p', 'table', 'ol', 'ul'])):
        if e.parent.name in ['li', 'td']:
            # Nested inside a list item or table cell: not top-level.
            continue
        br = t == self.BR
        if t in ['References', 'Reference']:
            refs = True
            t = ''
        elif not lines and re.match(r'[0-9]+\.\s+[A-Za-z]+(\s+[A-Za-z]+)*$', t):
            # Numbered section heading at a paragraph boundary.
            e.name = 'h3'
        elif not lines and re.match(r'[0-9]+\.[0-9]+\.\s+[A-Z]', t):
            # Numbered subsection heading.
            e.name = 'h4'
        elif t.endswith('and the APiCS Consortium'):
            # Boilerplate author line: skip entirely.
            continue
        if br and not refs:
            if lines:
                yield Paragraph(lines)
            lines = []
        if t and t != self.BR:
            lines.append((e, t, e.name))
    if lines:
        yield Paragraph(lines, refs=refs)
def _paragraphs(self, soup):
    """Yield Paragraph chunks, collapsing value tables and splitting on BR.

    NOTE(review): this appears to duplicate an earlier ``_paragraphs``
    definition; the later definition wins at class-creation time.
    """
    collected = []
    in_refs = False
    # Pass 1: substitute a recognized value table — either a real <table>
    # or a numbered run of <p> elements — with a 'value-table' marker.
    for elem in soup.find_all(['p', 'table']):
        txt = text(elem)
        if elem.name == 'table':
            if any(re.match(pat, txt) for pat in (
                    '[\-\s]+excl\s+', '[\-\s]*1\.[^0-9]+[0-9]+\s+2\.\s+')):
                elem.replace_with(new_tag(soup, 'p', 'value-table'))
                break
        if elem.name == 'p':
            if re.match('1\.\s+(.+?)\s+[0-9]+$', txt):
                run = []
                for sib in next_siblings(elem):
                    tt = text(sib)
                    if sib.name != 'p' or not re.match(
                            '[0-9]\.\s+(.+?)\s+[0-9]+$', tt):
                        break
                    run.append(sib)
                if run:
                    for member in run:
                        member.extract()
                    elem.replace_with(new_tag(soup, 'p', 'value-table'))
                    break
    # Pass 2: walk top-level elements, emitting a Paragraph at each BR.
    for elem, txt in tag_and_text(soup.find_all(['p', 'table', 'ol', 'ul'])):
        if elem.parent.name in ('li', 'td'):
            continue
        is_break = txt == self.BR
        if txt in ('References', 'Reference'):
            in_refs = True
            txt = ''
        elif not collected and re.match('[0-9]+\.\s+[A-Za-z]+(\s+[A-Za-z]+)*$', txt):
            elem.name = 'h3'
        elif not collected and re.match('[0-9]+\.[0-9]+\.\s+[A-Z]', txt):
            elem.name = 'h4'
        elif txt.endswith('and the APiCS Consortium'):
            continue
        if is_break and not in_refs:
            if collected:
                yield Paragraph(collected)
            collected = []
        if txt and txt != self.BR:
            collected.append((elem, txt, elem.name))
    if collected:
        yield Paragraph(collected, refs=in_refs)
def __call__(self, outdir):
    """
    Run the parser workflow and write results to disk.

    The workflow consists of

    - preprocess
    - refactor
    - postprocess

    and writes three files to ``outdir``: an html, a css and a json file,
    each named after ``self.id``.
    """
    # Silence cssutils' noisy parser warnings.
    cssutils_logger = logging.getLogger('CSSUTILS')
    cssutils_logger.setLevel(logging.ERROR)
    print(self.fname.namebase.encode('utf8'))
    with open(self.fname, encoding='utf8') as fp:
        c = fp.read()
    soup = BeautifulSoup(self.preprocess(self._preprocess(c)))

    # extract css from the head section of the HTML doc:
    css = cssutils.parseString('\n')
    for style in soup.find('head').find_all('style'):
        for rule in self.cssrules(style):
            css.add(rule)

    md = dict(outline=[], refs=[], authors=[])
    soup = self.refactor(soup, md)

    # enhance section headings: anchor each h3 and add navigation links.
    for section, t in tag_and_text(soup.find_all('h3')):
        # Drop any trailing "[Note ...]" marker from the heading text.
        t = t.split('[Note')[0]
        id_ = 'section-%s' % slug(t)
        md['outline'].append((t, id_))
        section.attrs['id'] = id_
        # Append a "go to top" arrow and a pilcrow permalink to the heading.
        for s, attrs in [
            (u'\u21eb', {'href': '#top', 'title': 'go to top of the page',
                         'style': 'vertical-align: bottom'}),
            ('¶', {'class': 'headerlink', 'href': '#' + id_,
                   'title': 'Permalink to this section'}),
        ]:
            append(section, soup.new_string('\n'), new_tag(soup, 'a', s, **attrs))

    # Serialize the body and post-process it with cross-references
    # (Python 2 ``unicode``).
    body = self.insert_links(unicode(soup.find('body')), md)

    # write output files:
    with open(outdir.joinpath('%s.html' % self.id), 'w', encoding='utf8') as fp:
        fp.write(self.wrap(self.postprocess(body)))

    with open(outdir.joinpath('%s.css' % self.id), 'wb') as fp:
        fp.write(self.csstext(css))

    md['authors'] = list(self.yield_valid_authors(md['authors']))
    jsondump(md, outdir.joinpath('%s.json' % self.id), indent=4)
def refactor(self, soup, md):
    """Assemble a new document body from the chunked source paragraphs.

    NOTE(review): this appears to duplicate an earlier ``refactor``
    definition in the same file; the later definition wins.

    :param soup: parsed source document.
    :param md: metadata dict updated in place (``md['refs']``).
    :return: new BeautifulSoup document with the rebuilt body.
    """
    d = BeautifulSoup('<body></body>')
    body = d.find('body')
    # Link statistics, reported only via the commented-out print below.
    linked = 0
    notlinked = 0
    multiple = 0
    for p in self._chunks(soup):
        # Chunks arrive either singly or as lists; treat both uniformly.
        if not isinstance(p, list):
            p = [p]
        for pp in p:
            if pp.is_header:
                # Skip header chunks.
                continue
            elif pp.is_refs:
                # Collect parsed references into the metadata dict.
                md['refs'] = [self.get_ref(line[0]) for line in pp.lines]
            else:
                ex = None  # example id when a unique match is found
                if pp.is_example:
                    # Example paragraphs get their own styled blockquote.
                    container = d.new_tag(
                        'blockquote',
                        **{
                            'class': 'example',
                            'style': 'font-size:100%;padding-left:1.8em;margin-left:0.3em'
                        })
                    #body.append(Tag(name='hr'))
                else:
                    container = body
                for e, line, t in pp.lines:
                    body.append(e)
                    if pp.is_example:
                        # "(1)"-style lines get a hanging indent.
                        if re.match('\([0-9]+\)', line):
                            e.attrs['style'] = 'text-indent:-2em'
                        # Remove enclosing curly quotes from translations
                        # before matching against known examples.
                        equo = "’".decode('utf8')
                        if line.startswith(
                                "‘".decode('utf8')) and equo in line:
                            line = equo.join(
                                line[1:].split(equo)[:-1]).strip()
                        # NOTE(review): nesting reconstructed from flattened
                        # source — lookup assumed per-line; verify upstream.
                        examples = self.examples.get(slug(line))
                        if examples:
                            if len(examples) > 1:
                                # More than one candidate: leave unlinked.
                                #print '~~~', line
                                multiple += 1
                            else:
                                ex = examples.values()[0]
                                #print '+++'
                                linked += 1
                        else:
                            #print '---', line
                            notlinked += 1
                    container.append(e)
                if pp.is_example:
                    if ex:
                        # Give the example an anchor plus a sentence link.
                        container.attrs['id'] = 'ex-' + ex
                        container.append(
                            new_tag(
                                d,
                                'small',
                                new_tag(d, 'a', 'See example ' + ex,
                                        href='/sentences/' + ex)))
                    body.append(container)
    #print 'examples:', linked, 'linked,', notlinked, 'not linked,', multiple, 'multiple choices'
    # Unwrap presentational <font> tags, keeping their contents.
    for e in body.find_all('font'):
        e.unwrap()
    return d
def popover(number, note):
    """Render note markup as a Bootstrap popover trigger anchor string."""
    # Round-tripping through BeautifulSoup repairs broken markup,
    # e.g. incomplete span tags, before the note is embedded.
    fixed = BeautifulSoup(normalize_whitespace(note)).find('body')
    fixed.name = 'div'
    attrs = {
        'style': 'text-decoration: underline; cursor: pointer;',
        'class': 'popover-note',
        'data-original-title': 'Note %s' % number,
        'data-content': unicode(fixed),
    }
    marker = new_tag(soup, 'sup', number)
    return unicode(new_tag(soup, 'a', marker, **attrs))