示例#1
0
    def test_new_tag(self):
        """Check how ``new_tag`` renders attributes, text and nested tags."""
        from souplib import new_tag

        # Keyword arguments end up as attributes of the new tag.
        self.assertEqual(text_type(new_tag(self.soup, 'a', t='s')), '<a t="s"></a>')
        # A positional string ends up as the text content of the new tag.
        self.assertEqual(text_type(new_tag(self.soup, 'a', 't')), '<a>t</a>')
        # A positional tag ends up as a child of the new tag.
        self.assertEqual(
            text_type(new_tag(self.soup, 'a', new_tag(self.soup, 'b'))),
            '<a><b></b></a>')
示例#2
0
 def refactor(self, soup, md):
     d = BeautifulSoup('<body></body>')
     body = d.find('body')
     linked = 0
     notlinked = 0
     multiple = 0
     for p in self._chunks(soup):
         if not isinstance(p, list):
             p = [p]
         for pp in p:
             if pp.is_header:
                 continue
             elif pp.is_refs:
                 md['refs'] = [self.get_ref(line[0]) for line in pp.lines]
             else:
                 ex = None
                 if pp.is_example:
                     container = d.new_tag(
                         'blockquote',
                         **{
                             'class': 'example',
                             'style': 'font-size:100%;padding-left:1.8em;margin-left:0.3em'})
                     #body.append(Tag(name='hr'))
                 else:
                     container = body
                 for e, line, t in pp.lines:
                     body.append(e)
                     if pp.is_example:
                         if re.match('\([0-9]+\)', line):
                             e.attrs['style'] = 'text-indent:-2em'
                         equo = "’".decode('utf8')
                         if line.startswith("‘".decode('utf8')) and equo in line:
                             line = equo.join(line[1:].split(equo)[:-1]).strip()
                             examples = self.examples.get(slug(line))
                             if examples:
                                 if len(examples) > 1:
                                     #print '~~~', line
                                     multiple += 1
                                 else:
                                     ex = examples.values()[0]
                                     #print '+++'
                                     linked += 1
                             else:
                                 #print '---', line
                                 notlinked += 1
                     container.append(e)
                 if pp.is_example:
                     if ex:
                         container.attrs['id'] = 'ex-' + ex
                         container.append(new_tag(d, 'small', new_tag(
                             d, 'a', 'See example ' + ex, href='/sentences/' + ex)))
                     body.append(container)
     #print 'examples:', linked, 'linked,', notlinked, 'not linked,', multiple, 'multiple choices'
     for e in body.find_all('font'):
         e.unwrap()
     return d
示例#3
0
    def _paragraphs(self, soup):
        lines = []
        refs = False

        for e in soup.find_all(['p', 'table']):
            t = text(e)

            if e.name == 'table':
                if re.match('[\-\s]+excl\s+', t) \
                        or re.match('[\-\s]*1\.[^0-9]+[0-9]+\s+2\.\s+', t):
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break

            if e.name == 'p':
                if re.match('1\.\s+(.+?)\s+[0-9]+$', t):
                    ex = []
                    for p in next_siblings(e):
                        tt = text(p)
                        if p.name != 'p' or not re.match(
                                '[0-9]\.\s+(.+?)\s+[0-9]+$', tt):
                            break
                        ex.append(p)
                    if ex:
                        for ee in ex:
                            ee.extract()
                        e.replace_with(new_tag(soup, 'p', 'value-table'))
                        break

        for e, t in tag_and_text(soup.find_all(['p', 'table', 'ol', 'ul'])):
            if e.parent.name in ['li', 'td']:
                continue

            #print t
            br = t == self.BR
            if t in ['References', 'Reference']:
                refs = True
                t = ''
            elif not lines and re.match('[0-9]+\.\s+[A-Za-z]+(\s+[A-Za-z]+)*$',
                                        t):
                e.name = 'h3'
            elif not lines and re.match('[0-9]+\.[0-9]+\.\s+[A-Z]', t):
                e.name = 'h4'
            elif t.endswith('and the APiCS Consortium'):
                continue

            if br and not refs:
                if lines:
                    yield Paragraph(lines)
                    lines = []
            if t and t != self.BR:
                lines.append((e, t, e.name))

        if lines:
            yield Paragraph(lines, refs=refs)
示例#4
0
    def _paragraphs(self, soup):
        lines = []
        refs = False

        for e in soup.find_all(['p', 'table']):
            t = text(e)

            if e.name == 'table':
                if re.match('[\-\s]+excl\s+', t) \
                        or re.match('[\-\s]*1\.[^0-9]+[0-9]+\s+2\.\s+', t):
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break

            if e.name == 'p':
                if re.match('1\.\s+(.+?)\s+[0-9]+$', t):
                    ex = []
                    for p in next_siblings(e):
                        tt = text(p)
                        if p.name != 'p' or not re.match('[0-9]\.\s+(.+?)\s+[0-9]+$', tt):
                            break
                        ex.append(p)
                    if ex:
                        for ee in ex:
                            ee.extract()
                        e.replace_with(new_tag(soup, 'p', 'value-table'))
                        break

        for e, t in tag_and_text(soup.find_all(['p', 'table', 'ol', 'ul'])):
            if e.parent.name in ['li', 'td']:
                continue

            #print t
            br = t == self.BR
            if t in ['References', 'Reference']:
                refs = True
                t = ''
            elif not lines and re.match('[0-9]+\.\s+[A-Za-z]+(\s+[A-Za-z]+)*$', t):
                e.name = 'h3'
            elif not lines and re.match('[0-9]+\.[0-9]+\.\s+[A-Z]', t):
                e.name = 'h4'
            elif t.endswith('and the APiCS Consortium'):
                continue

            if br and not refs:
                if lines:
                    yield Paragraph(lines)
                    lines = []
            if t and t != self.BR:
                lines.append((e, t, e.name))

        if lines:
            yield Paragraph(lines, refs=refs)
示例#5
0
    def __call__(self, outdir):
        """
        Run the parser workflow and write its results to *outdir*.

        The workflow consists of
        - preprocess
        - refactor
        - postprocess
        and writes the results, an html, a css and a json file, to disk.
        All three files are named after ``self.id``.

        :param outdir: output directory; must provide ``joinpath``.
        """
        # Only let cssutils report errors.
        cssutils_logger = logging.getLogger('CSSUTILS')
        cssutils_logger.setLevel(logging.ERROR)
        print(self.fname.namebase.encode('utf8'))

        with open(self.fname, encoding='utf8') as fp:
            c = fp.read()
        soup = BeautifulSoup(self.preprocess(self._preprocess(c)))

        # extract css from the head section of the HTML doc; a document
        # without a head section simply contributes no rules:
        css = cssutils.parseString('\n')
        head = soup.find('head')
        styles = head.find_all('style') if head is not None else []
        for style in styles:
            for rule in self.cssrules(style):
                css.add(rule)

        md = dict(outline=[], refs=[], authors=[])
        soup = self.refactor(soup, md)

        # enhance section headings:
        for section, t in tag_and_text(soup.find_all('h3')):
            # Cut the heading text at the first '[Note' marker.
            t = t.split('[Note')[0]
            id_ = 'section-%s' % slug(t)
            md['outline'].append((t, id_))
            section.attrs['id'] = id_
            # Append a "go to top" link and a permalink to each heading.
            for s, attrs in [
                (u'\u21eb', {'href': '#top', 'title': 'go to top of the page', 'style': 'vertical-align: bottom'}),
                ('¶', {'class': 'headerlink', 'href': '#' + id_, 'title': 'Permalink to this section'}),
            ]:
                append(section, soup.new_string('\n'), new_tag(soup, 'a', s, **attrs))

        body = self.insert_links(unicode(soup.find('body')), md)

        # write output files:
        with open(outdir.joinpath('%s.html' % self.id), 'w', encoding='utf8') as fp:
            fp.write(self.wrap(self.postprocess(body)))

        with open(outdir.joinpath('%s.css' % self.id), 'wb') as fp:
            fp.write(self.csstext(css))

        md['authors'] = list(self.yield_valid_authors(md['authors']))
        jsondump(md, outdir.joinpath('%s.json' % self.id), indent=4)
示例#6
0
 def refactor(self, soup, md):
     d = BeautifulSoup('<body></body>')
     body = d.find('body')
     linked = 0
     notlinked = 0
     multiple = 0
     for p in self._chunks(soup):
         if not isinstance(p, list):
             p = [p]
         for pp in p:
             if pp.is_header:
                 continue
             elif pp.is_refs:
                 md['refs'] = [self.get_ref(line[0]) for line in pp.lines]
             else:
                 ex = None
                 if pp.is_example:
                     container = d.new_tag(
                         'blockquote', **{
                             'class':
                             'example',
                             'style':
                             'font-size:100%;padding-left:1.8em;margin-left:0.3em'
                         })
                     #body.append(Tag(name='hr'))
                 else:
                     container = body
                 for e, line, t in pp.lines:
                     body.append(e)
                     if pp.is_example:
                         if re.match('\([0-9]+\)', line):
                             e.attrs['style'] = 'text-indent:-2em'
                         equo = "’".decode('utf8')
                         if line.startswith(
                                 "‘".decode('utf8')) and equo in line:
                             line = equo.join(
                                 line[1:].split(equo)[:-1]).strip()
                             examples = self.examples.get(slug(line))
                             if examples:
                                 if len(examples) > 1:
                                     #print '~~~', line
                                     multiple += 1
                                 else:
                                     ex = examples.values()[0]
                                     #print '+++'
                                     linked += 1
                             else:
                                 #print '---', line
                                 notlinked += 1
                     container.append(e)
                 if pp.is_example:
                     if ex:
                         container.attrs['id'] = 'ex-' + ex
                         container.append(
                             new_tag(
                                 d, 'small',
                                 new_tag(d,
                                         'a',
                                         'See example ' + ex,
                                         href='/sentences/' + ex)))
                     body.append(container)
     #print 'examples:', linked, 'linked,', notlinked, 'not linked,', multiple, 'multiple choices'
     for e in body.find_all('font'):
         e.unwrap()
     return d
示例#7
-1
 def popover(number, note):
     # we use BeautifulSoup to fix broken markup, e.g. incomplete span tags.
     note = BeautifulSoup(normalize_whitespace(note)).find('body')
     note.name = 'div'
     a = new_tag(
         soup,
         'a',
         new_tag(soup, 'sup', number),
         **{
             'style': 'text-decoration: underline; cursor: pointer;',
             'class': 'popover-note',
             'data-original-title': 'Note %s' % number,
             'data-content': unicode(note),
             })
     return unicode(a)