def _paragraphs(self, soup):
    """Chunk the document body into ``Paragraph`` objects.

    First pass: locate the "value table" -- either a real ``table`` tag
    recognized by its header text, or a run of consecutive numbered
    paragraphs of the form ``"<n>. <label> <count>"`` -- and replace it
    with a single ``<p>value-table</p>`` placeholder.

    Second pass: walk the top-level block elements, flag the references
    section, re-tag numbered (sub)section headings as h3/h4, and yield a
    ``Paragraph`` for each run of lines delimited by ``self.BR`` markers.

    :param soup: BeautifulSoup tree of the document (modified in place).
    :return: generator of ``Paragraph`` instances; the last one carries \
    ``refs=True`` if a references heading was seen.
    """
    lines = []
    refs = False

    # First pass: replace the value table with a placeholder paragraph.
    for e in soup.find_all(['p', 'table']):
        t = text(e)
        if e.name == 'table':
            # Recognize a value table by its header row text.
            if re.match(r'[\-\s]+excl\s+', t) \
                    or re.match(r'[\-\s]*1\.[^0-9]+[0-9]+\s+2\.\s+', t):
                e.replace_with(new_tag(soup, 'p', 'value-table'))
                break
        if e.name == 'p':
            # A value table may also be encoded as consecutive paragraphs
            # "1. ... <count>", "2. ... <count>", ...
            if re.match(r'1\.\s+(.+?)\s+[0-9]+$', t):
                ex = []
                for p in next_siblings(e):
                    tt = text(p)
                    if p.name != 'p' or not re.match(
                            r'[0-9]\.\s+(.+?)\s+[0-9]+$', tt):
                        break
                    ex.append(p)
                if ex:
                    for ee in ex:
                        ee.extract()
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break

    # Second pass: group top-level block elements into paragraphs.
    for e, t in tag_and_text(soup.find_all(['p', 'table', 'ol', 'ul'])):
        if e.parent.name in ['li', 'td']:
            continue  # skip nested elements; only top-level blocks count
        br = t == self.BR
        if t in ['References', 'Reference']:
            refs = True
            t = ''
        elif not lines and re.match(r'[0-9]+\.\s+[A-Za-z]+(\s+[A-Za-z]+)*$', t):
            e.name = 'h3'  # numbered section heading
        elif not lines and re.match(r'[0-9]+\.[0-9]+\.\s+[A-Z]', t):
            e.name = 'h4'  # numbered subsection heading
        elif t.endswith('and the APiCS Consortium'):
            continue  # drop boilerplate attribution line
        if br and not refs:
            # A BR marker delimits paragraphs (except inside references).
            if lines:
                yield Paragraph(lines)
            lines = []
        if t and t != self.BR:
            lines.append((e, t, e.name))
    if lines:
        yield Paragraph(lines, refs=refs)
def _paragraphs(self, soup):
    """Chunk the document body into ``Paragraph`` objects.

    First pass: locate the "value table" -- either a real ``table`` tag
    recognized by its header text, or a run of consecutive numbered
    paragraphs of the form ``"<n>. <label> <count>"`` -- and replace it
    with a single ``<p>value-table</p>`` placeholder.

    Second pass: walk the top-level block elements, flag the references
    section, re-tag numbered (sub)section headings as h3/h4, and yield a
    ``Paragraph`` for each run of lines delimited by ``self.BR`` markers.

    :param soup: BeautifulSoup tree of the document (modified in place).
    :return: generator of ``Paragraph`` instances; the last one carries \
    ``refs=True`` if a references heading was seen.
    """
    lines = []
    refs = False

    # First pass: replace the value table with a placeholder paragraph.
    for e in soup.find_all(['p', 'table']):
        t = text(e)
        if e.name == 'table':
            # Recognize a value table by its header row text.
            if re.match(r'[\-\s]+excl\s+', t) \
                    or re.match(r'[\-\s]*1\.[^0-9]+[0-9]+\s+2\.\s+', t):
                e.replace_with(new_tag(soup, 'p', 'value-table'))
                break
        if e.name == 'p':
            # A value table may also be encoded as consecutive paragraphs
            # "1. ... <count>", "2. ... <count>", ...
            if re.match(r'1\.\s+(.+?)\s+[0-9]+$', t):
                ex = []
                for p in next_siblings(e):
                    tt = text(p)
                    if p.name != 'p' or not re.match(
                            r'[0-9]\.\s+(.+?)\s+[0-9]+$', tt):
                        break
                    ex.append(p)
                if ex:
                    for ee in ex:
                        ee.extract()
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break

    # Second pass: group top-level block elements into paragraphs.
    for e, t in tag_and_text(soup.find_all(['p', 'table', 'ol', 'ul'])):
        if e.parent.name in ['li', 'td']:
            continue  # skip nested elements; only top-level blocks count
        br = t == self.BR
        if t in ['References', 'Reference']:
            refs = True
            t = ''
        elif not lines and re.match(r'[0-9]+\.\s+[A-Za-z]+(\s+[A-Za-z]+)*$', t):
            e.name = 'h3'  # numbered section heading
        elif not lines and re.match(r'[0-9]+\.[0-9]+\.\s+[A-Z]', t):
            e.name = 'h4'  # numbered subsection heading
        elif t.endswith('and the APiCS Consortium'):
            continue  # drop boilerplate attribution line
        if br and not refs:
            # A BR marker delimits paragraphs (except inside references).
            if lines:
                yield Paragraph(lines)
            lines = []
        if t and t != self.BR:
            lines.append((e, t, e.name))
    if lines:
        yield Paragraph(lines, refs=refs)
def __call__(self, outdir):
    """
    Runs a parser workflow consisting of

    - preprocess
    - refactor
    - postprocess

    and writes the results -- an html, a css and a json file -- to disk
    in ``outdir``, named after ``self.id``.

    :param outdir: output directory; presumably a `path`-like object \
    providing ``joinpath`` -- confirm against caller.
    """
    # cssutils logs warnings for rules it cannot parse; silence it.
    cssutils_logger = logging.getLogger('CSSUTILS')
    cssutils_logger.setLevel(logging.ERROR)
    # Progress output: name of the file currently being processed.
    print(self.fname.namebase.encode('utf8'))
    # NOTE(review): open() with an encoding kwarg suggests io.open or a
    # project wrapper is in scope -- confirm against the file's imports.
    with open(self.fname, encoding='utf8') as fp:
        c = fp.read()
    soup = BeautifulSoup(self.preprocess(self._preprocess(c)))

    # extract css from the head section of the HTML doc:
    css = cssutils.parseString('\n')
    for style in soup.find('head').find_all('style'):
        for rule in self.cssrules(style):
            css.add(rule)

    md = dict(outline=[], refs=[], authors=[])
    soup = self.refactor(soup, md)

    # enhance section headings: collect an outline, assign ids, and add
    # "go to top" / permalink anchors to each h3.
    for section, t in tag_and_text(soup.find_all('h3')):
        t = t.split('[Note')[0]  # strip trailing note markers from heading text
        id_ = 'section-%s' % slug(t)
        md['outline'].append((t, id_))
        section.attrs['id'] = id_
        for s, attrs in [
            (u'\u21eb',  # upward arrow: link back to the top of the page
             {'href': '#top', 'title': 'go to top of the page',
              'style': 'vertical-align: bottom'}),
            ('¶',  # pilcrow: permalink to this section
             {'class': 'headerlink', 'href': '#' + id_,
              'title': 'Permalink to this section'}),
        ]:
            append(section, soup.new_string('\n'), new_tag(soup, 'a', s, **attrs))

    body = self.insert_links(unicode(soup.find('body')), md)

    # write output files:
    with open(outdir.joinpath('%s.html' % self.id), 'w', encoding='utf8') as fp:
        fp.write(self.wrap(self.postprocess(body)))
    with open(outdir.joinpath('%s.css' % self.id), 'wb') as fp:
        fp.write(self.csstext(css))
    md['authors'] = list(self.yield_valid_authors(md['authors']))
    jsondump(md, outdir.joinpath('%s.json' % self.id), indent=4)
def refactor(self, soup, md):
    """Clean up the HTML tree and extract document metadata.

    Normalizes ``style`` attributes, removes empty block elements,
    detects headings (by class and by text) and re-levels them, parses
    title and authors from the leading top-level elements, and moves the
    references section out of the tree into ``md``.

    :param soup: BeautifulSoup tree (modified in place).
    :param md: metadata dict; fills ``title``, ``authors``, ``refs`` and \
    optionally ``refs_comments``.
    :return: the modified ``soup``.
    :raises ValueError: for a references paragraph too short to be a reference.
    """
    # clean attributes:
    def update_style(current):
        """Drop MSO-specific and Junicode CSS rules, normalize indent rules."""
        style = []
        for rule in (current or '').split(';'):
            rule = rule.strip()
            # tab-stops:14.2pt text-indent:36.0pt
            # Word encodes paragraph spacing this way; map to a top margin.
            if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                rule = 'margin-top:0.4em'
            if normalize_whitespace(rule, repl='') in [
                'font-family:Junicode', 'font-family:JunicodeRegular',
            ]:
                continue
            if rule and not rule.startswith('mso-'):
                style.append(rule)
        return ';'.join(style)

    for e in descendants(soup.find('body')):
        update_attr(e, 'style', update_style)
        update_attr(e, 'lang', None)

    # Remove block elements without any text content.
    for e, t in tag_and_text(
            descendants(soup.find('body'), include=['p', 'h1', 'h2']),
            non_empty=False):
        if not t:
            e.extract()

    for p in soup.find_all('p'):
        if p.attrs.get('class') == ['Zitat']:
            p.wrap(soup.new_tag('blockquote'))
            continue
        if not p.parent.name == 'td':
            # need to detect headings by text, too!
            t = text(p)
            match = self.heading_pattern.match(t.lower())
            if match:
                p.name = 'h2' if match.group('sub') else 'h1'

    # re-classify section headings: h1 -> h2, then h2 -> h3.
    for i in range(1, 3):
        for p in soup.find_all('h%s' % i):
            p.name = 'h%s' % (i + 1,)

    # Unwrap Word's OLE_LINK bookmark anchors.
    for p in soup.find_all('a'):
        if p.attrs.get('name', '').startswith('OLE_LINK'):
            p.unwrap()

    top_level_elements = children(soup.find('div'))[:4]
    if '.' in self.id:
        # Such documents must start with title <p>, authors <p>, a table
        # and the first h3 heading.
        try:
            assert [e.name for e in top_level_elements] == ['p', 'p', 'table', 'h3']
        except AssertionError:
            # Debugging aid: show the unexpected leading elements.
            print(top_level_elements[0])
            print(top_level_elements[1])
            print(top_level_elements[3])
            raise
        md['title'] = text(top_level_elements[0])
        md['authors'] = re.split(r',|&| and ', text(top_level_elements[1]))
        remove(*top_level_elements[:3])

    refs = soup.find(lambda e: e.name == 'h3' and text(e).startswith('References'))
    if refs:
        ex = []
        category = None
        for e, t in tag_and_text(next_siblings(refs)):
            if e.name == 'p':
                if t in REFERENCE_CATEGORIES:
                    category = t
                elif len(t.split()) < 3:
                    # Too short to be a reference entry.
                    raise ValueError(t)
                else:
                    if 'comment' in e.attrs.get('class', []):
                        md.setdefault('refs_comments', []).append(t)
                    else:
                        if not YEAR.search(t):
                            print(t)  # flag references lacking a year
                        md['refs'].append(self.get_ref(e, category=category))
                ex.append(e)
            elif e.name in ['h3', 'h4']:
                category = t
                ex.append(e)
        # Remove the whole references section from the tree.
        for e in ex + [refs]:
            e.extract()

    for t in soup.find_all('table'):
        t.wrap(soup.new_tag('div', **{'class': 'table'}))
    return soup
def test_tag_and_text(self):
    """Every pair yielded by tag_and_text must carry non-empty text."""
    from souplib import tag_and_text

    assert all(txt for _, txt in tag_and_text(self.soup.a.descendants))
def refactor(self, soup, md):
    """Clean up the HTML tree and extract document metadata.

    Normalizes ``style`` attributes, removes empty block elements,
    detects headings (by class and by text) and re-levels them, parses
    title and authors from the leading top-level elements, and moves the
    references section out of the tree into ``md``.

    :param soup: BeautifulSoup tree (modified in place).
    :param md: metadata dict; fills ``title``, ``authors``, ``refs`` and \
    optionally ``refs_comments``.
    :return: the modified ``soup``.
    :raises ValueError: for a references paragraph too short to be a reference.
    """
    # clean attributes:
    def update_style(current):
        """Drop MSO-specific and Junicode CSS rules, normalize indent rules."""
        style = []
        for rule in (current or '').split(';'):
            rule = rule.strip()
            # tab-stops:14.2pt text-indent:36.0pt
            # Word encodes paragraph spacing this way; map to a top margin.
            if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                rule = 'margin-top:0.4em'
            if normalize_whitespace(rule, repl='') in [
                'font-family:Junicode', 'font-family:JunicodeRegular',
            ]:
                continue
            if rule and not rule.startswith('mso-'):
                style.append(rule)
        return ';'.join(style)

    for e in descendants(soup.find('body')):
        update_attr(e, 'style', update_style)
        update_attr(e, 'lang', None)

    # Remove block elements without any text content.
    for e, t in tag_and_text(
            descendants(soup.find('body'), include=['p', 'h1', 'h2']),
            non_empty=False):
        if not t:
            e.extract()

    for p in soup.find_all('p'):
        if p.attrs.get('class') == ['Zitat']:
            p.wrap(soup.new_tag('blockquote'))
            continue
        if not p.parent.name == 'td':
            # need to detect headings by text, too!
            t = text(p)
            match = self.heading_pattern.match(t.lower())
            if match:
                p.name = 'h2' if match.group('sub') else 'h1'

    # re-classify section headings: h1 -> h2, then h2 -> h3.
    for i in range(1, 3):
        for p in soup.find_all('h%s' % i):
            p.name = 'h%s' % (i + 1,)

    # Unwrap Word's OLE_LINK bookmark anchors.
    for p in soup.find_all('a'):
        if p.attrs.get('name', '').startswith('OLE_LINK'):
            p.unwrap()

    top_level_elements = children(soup.find('div'))[:4]
    if '.' in self.id:
        # Such documents must start with title <p>, authors <p>, a table
        # and the first h3 heading.
        try:
            assert [e.name for e in top_level_elements] == ['p', 'p', 'table', 'h3']
        except AssertionError:
            # Debugging aid: show the unexpected leading elements.
            print(top_level_elements[0])
            print(top_level_elements[1])
            print(top_level_elements[3])
            raise
        md['title'] = text(top_level_elements[0])
        md['authors'] = re.split(r',|&| and ', text(top_level_elements[1]))
        remove(*top_level_elements[:3])

    refs = soup.find(lambda e: e.name == 'h3' and text(e).startswith('References'))
    if refs:
        ex = []
        category = None
        for e, t in tag_and_text(next_siblings(refs)):
            if e.name == 'p':
                if t in REFERENCE_CATEGORIES:
                    category = t
                elif len(t.split()) < 3:
                    # Too short to be a reference entry.
                    raise ValueError(t)
                else:
                    if 'comment' in e.attrs.get('class', []):
                        md.setdefault('refs_comments', []).append(t)
                    else:
                        if not YEAR.search(t):
                            print(t)  # flag references lacking a year
                        md['refs'].append(self.get_ref(e, category=category))
                ex.append(e)
            elif e.name in ['h3', 'h4']:
                category = t
                ex.append(e)
        # Remove the whole references section from the tree.
        for e in ex + [refs]:
            e.extract()

    for t in soup.find_all('table'):
        t.wrap(soup.new_tag('div', **{'class': 'table'}))
    return soup