def _extract_fields(self, html_str: str) -> List[str]:
    """Extract the front and back card fields from a dictionary page.

    The back field is the ``div`` matched by ``class="di-body"`` with
    navigation, sharing, script and advertising markup stripped out; the
    front field is the ``div`` matched by ``class="di-title"`` inside it.

    Args:
        html_str: HTML of the page to extract from.

    Returns:
        A two-element list ``[front, back]``.

    Raises:
        ExtractError: If any step of the extraction fails. The original
            exception is passed as the second argument and is also set as
            the explicit cause.
    """
    def remove_tag(h):
        # Replace the matched element with group 2 of ``parse_tag``
        # (presumably the element's inner content -- confirm against the
        # ``parse_tag`` pattern).
        return parse_tag.sub(r'\g<2>', h)

    try:
        back = htmls.find(html_str, 'div', 'class="di-body"')
        # The title has to be captured before it is removed from the body.
        front = htmls.find(back, 'div', 'class="di-title"')

        # remove titles
        back = htmls.removeall(back, 'div', 'class="di-title"')

        # Audio spans (class="daud") are intentionally kept: to support
        # online audio, site-relative media sources are prefixed with
        # URL_ROOT. A plain string replace is used because the pattern is
        # literal and URL_ROOT must not be read as a regex template.
        back = back.replace(
            'src="/zhs/media', 'src="{}zhs/media'.format(URL_ROOT))

        # remove phrases and idioms
        back = htmls.removeall(back, 'div', 'class="xref')

        # seems useless
        back = htmls.removeall(back, 'div', 'class="cid"')
        back = htmls.removeall(back, 'div', 'class="dwl hax"')

        # remove links (the anchor elements are replaced via remove_tag)
        back = htmls.sub(back, remove_tag, 'a', 'class="query"')
        back = htmls.sub(back, remove_tag, 'a', 'href=')

        # remove share
        back = htmls.removeall(back, 'div', 'class="hfr lpb-2"')

        # remove more examples
        back = htmls.removeall(back, 'div', 'class="daccord"')

        # remove js
        back = htmls.removeall(back, 'script')

        # remove underlines
        back = htmls.sub(back, remove_tag, 'span', 'class="x-h dx-h"')

        # remove ads
        back = htmls.removeall(back, 'div', 'ad_contentslot')
        back = htmls.removeall(back, 'div', 'class="bb hax"')

        # collapse long cards
        if len(back) > THRESHOLD_COLLAPSE:
            back = self._collapse(back)

        return [front, back]
    except Exception as e:
        # Chain explicitly so the traceback reports the root cause as the
        # direct cause of the ExtractError.
        raise ExtractError('can\'t extract fields', e) from e
def _extract_fields(self, html_str: str) -> List[str]:
    """Extract the front and back card fields from a dictionary page.

    The back field is the ``div`` matched by ``class="di-body"`` with
    newlines, audio, amp-access anchors, sharing, script and advertising
    markup stripped out; the front field is the ``div`` matched by
    ``class="di-title"`` inside it.

    Args:
        html_str: HTML of the page to extract from.

    Returns:
        A two-element list ``[front, back]``.

    Raises:
        ExtractError: If any step of the extraction fails. The original
            exception is passed as the second argument and is also set as
            the explicit cause.
    """
    def remove_tag(h):
        # Replace the matched element with group 2 of ``parse_tag``
        # (presumably the element's inner content -- confirm against the
        # ``parse_tag`` pattern).
        return parse_tag.sub(r'\g<2>', h)

    try:
        # Newlines are dropped from the body before any further matching.
        back = htmls.find(html_str, 'div', 'class="di-body"').replace('\n', '')
        # The title has to be captured before it is removed from the body.
        front = htmls.find(back, 'div', 'class="di-title"')

        # remove titles
        back = htmls.removeall(back, 'div', 'class="di-title"')

        # remove audios
        back = htmls.removeall(back, 'span', 'class="daud"')

        # remove amp-access
        back = htmls.removeall(back, 'a', 'amp-access=')

        # remove links (the anchor elements are replaced via remove_tag)
        back = htmls.sub(back, remove_tag, 'a', 'class="query"')
        back = htmls.sub(back, remove_tag, 'a', 'href=')

        # remove share
        back = htmls.removeall(back, 'div', 'class="hfr lpb-2"')

        # remove more examples
        back = htmls.removeall(back, 'div', 'class="daccord"')

        # remove js
        back = htmls.removeall(back, 'script')

        # remove underlines
        back = htmls.sub(back, remove_tag, 'span', 'class="x-h dx-h"')

        # remove ads
        back = htmls.removeall(back, 'div', 'ad_contentslot')
        back = htmls.removeall(back, 'div', 'class="bb hax"')

        # collapse long cards
        if len(back) > THRESHOLD_COLLAPSE:
            back = self._collapse(back)

        return [front, back]
    except Exception as e:
        # Chain explicitly so the traceback reports the root cause as the
        # direct cause of the ExtractError.
        raise ExtractError('can\'t extract fields', e) from e
def collapse1(h):
    """Format ``HTML_COLLAPSE1`` with a header taken from *h* and *h* itself.

    The header is the ``span`` matched by ``'trans dtrans dtrans-se'``
    inside the ``div`` matched by ``'def-body ddef_b'`` within *h*.
    """
    definition_body = htmls.find(h, 'div', 'def-body ddef_b')
    header = htmls.find(definition_body, 'span', 'trans dtrans dtrans-se')
    return HTML_COLLAPSE1.format(header, h)
def test_find(self):
    """``htmls.find`` given only a tag name returns the expected ``<a>`` element of ``self.HTML``."""
    expected = '<a href="http://example.org/">example.org</a>'
    actual = htmls.find(self.HTML, 'a')
    self.assertEqual(expected, actual)