def test_re_field_get_nothing_with_no_default():
    """Check that a non-matching pattern without a default raises.

    The field is built with only ``re_select``; extraction from ``HTML`` is
    expected to raise ``NothingMatchedError``. A normal return from
    ``extract`` fails the test with a bare ``AssertionError``.
    """
    unmatched_field = RegexField(re_select='nothing to match.')
    try:
        unmatched_field.extract(html=HTML)
    except NothingMatchedError:
        # Expected outcome: the pattern matched nothing and no default was given.
        pass
    else:
        # extract() returned normally, which this test treats as a failure.
        raise AssertionError
def test_re_field_in_dict_format_with_many():
    """Check named-group extraction over every link when ``many=True``.

    Expects five results from ``HTML``; the first and fifth must expose the
    expected values under the ``'href'`` and ``'text'`` keys.
    """
    link_pattern = '<a class="test_link" href="(?P<href>.*?)">(?P<text>.*?)</a>'
    link_field = RegexField(re_select=link_pattern, many=True)
    links = link_field.extract(html=HTML)

    assert len(links) == 5

    # Only the first and the last of the five links are spot-checked.
    expected_by_index = {
        0: ('https://github.com/howie6879/', 'hello1 github.'),
        4: ('https://github.com/howie6879/', 'hello5 github.'),
    }
    for index, (expected_href, expected_text) in expected_by_index.items():
        assert links[index]['href'] == expected_href
        assert links[index]['text'] == expected_text
def test_re_field_with_many():
    """Check positional-group extraction over every link when ``many=True``.

    Expects five results from ``HTML``; the first and fifth must each unpack
    into an ``(href, text)`` pair with the expected values.
    """
    link_pattern = '<a class="test_link" href="(.*?)">(.*?)</a>'
    links = RegexField(re_select=link_pattern, many=True).extract(html=HTML)

    assert len(links) == 5

    # Each result carries the two unnamed groups in pattern order.
    first_href, first_text = links[0]
    last_href, last_text = links[4]

    shared_href = 'https://github.com/howie6879/'
    assert first_href == shared_href
    assert first_text == 'hello1 github.'
    assert last_href == shared_href
    assert last_text == 'hello5 github.'
def test_re_field_with_html_element():
    """Check named-group extraction when ``html`` is ``html_etree``.

    ``html_etree`` is defined outside this block (presumably a parsed
    element rather than a raw string -- confirm against the module setup).
    The result must expose the heading link's ``"href"`` and ``"text"``.
    """
    heading_pattern = '<h1><a href="(?P<href>.*?)">(?P<text>.*?)</a></h1>'
    heading_field = RegexField(re_select=heading_pattern)
    extracted = heading_field.extract(html=html_etree)

    assert extracted["href"] == "https://github.com"
    assert extracted["text"] == "Github"
def test_re_field_get_nothing_with_no_default():
    """Check that a non-matching pattern without a default raises.

    The field is built with only ``re_select``; extraction from ``HTML`` is
    expected to raise ``NothingMatchedError``.

    Raises:
        AssertionError: If ``extract`` returns normally instead of raising
            ``NothingMatchedError``.
    """
    field = RegexField(re_select="nothing to match.")
    try:
        field.extract(html=HTML)
    except NothingMatchedError:
        # Expected outcome: the pattern matched nothing and no default was given.
        pass
    else:
        # Without this branch the test would pass vacuously whenever
        # extract() returned a value, since no assertion would ever run.
        raise AssertionError(
            "RegexField.extract did not raise NothingMatchedError"
        )
def test_re_field_with_default():
    """Check that the configured default is returned when nothing matches."""
    fallback = "default value"
    fallback_field = RegexField(re_select="nothing to match.", default=fallback)

    assert fallback_field.extract(html=HTML) == fallback
def test_re_field_with_many_groups():
    """Check that two unnamed groups come back as an unpackable pair.

    The heading link in ``HTML`` must yield its URL first and its text
    second.
    """
    heading_pattern = '<h1><a href="(.*?)">(.*?)</a></h1>'
    heading_field = RegexField(re_select=heading_pattern)

    # Unpacking also verifies that exactly two values are returned.
    link_url, link_text = heading_field.extract(html=HTML)

    assert link_url == "https://github.com"
    assert link_text == "Github"
def test_re_field_with_no_group():
    """Check that a pattern without groups yields the whole matched text."""
    title_field = RegexField(re_select="<title>.*?</title>")
    whole_match = title_field.extract(html=HTML)

    # No capture group, so the surrounding <title> tags are part of the result.
    assert whole_match == "<title>ruia</title>"
def test_re_field_with_one_group():
    """Check that a single capture group yields just the captured text."""
    title_pattern = "<title>(.*?)</title>"
    title_text = RegexField(re_select=title_pattern).extract(html=HTML)

    # Only the group's content is expected, without the <title> tags.
    assert title_text == "ruia"
def test_re_field_with_html_element():
    """Check named-group extraction when ``html`` is ``html_etree``.

    ``html_etree`` is defined outside this block (presumably a parsed
    element rather than a raw string -- confirm against the module setup).
    The result must expose the heading link's ``'href'`` and ``'text'``.
    """
    heading_pattern = '<h1><a href="(?P<href>.*?)">(?P<text>.*?)</a></h1>'
    extracted = RegexField(re_select=heading_pattern).extract(html=html_etree)

    # Checked in the same order as the groups appear in the pattern.
    expected_groups = {'href': 'https://github.com', 'text': 'Github'}
    for group_name, expected_value in expected_groups.items():
        assert extracted[group_name] == expected_value