def test_url(self):
    """A Q built from a generated url should identify itself as a url.

    The object returned by ``url()`` is also checked: the original url
    string has to end with its ``netloc``.
    """
    raw_url = testdata.get_url()
    query = Q(raw_url)
    self.assertTrue(query.is_url())

    netloc = query.url().netloc
    self.assertTrue(raw_url.endswith(netloc))
def test_no_headers(self):
    """Parse a one-row table that contains only td cells.

    The parsed row is expected to key each cell by its stringified column
    index and to carry an empty ``headers`` list for every cell.
    """
    markup = "\n".join((
        "<table>",
        " <tr>",
        " <td>Column 1</td>",
        " <td>Column 2</td>",
        " <td>Column 3</td>",
        " <td>Column 4</td>",
        " </tr>",
        "</table>",
    ))
    table = Table(testdata.get_url(), markup)
    table.parse()

    cells = [
        ("0", "Column 1"),
        ("1", "Column 2"),
        ("2", "Column 3"),
        ("3", "Column 4"),
    ]
    expected = {
        key: {"headers": [], "value": value}
        for key, value in cells
    }
    self.assertEqual(expected, table.tables[0]["rows"][0])
def test_tables6(self):
    """Check row/column counts and per-row headers of the tables6 fixture.

    The first table must have 5 rows of 4 cells each, and every cell in
    rows 0-3 must list the header/subheader labels expected for that row.
    """
    markup = self.get_html("tables6")
    parsed = Table(testdata.get_url(), markup)
    parsed.parse()

    rows = parsed.fields["tables"][0]["rows"]
    self.assertEqual(5, len(rows))
    for row in rows:
        self.assertEqual(4, len(row))

    # position in this list is the row index the labels belong to
    expected_headers = [
        ("1 header", "1 subheader"),
        ("2 header", "2 subheader"),
        ("2 header", "3 subheader"),
        ("4 header", "4 subheader"),
    ]
    for index, labels in enumerate(expected_headers):
        for label in labels:
            for cell in rows[index].values():
                self.assertTrue(label in cell["headers"])
def test_get_url(self):
    """testdata.get_url() returns a non-empty http(s) url string."""
    generated = testdata.get_url()
    self.assertNotEqual("", generated)

    # Python 2 spells the regex assertion assertRegexpMatches, Python 3
    # spells it assertRegex; only the chosen attribute is ever looked up
    assert_matches = self.assertRegexpMatches if is_py2 else self.assertRegex
    assert_matches(generated, r'https?\://\S*')
def test_digital(self):
    """An item whose body is flagged digital mentions it in its title."""
    body = {
        "url": testdata.get_url(),
        "title": testdata.get_words(),
        "digital": True,
        "price": 1.0,
    }
    item = Item(price=100, body=body, uuid="foo")
    self.assertTrue(" (digital)" in item.title)
def test_new_old_price(self):
    """html_detail() shows both the current price and the earlier price.

    A WatchlistItem is created first at price 10 under the same uuid, then
    an Item at price 1 is rendered; the output must contain both amounts.
    """
    shared_uuid = testdata.get_hash()
    template = {
        "url": testdata.get_url(),
        "title": testdata.get_words(),
    }

    # created only so an earlier price exists for this uuid; each object
    # gets its own copy of the body
    earlier = WatchlistItem.create(
        price=10,
        body=dict(template),
        uuid=shared_uuid,
    )
    current = Item(price=1, body=dict(template), uuid=shared_uuid)

    rendered = current.html_detail()
    self.assertTrue("<b>$1.00</b>" in rendered)
    self.assertTrue("was <b>$10.00</b>" in rendered)
def test_image_col(self):
    """A cell holding only an img tag yields the image src as its value."""
    markup = "\n".join((
        "<table>",
        " <tr>",
        ' <td><img src="https://foo.com/image.jpg" /></td>',
        " <td>Column 2</td>",
        " </tr>",
        "</table>",
    ))
    table = Table(testdata.get_url(), markup)
    table.parse()

    first_row = table.tables[0]["rows"][0]
    self.assertEqual("https://foo.com/image.jpg", first_row["0"]["value"])
def test_dimensions(self):
    """find_dimensions() reports (cols, rows) for several fixture tables."""
    markup = self.get_html("tables4")
    parser = Table(testdata.get_url(), markup)
    found = parser.soup.find_all("table")

    # (index into the tables found in tables4, expected (cols, rows))
    expectations = [
        (2, (4, 3)),
        (3, (3, 2)),
        (1, (2, 1)),
        (5, (6, 2)),
    ]
    for index, expected in expectations:
        cols, rows = parser.find_dimensions(found[index])
        self.assertEqual(expected, (cols, rows))

    markup = self.get_html("tables3")
    parser = Table(testdata.get_url(), markup)
    second_table = parser.soup.find_all("table")[1]
    cols, rows = parser.find_dimensions(second_table)
    self.assertEqual((3, 55), (cols, rows))
def test_row_th(self):
    """Parse a table that mixes th and td cells inside the same tr.

    This markup came from one of the example tables in the docs: the header
    row starts with a td and the content row starts with a th. The parser
    has to cope with both, keying the content row by the column headers and
    falling back to the column index where the header cell is blank.
    """
    markup = "\n".join((
        '<table>',
        ' <tr>',
        ' <td> </td>',
        ' <th scope="col">Batman</th>',
        ' <th scope="col">Robin</th>',
        ' <th scope="col">The Flash</th>',
        ' <th scope="col">Kid Flash</th>',
        ' </tr>',
        ' <tr>',
        ' <th scope="row">Skill</th>',
        ' <td>Smarts</td>',
        ' <td>Dex, acrobat</td>',
        ' <td>Super speed</td>',
        ' <td>Super speed</td>',
        ' </tr>',
        '</table>',
    ))
    table = Table(testdata.get_url(), markup)
    table.parse()

    cells = [
        ('0', "Skill"),
        ('Batman', "Smarts"),
        ('Robin', "Dex, acrobat"),
        ('The Flash', "Super speed"),
        ('Kid Flash', "Super speed"),
    ]
    expected = {
        key: {"headers": [], "value": value}
        for key, value in cells
    }
    self.assertEqual(expected, table.tables[0]["rows"][0])
def get_item(item=None, **kwargs):
    """Build an Item filled with generated test data.

    :param item: optional existing item; when given (and truthy) its
        ``newest.body`` is copied as the starting body and its ``uuid`` is
        reused unless ``uuid`` is passed explicitly
    :param **kwargs: body fields that override the generated defaults;
        ``price`` and ``uuid`` are pulled out and passed to Item directly.
        A float ``price`` is stored in the body as-is and multiplied by 100
        for the Item price (presumably dollars -> cents); any other
        ``price`` is used as the Item price and stored in the body
        multiplied by 0.01
    :returns: the new Item instance
    """
    if item:
        body = dict(item.newest.body)
        body.update(kwargs)
        body.setdefault("uuid", item.uuid)
        kwargs = body

    price = kwargs.pop("price", testdata.get_int(1000))
    uuid = kwargs.pop("uuid", testdata.get_hash())

    kwargs.setdefault("url", testdata.get_url())
    kwargs.setdefault("digital", testdata.get_bool())
    kwargs.setdefault("image", testdata.get_url())
    kwargs.setdefault("title", testdata.get_words())

    if isinstance(price, float):
        kwargs["price"] = price
        # round before converting: plain int() truncates, and binary floats
        # make e.g. 19.99 * 100.0 == 1998.9999999999998, which would lose a
        # unit (1998 instead of 1999)
        price = int(round(price * 100.0))
    else:
        kwargs["price"] = float(price) * 0.01

    return Item(price=price, body=kwargs, uuid=uuid)
def test_header_error(self):
    """Every row of the first tables7 table exposes all 13 columns.

    The table must also have a truthy caption, and each row must return a
    truthy value for the "Icon" column.
    """
    expected_columns = 13

    markup = self.get_html("tables7")
    parser = Table(testdata.get_url(), markup)
    parser.parse()

    table = parser.tables[0]
    self.assertTrue(table.caption)
    for row in table:
        self.assertEqual(expected_columns, len(list(row.columns())))
        self.assertTrue(row.get("Icon"))
def test_dl(self):
    """Definition lists are collected under fields["dls"].

    Two scenarios are checked: two separate dl elements with two term /
    definition pairs each, and a single dl where two dt elements precede
    the first dd (which is still expected to produce two entries).
    """
    url = testdata.get_url()

    # (markup lines, expected size of each parsed dl in document order)
    cases = [
        (
            [
                "<dl>",
                " <dt>term1</dt>",
                " <dd>definition 1</dd>",
                " <dt>term2</dt>",
                " <dd>definition 2</dd>",
                "</dl>",
                "<dl>",
                " <dt>term3</dt>",
                " <dd>definition 3</dd>",
                " <dt>term4</dt>",
                " <dd>definition 4</dd>",
                "</dl>",
            ],
            [2, 2],
        ),
        (
            [
                "<dl>",
                " <dt>term1</dt>",
                " <dt>term2</dt>",
                " <dd>definition 1</dd>",
                " <dt>term3</dt>",
                " <dd>definition 2</dd>",
                "</dl>",
            ],
            [2],
        ),
    ]

    for lines, sizes in cases:
        parser = Table(url, "\n".join(lines))
        parser.parse()

        dls = parser.fields["dls"]
        self.assertEqual(len(sizes), len(dls))
        for position, size in enumerate(sizes):
            self.assertEqual(size, len(dls[position]))
def test_get_url(self):
    """testdata.get_url() returns a non-empty http(s) url string."""
    s = testdata.get_url()
    self.assertNotEqual(u"", s)

    # assertRegexpMatches is a deprecated alias that no longer exists on
    # Python 3.12+, while Python 2 only has the old name; use whichever
    # one this TestCase provides
    assert_regex = getattr(self, "assertRegex", None)
    if assert_regex is None:
        assert_regex = self.assertRegexpMatches

    # raw string: "\:" and "\S" are not valid string escapes, so the
    # non-raw form triggers an invalid-escape warning on modern Python
    # (the pattern text itself is unchanged)
    assert_regex(s, r'https?\://\S*')
def test_content(self):
    """find_table() returns the expected rows for tables 0, 1 and 6.

    Tables are looked up by position among the table elements found in the
    tables4 fixture; each expected cell has an empty ``headers`` list.
    """
    def make_row(*pairs):
        # one parsed row: column header -> cell dict
        return {
            header: {"headers": [], "value": value}
            for header, value in pairs
        }

    markup = self.get_html("tables4")
    parser = Table(testdata.get_url(), markup)
    found = parser.soup.find_all("table")

    expected_by_index = {
        0: [
            make_row(("Last name", "Doe"), ("First name", "John")),
            make_row(("Last name", "Doe"), ("First name", "Jane")),
        ],
        1: [
            make_row(
                ("Header content 1", "Body content 1"),
                ("Header content 2", "Body content 2"),
            ),
            make_row(
                ("Header content 1", "Footer content 1"),
                ("Header content 2", "Footer content 2"),
            ),
        ],
        6: [
            make_row(("header 2", "Content 1.2"), ("header 1", "Content 1.1")),
            make_row(("header 2", "Content 2.2"), ("header 1", "Content 2.1")),
        ],
    }

    for index, expected_rows in expected_by_index.items():
        result = parser.find_table(found[index])
        self.assertEqual(expected_rows, result["rows"])