def test_rowspan(self):
    """Cells carrying rowspan/colspan must be duplicated into every grid slot they cover."""
    html = """<html><body><table border="1" class="wikitable"><tr><td>1</td>
    <td colspan="2">2 and 3</td><td>4</td></tr>
    <tr><td rowspan="3">5,9 and 13</td>
    <td>6</td><td>7</td><td>8</td></tr>
    <tr><td>10</td><td>11</td><td>12</td></tr>
    <tr><td colspan="3">14,15 and 16</td></tr>
    </table></body></html>"""
    parsed = BeautifulSoup(html, 'html.parser')
    first_table = readHTML.readTables(parsed)[0]
    converted = readHTML.tableTo2d(first_table)
    rendered = converted[0].toHTML()
    self.assertFalse(rendered is None)
    # Comparison is whitespace-insensitive: both sides drop spaces and newlines.
    expected = """<table ><tr><td>1</td>
    <td>2 and 3</td><td>2 and 3</td><td>4</td></tr>
    <tr><td>5,9 and 13</td>
    <td>6</td><td>7</td><td>8</td></tr>
    <tr><td>5,9 and 13</td><td>10</td><td>11</td><td>12</td></tr>
    <tr><td>5,9 and 13</td><td>14,15 and 16</td><td>14,15 and 16</td><td>14,15 and 16</td></tr>
    </table>""".replace(" ", "").replace("\n", "")
    actual = str(rendered).replace(" ", "").replace("\n", "")
    self.assertEqual(actual, expected)
def testHeaders(self):
    """getMainColHeaders must cope with a header row built from plain <td> cells."""
    html = """<table class="wikitable sortable jquery-tablesorter"> <thead></thead><tbody><tr><td>Season</td><td>2017</td> <td>2018</td><td><b>Total</b></td></tr><tr align="center"><td>Wins </td><td>1</td><td>0</td><td><b>1</b></td></tr></tbody><tfoot></tfoot></table>"""
    parsed = BeautifulSoup(html, 'html.parser')
    all_tables = readHTML.readTables(parsed)
    flattened = readHTML.tableTo2d(all_tables[0])
    first_2d = flattened[0]
    # Only exercises the header-detection path; no assertion on the result.
    headers = readHTML.getMainColHeaders(first_2d.htmlMatrix)
def readFile(path):
    """Parse a bz2-compressed article, convert its tables to 2D form and persist it.

    :param path: path to the .bz2 file containing the article HTML.
    """
    # Context manager closes the compressed file even if parsing raises
    # (the original leaked the BZ2File handle).
    with bz2.BZ2File(path) as bz_file:
        soup = BeautifulSoup(bz_file.read(), 'html.parser')
    title = readHTML.readTitle(soup)
    tables = readHTML.readTables(soup)
    tables2d = []
    for t in tables:
        # tableTo2d returns (html, table2d) here; only the 2D table is kept.
        html, t2d = readHTML.tableTo2d(t)
        tables2d.append(t2d)
    # Article id is hard-coded to 1; callers do not rely on it downstream.
    article = Article(1, title, tables2d)
    writeArticle(article)
def getColumnHeaders(tableMatrix):
    """Normalize a table's main column headers and tag each with its column type.

    Returns [] when every detected header is blank; otherwise a list of
    '<normalized_header>@<column_type>' strings, with 'spancol' for blank cells.
    """
    headerRow, names = readHTML.getMainColHeaders(tableMatrix)
    headerRow += 1
    # A single distinct value that is the empty string means "no usable headers".
    if len(set(names)) == 1 and names[0] == "":
        return []
    tagged = []
    for idx, name in enumerate(names):
        colType = str(readHTML.getColumnType(idx, headerRow, tableMatrix))
        if name != "":
            tagged.append(name.lower().strip().replace(" ", "_") + "@" + colType)
        else:
            tagged.append("spancol@" + colType)
    return tagged
def test_colspan(self):
    """Headers and body cells using rowspan/colspan must expand into a full rectangular grid."""
    html = """<!DOCTYPE html><html> <head> <meta charset="utf-8"><meta name="description" content=""><meta name="keywords" content=""> <title>Table Practice</title></head> <body><table class="wikitable" border="1" align="center" cellpadding="10px"> <thead><tr><th rowspan="3">Day</th> <th colspan="3">Seminar</th></tr> <tr><th colspan="2">Schedule</th> <th rowspan="2">Topic</th></tr> <tr><th>Begin</th><th>End</th></tr></thead> <tbody><tr><td rowspan="2">Monday</td> <td rowspan="2">8:00 a.m</td><td rowspan="2">5:00 p.m</td> <td rowspan="">Introduction to XML</td></tr> <tr><td rowspan="">Validity: DTD and Relax NG</td> </tr><tr><td rowspan="4">Tuesday</td> <td>8:00 a.m</td><td>11:00 a.m</td> <td rowspan="2">XPath</td></tr><tr> <td rowspan="2">11:00 a.m</td> <td rowspan="2">2:00 p.m</td> </tr><tr><td rowspan="2">XSL transformation</td></tr> <tr><td>2:00 p.m</td><td>5:00 p.m</td></tr> <tr><td>Wednesday</td><td>8:00 a.m</td><td>12:00 p.m</td><td>XLS Formatting Objects</td> </tr></tbody></table> </body> </html> """
    soup = BeautifulSoup(html, 'html.parser')
    tables = readHTML.readTables(soup)
    # Exactly one table is expected in the fixture document.
    lt = len(tables)
    self.assertEqual(lt, 1)
    listt = readHTML.tableTo2d(tables[0])
    table2d = listt[0]
    table2d = table2d.toHTML()
    self.assertFalse(table2d is None)
    # Comparison is whitespace-insensitive: spaces and newlines are stripped on both sides.
    table2dcontent = table2d.replace(" ", "").replace("\n", "")
    result = """<table> <tr><th>Day</th><th>Seminar</th><th>Seminar</th><th>Seminar</th></tr> <tr><th>Day</th><th>Schedule</th><th>Schedule</th><th>Topic</th></tr> <tr><th>Day</th><th>Begin</th><th>End</th><th>Topic</th></tr> <tr><td>Monday</td><td>8:00 a.m</td><td>5:00 p.m</td><td>Introduction to XML</td></tr> <tr><td>Monday</td><td>8:00 a.m</td><td>5:00 p.m</td><td>Validity: DTD and Relax NG</td></tr> <tr><td>Tuesday</td><td>8:00 a.m</td><td>11:00 a.m</td><td>XPath</td></tr> <tr><td>Tuesday</td><td>11:00 a.m</td><td>2:00 p.m</td><td>XPath</td></tr> <tr><td>Tuesday</td><td>11:00 a.m</td><td>2:00 p.m</td><td>XSL transformation</td></tr> <tr><td>Tuesday</td><td>2:00 p.m</td><td>5:00 p.m</td><td>XSL transformation</td></tr> <tr><td>Wednesday</td><td>8:00 a.m</td><td>12:00 p.m</td><td>XLS Formatting Objects</td></tr></table>""".replace(" ", "").replace("\n", "")
    print(table2dcontent)
    self.assertEqual(table2dcontent, result)
def test_without_span(self):
    """A plain table with no span attributes should pass through conversion unchanged."""
    html = """<table border="1" class="wikitable">
    <tr><td>1</td><td>2</td><td>3</td></tr>
    <tr><td>4</td><td>5</td><td>6</td></tr>
    </table>"""
    parsed = BeautifulSoup(html, 'html.parser')
    converted = readHTML.tableTo2d(readHTML.readTables(parsed)[0])
    rendered = converted[0].toHTML()
    self.assertFalse(rendered is None)
    # Whitespace-insensitive comparison.
    actual = rendered.replace(" ", "").replace("\n", "")
    expected = """<table>
    <tr><td>1</td><td>2 </td><td>3</td></tr>
    <tr><td>4</td><td>5</td><td>6</td></tr>
    </table>""".replace(" ", "").replace("\n", "")
    self.assertEqual(actual, expected)
def extractCellResources(content):
    """Extract links from a table cell and resolve each to a Wikidata resource.

    :param content: raw HTML of one table cell.
    :return: list of Resource objects, one per distinct link. Links that produce an
             empty wiki link are kept with an "ex: " prefix; links for which
             wikiLink returns None are skipped (previously this raised TypeError
             on the string concatenation "ex: " + None).
    """
    bscell = BeautifulSoup(content, "html.parser")
    linksCell = readHTML.readTableCellLinks(bscell)
    if not linksCell:
        return []
    # Keyed by link so duplicates within the cell collapse to one Resource.
    resources = {}
    for link in linksCell:
        _link = wikiLink(link)
        if _link is None:
            # wikiLink could not interpret the href; nothing usable to record.
            continue
        if _link != "":
            resource = Resource(_link)
            wd = wikidataDAO.getWikidataID(_link)
            # Attach the Wikidata id only when the lookup succeeded.
            if wd:
                resource.setId(wd)
            resources[_link] = resource
        else:
            # External/non-wiki link: keep it, marked with the "ex: " prefix.
            resources["ex: " + _link] = Resource("ex: " + _link)
    return list(resources.values())
def updateJsonFile(fileName):
    """Re-derive column headers for a single table JSON file and write it back.

    The table id is recovered from the file name ("A_1.json" -> "A.1"). Tables
    that are not WELL_FORMED, have no type, or have no detected header rows are
    written back with empty headers.
    """
    print("filename: ", fileName)
    tableId = fileName.split("/")[-1].replace(".json", "").replace("_", ".")
    # Close the input file deterministically (the original leaked the handle).
    with open(fileName, "r") as f:
        obj = json.loads(f.read())
    # Converting json to Table object
    table = ComplexDecoderTable().default(obj)
    # Guard: table without a usable type -> clear headers and persist as-is.
    if table.tableType is None or table.tableType.value == "":
        table.setColHeaders([])
        table.setStartRows(0)
        writeTable(table, tableId)
        return
    if table.tableType.value != TableType.WELL_FORMED.value:
        table.setTableType(table.tableType.value)
        table.setColHeaders([])
        table.setStartRows(0)
        writeTable(table, tableId)
        return
    startRow = table.startRows
    if startRow == 0:
        # Well formed but without a recognizable header block.
        table.setTableType(table.tableType.value)
        table.setColHeaders([])
        table.setStartRows(startRow)
        writeTable(table, tableId)
        return
    # Happy path: compute all header levels with their column types.
    table.setTableType(table.tableType.value)
    startRows, colHeadersType = readHTML.getColHeaderAllLevels(
        table.htmlMatrix, table.startRows, textProcessing)
    table.setColHeaders(colHeadersType)
    writeTable(table, tableId)
def test_innerTable(self):
    """An outer layout table holding two inner wikitables: conversion must not raise."""
    html = """<table><tbody><tr valign="top"><td><table class="wikitable" style="font-size:95%"><tbody><tr bgcolor="#efefef"> <td colspan="2"><b>Legend</b></td></tr><tr bgcolor="#f3e6d7"><td>Grand Slam</td><td align="center">0</td></tr><tr bgcolor="#ffffcc"> <td>WTA Championships</td><td align="center">0</td></tr><tr bgcolor="#ffcccc"><td>Tier I</td><td align="center">0</td></tr> <tr bgcolor="#ccccff"><td>Tier II</td><td align="center">0</td></tr><tr bgcolor="#CCFFCC"><td>Tier III</td><td align="center">0 </td></tr><tr bgcolor="#66CCFF"><td>Tier IV & V</td><td align="center">0</td></tr></tbody></table></td><td><table class="wikitable" style="font-size:95%"> <tbody><tr bgcolor="#efefef"><td colspan="2"><b>Titles by Surface</b></td></tr><tr><td>Hard</td> <td align="center">0</td></tr><tr><td>Clay</td><td align="center">0</td></tr><tr><td>Grass</td> <td align="center">0</td></tr><tr><td>Carpet</td><td align="center">0</td></tr></tbody></table></td></tr></tbody></table>"""
    soup = BeautifulSoup(html, 'html.parser')
    tables = readHTML.readTables(soup)
    listt = readHTML.tableTo2d(tables[0])
    table2d = listt[0]
    headers = readHTML.getMainColHeaders(table2d.htmlMatrix)
    self.assertFalse(table2d is None)
    table2d = table2d.toHTML()
    # NOTE(review): no equality assertion on the rendered output — this test only
    # checks that nested-table conversion completes without raising.
    table2dcontent = table2d.replace(" ", "").replace("\n", "")
def test_interTitle(self):
    """A full-width header row ('3') in the middle of the body must survive as a header row."""
    html = """<table class="x wikitable y" border="1" cellpadding="10px" align="center"><thead> <tr><th rowspan="3">A</th><th colspan="3">B</th></tr> <tr><th colspan="2">C</th><th rowspan="2">F</th></tr> <tr><th>D</th><th>E</th></tr></thead><tbody> <tr><td rowspan="2">1</td><td rowspan="2">a</td><td rowspan="2">b</td><td>x</td></tr> <tr><td>y</td></tr> <tr><td rowspan="4">2</td><td colspan="2">cd</td><td rowspan="2">z</td></tr> <tr><td rowspan="2">e</td><td rowspan="2">f</td></tr> <tr><td rowspan="2">w</td></tr> <tr><td>g</td><td>h</td></tr> <tr><th colspan="4">3</th></tr><tr> <td rowspan="4">4</td><td colspan="3">ijr</td></tr> <tr><td rowspan="2">5</td><td colspan="2">m</td></tr> <tr><td rowspan="2">n</td><td colspan="2">s</td></tr> <tr><td>6</td> <td>t</td></tr></tbody></table>"""
    soup = BeautifulSoup(html, 'html.parser')
    tables = readHTML.readTables(soup)
    # The fixture contains exactly one table.
    lt = len(tables)
    self.assertEqual(lt, 1)
    listt = readHTML.tableTo2d(tables[0])
    table2d = listt[0]
    table2d = table2d.toHTML()
    self.assertFalse(table2d is None)
    # Whitespace-insensitive comparison of the rendered grid.
    table2dcontent = table2d.replace(" ", "").replace("\n", "")
    result = """<table> <tr><th>A</th><th>B</th><th>B</th><th>B</th></tr> <tr><th>A</th><th>C</th><th>C</th><th>F</th></tr> <tr><th>A</th><th>D</th><th>E</th><th>F</th></tr> <tr><td>1</td><td>a</td><td>b</td><td>x</td></tr> <tr><td>1</td><td>a</td><td>b</td><td>y</td></tr> <tr><td>2</td><td>cd</td><td>cd</td><td>z</td></tr> <tr><td>2</td><td>e</td><td>f</td><td>z</td></tr> <tr><td>2</td><td>e</td><td>f</td><td>w</td></tr> <tr><td>2</td><td>g</td><td>h</td><td>w</td></tr> <tr><th>3</th><th>3</th><th>3</th><th>3</th></tr> <tr><td >4</td><td >ijr</td><td >ijr</td><td >ijr</td></tr> <tr><td >4</td><td >5</td><td >m</td><td >m</td></tr> <tr><td >4</td><td >5</td><td >n</td><td >s</td></tr> <tr><td >4</td><td >6</td><td >n</td><td >t</td></tr> </table>""".replace(" ", "").replace("\n", "")
    print(table2dcontent)
    print(result)
    self.assertEqual(table2dcontent, result)
def extractTables(filename, folderOut, cont, dictCount):
    """Extract tables from html.bz, and generate a new file with only tables.

    :param filename: filename bz file.
    :param folderOut: folder where generated files will be saved.
    :param cont: number of file, it will be article ID.
    :param dictCount: save stats of table types.
    """
    fileNameSplit = filename.split("/")
    try:
        file, file_extension = os.path.splitext(fileNameSplit[-1])
        if "bz2" not in file_extension:
            return
        print("[Worker %d] File numer %d" % (os.getpid(), cont))
        # Close the compressed input deterministically (was leaked before).
        with bz2.BZ2File(filename, "rb") as bzFile:
            soup = BeautifulSoup(bzFile.read(), 'html.parser')
        title = readHTML.readTitle(soup)
        tables = readHTML.readTables(soup)
        html = "<html><head></head><body><h1 class='firstHeading'>{}</h1>".format(
            title)
        for t in tables:
            tableType = tableValidator.validateHTMLTable(t)
            dictCount[tableType.value] += 1
            logging.debug('dictCount: ' + str(dictCount))
            # Keep only the table flavors the later pipeline can convert to 2D.
            if (tableType.value == TableType.WIKITABLE.value
                    or tableType.value == TableType.NO_CSS_CLASS.value
                    or tableType.value == TableType.WITH_INNER_TABLE.value):
                html += str(t) + "<br/>"
                dictCount[TableType.USEFULL_TABLE.value] += 1
        # Only emit an output file when at least one table survived the filter.
        if "</table>" in html or "</TABLE>" in html:
            if folderOut.endswith("/"):
                outPath = folderOut + file + ".bz2"
            else:
                outPath = folderOut + "/" + file + ".bz2"
            html += "</body></html>"
            with bz2.open(outPath, "wt") as newFile:
                newFile.write(html)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
        try:
            logging.debug('Error: ' + filename)
        except Exception:
            print("Error name file: ", cont)
        traceback.print_exc()
def testCaption(self):
    """A table with a <caption> and bold <td> header cells must still convert to 2D."""
    html = """<html><body><table class="wikitable" style="text-align: center;float:right;"> <caption>Irish stadiums in 1999 World Cup</caption><tbody><tr> <td><b>City</b></td><td><b>Stadium</b></td><td><b>Capacity</b> </td></tr><tr><td><span class="flagicon"><a href="/wiki/Republic_of_Ireland" title="Republic of Ireland"><img alt="Republic of Ireland" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/45/Flag_of_Ireland.svg/23px-Flag_of_Ireland.svg.png" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/45/Flag_of_Ireland.svg/35px-Flag_of_Ireland.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/45/Flag_of_Ireland.svg/46px-Flag_of_Ireland.svg.png 2x" data-file-width="1200" data-file-height="600" width="23" height="12"></a></span> <a href="/wiki/Dublin" title="Dublin">Dublin</a></td> <td><a href="/wiki/Lansdowne_Road" title="Lansdowne Road">Lansdowne Road</a></td> <td>49,250</td></tr><tr><td><span class="flagicon"><a href="/wiki/Republic_of_Ireland" title="Republic of Ireland"><img alt="Republic of Ireland" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/45/Flag_of_Ireland.svg/23px-Flag_of_Ireland.svg.png" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/45/Flag_of_Ireland.svg/35px-Flag_of_Ireland.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/45/Flag_of_Ireland.svg/46px-Flag_of_Ireland.svg.png 2x" data-file-width="1200" data-file-height="600" width="23" height="12"></a></span> <a href="/wiki/Limerick" title="Limerick">Limerick</a></td> <td><a href="/wiki/Thomond_Park" title="Thomond Park">Thomond Park</a></td> <td>13,500</td></tr><tr><td><span class="flagicon"><a href="/wiki/United_Kingdom" title="United Kingdom"><img alt="United Kingdom" src="//upload.wikimedia.org/wikipedia/en/thumb/a/ae/Flag_of_the_United_Kingdom.svg/23px-Flag_of_the_United_Kingdom.svg.png" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/en/thumb/a/ae/Flag_of_the_United_Kingdom.svg/35px-Flag_of_the_United_Kingdom.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/a/ae/Flag_of_the_United_Kingdom.svg/46px-Flag_of_the_United_Kingdom.svg.png 2x" data-file-width="1200" data-file-height="600" width="23" height="12"></a></span> <a href="/wiki/Belfast" title="Belfast">Belfast</a></td> <td><a href="/wiki/Ravenhill_Stadium" class="mw-redirect" title="Ravenhill Stadium">Ravenhill Stadium</a></td> <td>12,500</td></tr></tbody></table><html><body>"""
    soup = BeautifulSoup(html, 'html.parser')
    tables = readHTML.readTables(soup)
    listt = readHTML.tableTo2d(tables[0])
    table2d = listt[0]
    headers = readHTML.getMainColHeaders(table2d.htmlMatrix)
    self.assertFalse(table2d is None)
    # NOTE(review): no equality assertion — the test only exercises the pipeline
    # end-to-end on a captioned table with link-heavy cells.
    table2dcontent = table2d.toHTML().replace(" ", "").replace("\n", "")
def test_innerEqualTables(self):
    """Three structurally identical inner wikitables in one layout table: conversion must not raise."""
    html = """<table><tbody><tr><td width="10%" valign="top"><table class="wikitable"> <tbody><tr><th width="150">Pool A</th><th width="15">W</th><th width="15">L</th></tr> <tr bgcolor="#ffffcc"><td><span class="flagicon"><a href="/wiki/Wisconsin" title="Wisconsin"> <img alt="Wisconsin" src="//upload.wikimedia.org/wikipedia/commons/thumb/2/22/Flag_of_Wisconsin.svg/23px-Flag_of_Wisconsin.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/2/22/Flag_of_Wisconsin.svg/35px-Flag_of_Wisconsin.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/2/22/Flag_of_Wisconsin.svg/45px-Flag_of_Wisconsin.svg.png 2x" data-file-width="675" data-file-height="450"></a></span> <a href="/wiki/Erika_Brown" title="Erika Brown">Erika Brown</a></td><td>4</td><td>0</td></tr><tr bgcolor="#ffffcc"><td> <span class="flagicon"><a href="/wiki/Massachusetts" title="Massachusetts"><img alt="Massachusetts" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Flag_of_Massachusetts.svg/23px-Flag_of_Massachusetts.svg.png" width="23" height="14" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Flag_of_Massachusetts.svg/35px-Flag_of_Massachusetts.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Flag_of_Massachusetts.svg/46px-Flag_of_Massachusetts.svg.png 2x" data-file-width="1500" data-file-height="900"> </a></span> <a href="/wiki/Korey_Dropkin" title="Korey Dropkin">Korey Dropkin</a></td><td>3</td><td>1</td></tr> <tr bgcolor="#ffffcc"><td><span class="flagicon"><a href="/wiki/Ontario" title="Ontario"><img alt="Ontario" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/23px-Flag_of_Ontario.svg.png" width="23" height="12" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/35px-Flag_of_Ontario.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/46px-Flag_of_Ontario.svg.png 2x" data-file-width="2400" data-file-height="1200"> </a></span> <a href="/w/index.php?title=Ben_Bevan&action=edit&redlink=1" class="new" title="Ben Bevan (page does not exist)">Ben Bevan</a></td><td>2</td><td>2</td></tr><tr><td><span class="flagicon"> <a href="/wiki/Minnesota" title="Minnesota"><img alt="Minnesota" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/23px-Flag_of_Minnesota.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/35px-Flag_of_Minnesota.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/46px-Flag_of_Minnesota.svg.png 2x" data-file-width="500" data-file-height="318"></a></span> <a href="/wiki/Cory_Christensen" title="Cory Christensen">Cory Christensen</a></td><td>1</td><td>3</td></tr><tr><td><span class="flagicon"><a href="/wiki/Pennsylvania" title="Pennsylvania"><img alt="Pennsylvania" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Flag_of_Pennsylvania.svg/23px-Flag_of_Pennsylvania.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Flag_of_Pennsylvania.svg/35px-Flag_of_Pennsylvania.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Flag_of_Pennsylvania.svg/45px-Flag_of_Pennsylvania.svg.png 2x" data-file-width="675" data-file-height="450"></a></span> <a href="/w/index.php?title=Nicholas_Visnich&action=edit&redlink=1" class="new" title="Nicholas Visnich (page does not exist)">Nicholas Visnich</a></td><td>0</td><td>4</td></tr></tbody></table></td><td width="10%" valign="top"><table class="wikitable"><tbody><tr><th width="150">Pool B</th><th width="15">W</th><th width="15">L</th></tr><tr bgcolor="#ffffcc"><td><span class="flagicon"><a href="/wiki/Ontario" title="Ontario"><img alt="Ontario" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/23px-Flag_of_Ontario.svg.png" width="23" height="12" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/35px-Flag_of_Ontario.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/46px-Flag_of_Ontario.svg.png 2x" data-file-width="2400" data-file-height="1200"></a></span> <a href="/wiki/Scott_McDonald_(curler)" title="Scott McDonald (curler)">Scott McDonald</a></td><td>4</td><td>0</td></tr><tr bgcolor="#ffffcc"><td><span class="flagicon"><a href="/wiki/Minnesota" title="Minnesota"><img alt="Minnesota" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/23px-Flag_of_Minnesota.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/35px-Flag_of_Minnesota.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/46px-Flag_of_Minnesota.svg.png 2x" data-file-width="500" data-file-height="318"></a></span> <a href="/wiki/Alexandra_Carlson" title="Alexandra Carlson">Alexandra Carlson</a></td><td>2</td><td>2</td></tr><tr bgcolor="#ccffcc"><td><span class="flagicon"><a href="/wiki/California" title="California"><img alt="California" src="//upload.wikimedia.org/wikipedia/commons/thumb/0/01/Flag_of_California.svg/23px-Flag_of_California.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/01/Flag_of_California.svg/35px-Flag_of_California.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/01/Flag_of_California.svg/45px-Flag_of_California.svg.png 2x" data-file-width="900" data-file-height="600"></a></span> <a href="/w/index.php?title=Gabrielle_Coleman&action=edit&redlink=1" class="new" title="Gabrielle Coleman (page does not exist)">Gabrielle Coleman</a></td><td>2</td><td>2</td></tr><tr><td><span class="flagicon"><a href="/wiki/Ontario" title="Ontario"><img alt="Ontario" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/23px-Flag_of_Ontario.svg.png" width="23" height="12" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/35px-Flag_of_Ontario.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/46px-Flag_of_Ontario.svg.png 2x" data-file-width="2400" data-file-height="1200"></a></span> <a href="/w/index.php?title=Trevor_Brewer_(curler)&action=edit&redlink=1" class="new" title="Trevor Brewer (curler) (page does not exist)">Trevor Brewer</a></td><td>1</td><td>3</td></tr><tr><td><span class="flagicon"><a href="/wiki/Minnesota" title="Minnesota"><img alt="Minnesota" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/23px-Flag_of_Minnesota.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/35px-Flag_of_Minnesota.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/46px-Flag_of_Minnesota.svg.png 2x" data-file-width="500" data-file-height="318"></a></span> <a href="/w/index.php?title=Ethan_Meyers&action=edit&redlink=1" class="new" title="Ethan Meyers (page does not exist)">Ethan Meyers</a></td><td>1</td><td>3</td></tr></tbody></table></td><td width="10%" valign="top"><table class="wikitable"><tbody><tr><th width="150">Pool C</th><th width="15">W</th><th width="15">L</th></tr><tr bgcolor="#ffffcc"><td><span class="flagicon"><a href="/wiki/Minnesota" title="Minnesota"><img alt="Minnesota" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/23px-Flag_of_Minnesota.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/35px-Flag_of_Minnesota.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/46px-Flag_of_Minnesota.svg.png 2x" data-file-width="500" data-file-height="318"></a></span> <a href="/w/index.php?title=Mark_Haluptzok&action=edit&redlink=1" class="new" title="Mark Haluptzok (page does not exist)">Mark Haluptzok</a></td><td>4</td><td>0</td></tr><tr bgcolor="#ffffcc"><td><span class="flagicon"><a href="/wiki/Indiana" title="Indiana"><img alt="Indiana" src="//upload.wikimedia.org/wikipedia/commons/thumb/a/ac/Flag_of_Indiana.svg/23px-Flag_of_Indiana.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/a/ac/Flag_of_Indiana.svg/35px-Flag_of_Indiana.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/a/ac/Flag_of_Indiana.svg/45px-Flag_of_Indiana.svg.png 2x" data-file-width="750" data-file-height="500"></a></span> <a href="/w/index.php?title=Greg_Eigner&action=edit&redlink=1" class="new" title="Greg Eigner (page does not exist)">Greg Eigner</a></td><td>3</td><td>1</td></tr><tr bgcolor="#ccffcc"><td><span class="flagicon"><a href="/wiki/New_York_(state)" title="New York (state)"><img alt="New York (state)" src="//upload.wikimedia.org/wikipedia/commons/thumb/1/1a/Flag_of_New_York.svg/23px-Flag_of_New_York.svg.png" width="23" height="12" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/1/1a/Flag_of_New_York.svg/35px-Flag_of_New_York.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/1/1a/Flag_of_New_York.svg/46px-Flag_of_New_York.svg.png 2x" data-file-width="900" data-file-height="450"></a></span> <a href="/w/index.php?title=Joyance_Meechai&action=edit&redlink=1" class="new" title="Joyance Meechai (page does not exist)">Joyance Meechai</a></td><td>2</td><td>2</td></tr><tr><td><span class="flagicon"><a href="/wiki/Massachusetts" title="Massachusetts"><img alt="Massachusetts" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Flag_of_Massachusetts.svg/23px-Flag_of_Massachusetts.svg.png" width="23" height="14" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Flag_of_Massachusetts.svg/35px-Flag_of_Massachusetts.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Flag_of_Massachusetts.svg/46px-Flag_of_Massachusetts.svg.png 2x" data-file-width="1500" data-file-height="900"></a></span> <a href="/w/index.php?title=Stephen_Dropkin&action=edit&redlink=1" class="new" title="Stephen Dropkin (page does not exist)">Stephen Dropkin</a></td><td>1</td><td>3</td></tr><tr><td><span class="flagicon"><a href="/wiki/Ontario" title="Ontario"><img alt="Ontario" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/23px-Flag_of_Ontario.svg.png" width="23" height="12" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/35px-Flag_of_Ontario.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/46px-Flag_of_Ontario.svg.png 2x" data-file-width="2400" data-file-height="1200"></a></span> <a href="/w/index.php?title=Gerry_Geurts&action=edit&redlink=1" class="new" title="Gerry Geurts (page does not exist)">Gerry Geurts</a></td><td>0</td><td>4</td></tr></tbody></table></td></tr></tbody></table>"""
    soup = BeautifulSoup(html, 'html.parser')
    tables = readHTML.readTables(soup)
    listt = readHTML.tableTo2d(tables[0])
    table2d = listt[0]
    headers = readHTML.getMainColHeaders(table2d.htmlMatrix)
    self.assertFalse(table2d is None)
    table2d = table2d.toHTML()
    # NOTE(review): no equality assertion — only verifies the conversion completes.
    table2dcontent = table2d.replace(" ", "").replace("\n", "")
def createEntityMatrix(file):
    """Replace each body cell of a stored table with the Wikidata entities it links to.

    Header rows keep their plain cell text; body cells become lists of
    'wd::<id>' strings. The updated table is serialized to
    FOLDER_TABLES_OUT/<tableId with '.' -> '_'>.json.

    :param file: path of the table JSON file to process.
    :return: None (also returns None early when the table cannot be read).
    """
    table = readFile(file)
    if table is None or table.htmlMatrix is None:
        return None
    htmlMatrix = np.array(table.htmlMatrix)
    tablem = []
    # Header rows: keep only the visible cell text.
    for row in range(0, table.startRows):
        rowm = [readHTML.getTableCellText(htmlMatrix[row][col])
                for col in range(htmlMatrix.shape[1])]
        tablem.append(rowm)
    # Body rows: collect the distinct Wikidata ids referenced by each cell.
    for row in range(table.startRows, htmlMatrix.shape[0]):
        rowm = []
        for col in range(htmlMatrix.shape[1]):
            resources = extractCellResources(htmlMatrix[row][col])
            entities = {'wd::' + res.id for res in resources}
            rowm.append(list(entities))
        tablem.append(rowm)
    table.htmlMatrix = tablem
    # Resolve the article entity once and reuse it (previously looked up twice,
    # which doubled the lookup cost and could disagree between calls).
    articleEntity = extractArticleResource(table.articleTitle)
    if articleEntity is not None:
        table.setArticleEntity(articleEntity.id)
    table.setTableType(table.tableType.value)
    outPath = os.path.join(FOLDER_TABLES_OUT,
                           str(table.tableId.replace(".", "_")) + ".json")
    # Context manager guarantees the output file is flushed and closed.
    with open(outPath, "w") as ft:
        ft.write(json.dumps(table.reprJSON(), cls=ComplexEncoder, skipkeys=True))
def extractLinksGenerator(articleTitle, table):
    """Yield Wikidata relation records for entity pairs found in a table.

    Pass 1 pairs the article's own entity against every column; pass 2 pairs
    every ordered column pair (i, j), i < j. Each yielded dict is keyed by the
    module-level constants `cols`, `entity1`, `entity2`, `relations`.
    When more than one column header exists, (out, dictRelByTable) is returned
    as the generator's StopIteration value.
    """
    out = ""
    tarray = np.array(table.htmlMatrix)
    start = table.startRows
    colHeaders = table.colHeaders
    #colHeaders = ["protag_article@3"]
    #colHeaders.extend(table.colHeaders)
    # Summary line: tableId, headers, column count, body-row count (unused below).
    line = table.tableId + "\t" + str(colHeaders) + "\t" + str(len(tarray[0])) + \
        "\t" + str(len(tarray) - table.startRows) + "\t"
    prot = wikiLink(articleTitle)
    pwd = wikidataDAO.getWikidataID(prot)
    if pwd == None:
        pwd = ""
    if len(colHeaders) > 1:
        pairLink = {}
        # tlinks[r][c] holds the links extracted from body cell (r, c).
        tlinks = [[[] for x in range(tarray.shape[1])]
                  for y in range(len(tarray) - start)]
        rowLink = 0
        for row in range(start, tarray.shape[0]):
            for col in range(tarray.shape[1]):
                contentA = tarray[row][col]
                bscell = BeautifulSoup(contentA, "html.parser")
                linksCell = readHTML.readTableCellLinks(bscell)
                tlinks[rowLink][col] = linksCell
            rowLink += 1
        write = False
        dictRelByTable = {}
        # Pass 1: relations between the article entity and each column.
        for i in range(len(tlinks[0])):
            nameCol2 = colHeaders[i]
            dictRelCount = {}
            for row in range(len(tlinks)):
                linksR = tlinks[row][i]
                # Cell position tag (column -1 = the article itself); currently unused.
                pos = str(start) + ":" + str(row + start) + ":" + str(
                    -1) + ":" + str(i)
                if len(linksR) == 0:
                    continue
                else:
                    for link in linksR:
                        _link = wikiLink(link)
                        # Skip self-references to the article's own page.
                        if _link != None and _link != "" and _link != prot:
                            wd = wikidataDAO.getWikidataID(_link)
                            if wd == None:
                                wd = ""
                            props = []
                            if pwd != "" and wd != "":
                                props = wikidataDAO.getRelations(pwd, wd)
                            if len(props) > 0:
                                for p in props:
                                    v = dictRelCount.get(p)
                                    if v == None:
                                        dictRelCount[p] = 1
                                    else:
                                        dictRelCount[p] += 1
                            yield {
                                cols: "protag_article@3##" + nameCol2,
                                entity1: prot + " :" + pwd,
                                entity2: _link + " :" + wd,
                                relations: props
                            }
            dictRelByTable['protag_article@3##' + nameCol2] = dictRelCount
        # Pass 2: relations between every ordered pair of distinct columns.
        for i in range(len(tlinks[0])):
            for j in range(i + 1, len(tlinks[0])):
                nameCol1 = colHeaders[i]
                nameCol2 = colHeaders[j]
                dictRelCount = {}
                for row in range(len(tlinks)):
                    pos = str(start) + ":" + str(row + start) + ":" + str(
                        i) + ":" + str(j)
                    linksL = tlinks[row][i]
                    linksR = tlinks[row][j]
                    # Identical link sets (e.g. a spanned cell) carry no new relation.
                    if set(linksL) == set(linksR):
                        continue
                    if len(linksL) == 0 or len(linksR) == 0:
                        continue
                    for ll in linksL:
                        for lr in linksR:
                            lla = wikiLink(ll)
                            llb = wikiLink(lr)
                            if lla != "" and llb != "" and lla != llb:
                                wd1 = wikidataDAO.getWikidataID(lla)
                                if wd1 == None:
                                    wd1 = ""
                                wd2 = wikidataDAO.getWikidataID(llb)
                                if wd2 == None:
                                    wd2 = ""
                                props = []
                                if wd1 != "" and wd2 != "":
                                    props = wikidataDAO.getRelations(wd1, wd2)
                                if len(props) > 0:
                                    for p in props:
                                        v = dictRelCount.get(p)
                                        if v == None:
                                            dictRelCount[p] = 1
                                        else:
                                            dictRelCount[p] += 1
                                # NOTE(review): the key still says "protag_article@3##"
                                # even for column-column pairs — confirm intended.
                                yield {
                                    cols: "protag_article@3##" + nameCol2,
                                    entity1: lla + " :" + wd1,
                                    entity2: llb + " :" + wd2,
                                    relations: props
                                }
                dictRelByTable[nameCol1 + '##' + nameCol2] = dictRelCount
        return out, dictRelByTable
def testHeadersMix(self):
    """Header detection on a real sortable wikitable with caption, <thead> and rich cells."""
    html = """ <table class="wikitable sortable jquery-tablesorter"> <caption><big>Land surface elevation extremes by country</big><br><br> </caption> <thead><tr> <th width="256px" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Country or region </th> <th width="256px" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Highest point </th> <th width="84px" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Maximum elevation </th> <th width="256px" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Lowest point </th> <th width="84px" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Minimum elevation </th> <th width="70px" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Elevation span </th></tr></thead><tbody> <tr> <td><span class="flagicon"><img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/23px-Flag_of_Afghanistan.svg.png" decoding="async" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/35px-Flag_of_Afghanistan.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/45px-Flag_of_Afghanistan.svg.png 2x" data-file-width="900" data-file-height="600"> </span><a href="/wiki/Afghanistan" title="Afghanistan">Afghanistan</a> </td> <td><a href="/wiki/Noshaq" title="Noshaq">Noshaq</a> </td> <td rowspan="1" align="center"><span style="display:none" data-sort-value="7003749200000000000♠"></span>7492 m<br>24,580 ft </td> <td><a href="/wiki/Amu_Darya" title="Amu Darya">Amu Darya</a> </td> <td rowspan="1" align="center"><span style="display:none" data-sort-value="7002258000000000000♠"></span>258 m<br>846 ft </td> <td rowspan="1" align="center"><span style="display:none" data-sort-value="7003723400000000000♠"></span>7234 m<br>23,734 ft </td></tr> <tr> <td><span class="flagicon"><img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/3/36/Flag_of_Albania.svg/21px-Flag_of_Albania.svg.png" decoding="async" width="21" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/3/36/Flag_of_Albania.svg/32px-Flag_of_Albania.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/3/36/Flag_of_Albania.svg/42px-Flag_of_Albania.svg.png 2x" data-file-width="1000" data-file-height="714"> </span><a href="/wiki/Albania" title="Albania">Albania</a> </td> <td><a href="/wiki/Korab_(mountain)" title="Korab (mountain)">Korab</a> </td> <td rowspan="1" align="center"><span style="display:none" data-sort-value="7003276400000000000♠"></span>2764 m<br>9,068 ft </td> <td><a href="/wiki/Adriatic_Sea" title="Adriatic Sea">Adriatic Sea</a> </td> <td rowspan="1" align="center"><span style="display:none" data-sort-value="5000000000000000000♠"></span>sea level </td> <td rowspan="1" align="center"><span style="display:none" data-sort-value="7003276400000000000♠"></span>2764 m<br>9,068 ft </td></tr> <tr> <td><span class="flagicon"><img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/77/Flag_of_Algeria.svg/23px-Flag_of_Algeria.svg.png" decoding="async" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/77/Flag_of_Algeria.svg/35px-Flag_of_Algeria.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/7/77/Flag_of_Algeria.svg/45px-Flag_of_Algeria.svg.png 2x" data-file-width="900" data-file-height="600"> </span><a href="/wiki/Algeria" title="Algeria">Algeria</a> </td> <td><a href="/wiki/Mount_Tahat" title="Mount Tahat">Mount Tahat</a> </td> <td rowspan="1" align="center"><span style="display:none" data-sort-value="7003300300000000000♠"></span>3003 m<br>9,852 ft </td> <td><a href="/wiki/Chott_Melrhir" title="Chott Melrhir">Chott Melrhir</a> </td> <td rowspan="1" align="center"><span style="display:none" class="sortkey">2998600000000000000♠</span><span style="color:red">−40 m<br>−131 ft</span> </td> <td rowspan="1" align="center"><span style="display:none" data-sort-value="7003304300000000000♠"></span>3043 m<br>9,984 ft </td></tr> </tbody><tfoot></tfoot></table> """
    soup = BeautifulSoup(html, 'html.parser')
    tables = readHTML.readTables(soup)
    listt = readHTML.tableTo2d(tables[0])
    table2d = listt[0]
    print(table2d.toHTML())
    print(table2d.nrows)
    # Exactly one 2D table should come back from the conversion.
    assert len(listt)==1
    # NOTE(review): header content is only printed, not asserted — consider pinning it.
    headers = readHTML.getMainColHeaders(table2d.htmlMatrix)
    print(headers)
def updateJsonFile(fileName):
    """Recompute column headers and types for every table of an article JSON file.

    Reads the article JSON at ``fileName``, rebuilds the (possibly multi-level)
    column headers for WELL_FORMED tables — all other table types are passed
    through with empty headers — then writes the updated article to
    ``FOLDER_OUT/<name>.json``.

    Returns the link lines produced by ``extractLinks2`` for the last processed
    well-formed table, or None on a non-JSON file / top-level failure.

    NOTE(review): ``out`` is overwritten on every table, so only the LAST
    table's extracted links are returned — confirm this is intended.
    """
    fileNameSplit = fileName.split("/")
    file, file_extension = os.path.splitext(fileNameSplit[len(fileNameSplit) - 1])
    if "json" not in file_extension:
        return
    # Fix: close the input file deterministically (handle was leaked before).
    with open(fileName, "r") as jsonFile:
        obj = jsonFile.read()
    try:
        obj = json.loads(obj)
        article = ComplexDecoder().default(obj)
        tables2D = []
        out = ""
        for table in article.tables:
            # Tables without a usable type: clear headers and pass through.
            if table.tableType is None or table.tableType.value == "":
                table.setColHeaders([])
                table.setStartRows(0)
                tables2D.append(table)
                continue
            # Non well-formed tables: keep their type, clear headers.
            if table.tableType.value != TableType.WELL_FORMED.value:
                table.setTableType(table.tableType.value)
                table.setColHeaders([])
                table.setStartRows(0)
                tables2D.append(table)
                continue
            try:
                startRow, headers = readHTML.getMainColHeaders(table.htmlMatrix)
            except Exception:
                table.setTableType(table.tableType.value)
                table.setColHeaders([])
                table.setStartRows(0)
                tables2D.append(table)
                continue
            if startRow == 0:
                # No header rows were detected; nothing to rebuild.
                table.setTableType(table.tableType.value)
                table.setColHeaders([])
                table.setStartRows(startRow)
                tables2D.append(table)
                continue
            table.setStartRows(startRow)
            matrix = np.array(table.htmlMatrix)
            # The first `startRow` rows hold the header levels.
            listOfLevelHeaders = [matrix[i] for i in range(startRow)]
            headersMatch = []
            for row in listOfLevelHeaders:
                cleanTagHeaders = []
                for col in range(len(row)):
                    cell = BeautifulSoup(row[col], "html.parser")
                    cell = readHTML.cleanTableCellTag(cell)
                    text = " ".join(
                        [s for s in cell.strings if s.strip('\n ') != ''])
                    # '*' and '@' are reserved separators in the header format.
                    text = text.replace("*", "").replace("@", "")
                    cleanTagHeaders.append(text)
                cleanTagHeaders = [
                    textProcessing.cleanCellHeader(h) for h in cleanTagHeaders
                ]
                headersMatch.append(cleanTagHeaders)
            # Empty cells in the deepest header level become the 'spancol' marker.
            lastRow = headersMatch[len(headersMatch) - 1]
            headersMatch[len(headersMatch) - 1] = [
                'spancol' if h == '' else h for h in lastRow
            ]
            # Join the levels of each column with '**'.
            newHeader = []
            for col in range(len(headersMatch[0])):
                textCol = headersMatch[0][col]
                for row in range(1, len(headersMatch)):
                    textCol += "**" + headersMatch[row][col]
                newHeader.append(textCol)
            # Strip any leading '*' runs left by empty upper levels.
            newHeader = [re.sub('^\\**', '', h) for h in newHeader]
            if startRow > 1:
                newHeader = [
                    h[:-2] if h.endswith("**") else h for h in newHeader
                ]
            newHeader = textProcessing.orderHeaders(newHeader)
            # Append '@<columnType>' to every header.
            newHeaderType = []
            for i, col in enumerate(newHeader):
                # Renamed from `type` to avoid shadowing the builtin.
                colType = readHTML.getColumnType(i, startRow, table.htmlMatrix)
                newHeaderType.append(newHeader[i] + "@" + str(colType))
            table.setColHeaders(newHeaderType)
            table.ncols = len(newHeaderType)
            table.setTableType(table.tableType.value)
            tables2D.append(table)
            try:
                out = extractLinks2(article.title, table)
            except Exception:
                print("Error links extraction: ", table.tableId)
                traceback.print_exc()
        article.setTables(tables2D)
        # Fix: write through a context manager (was open/write/close).
        with open(FOLDER_OUT + "/" + file + ".json", "w") as f:
            f.write(
                json.dumps(article.reprJSON(), cls=ComplexEncoder,
                           skipkeys=True))
        return out
    except Exception:
        print("Error File: ", file)
        traceback.print_exc()
def extractLinks(articleTitle, table):
    """Extract Wikidata-relation lines from a table's hyperlinks.

    Emits one tab-separated line per (entity pair, relation) found:
    first pairing the article protagonist against every cell link, then
    pairing links across every column pair of the same row.  Returns the
    concatenated lines as a single string ("" when the table has at most
    one column header).
    """
    # Accumulate lines in a list and join once (was quadratic `out +=`).
    parts = []
    tarray = np.array(table.htmlMatrix)
    start = table.startRows
    colHeaders = table.colHeaders
    # Common prefix of every emitted line; note it already ends with a tab.
    line = table.tableId + "\t" + str(colHeaders) + "\t" + str(len(tarray[0])) + \
           "\t" + str(len(tarray) - table.startRows) + "\t"
    prot = wikiLink(articleTitle)
    pwd = wikidataDAO.getWikidataID(prot)
    if pwd is None:
        pwd = ""
    if len(colHeaders) > 1:
        # Per data row, per column: list of links found in that cell.
        tlinks = [[[] for x in range(tarray.shape[1])]
                  for y in range(len(tarray) - start)]
        rowLink = 0
        for row in range(start, tarray.shape[0]):
            for col in range(tarray.shape[1]):
                contentA = tarray[row][col]
                bscell = BeautifulSoup(contentA, "html.parser")
                tlinks[rowLink][col] = readHTML.readTableCellLinks(bscell)
            rowLink += 1
        # Pass 1: protagonist vs. every cell link.
        for row in range(len(tlinks)):
            for i in range(len(tlinks[0])):
                nameCol2 = colHeaders[i]
                linksR = tlinks[row][i]
                if len(linksR) == 0:
                    continue
                # col index -1 marks the protagonist side of the pair.
                pos = str(start) + ":" + str(row + start) + ":" + str(
                    -1) + ":" + str(i)
                for link in linksR:
                    _link = wikiLink(link)
                    if _link is not None and _link != "" and _link != prot:
                        wd = wikidataDAO.getWikidataID(_link)
                        if wd is None:
                            wd = ""
                        props = []
                        if pwd != "" and wd != "":
                            props = wikidataDAO.getRelations(pwd, wd)
                        if len(props) > 0:
                            for p in props:
                                parts.append("\t".join([
                                    line + pos, "protag_article@3", nameCol2,
                                    prot, _link, pwd, wd, p
                                ]) + "\n")
                        else:
                            # No relation found: emit the pair with empty prop.
                            parts.append("\t".join([
                                line + pos, "protag_article@3", nameCol2,
                                prot, _link, pwd, wd, ""
                            ]) + "\n")
        # Pass 2: links of column i vs. links of column j (i < j), same row.
        for row in range(len(tlinks)):
            for i in range(len(tlinks[0])):
                for j in range(i + 1, len(tlinks[0])):
                    linksL = tlinks[row][i]
                    linksR = tlinks[row][j]
                    # Skip identical link sets (e.g. duplicated span cells).
                    if set(linksL) == set(linksR):
                        continue
                    if len(linksL) == 0 or len(linksR) == 0:
                        continue
                    pos = str(start) + ":" + str(row + start) + ":" + str(
                        i) + ":" + str(j)
                    for ll in linksL:
                        for lr in linksR:
                            lla = wikiLink(ll)
                            llb = wikiLink(lr)
                            if lla != "" and llb != "" and lla != llb:
                                wd1 = wikidataDAO.getWikidataID(lla)
                                if wd1 is None:
                                    wd1 = ""
                                wd2 = wikidataDAO.getWikidataID(llb)
                                if wd2 is None:
                                    wd2 = ""
                                props = []
                                if wd1 != "" and wd2 != "":
                                    props = wikidataDAO.getRelations(wd1, wd2)
                                if len(props) > 0:
                                    for p in props:
                                        parts.append("\t".join([
                                            line + pos, colHeaders[i],
                                            colHeaders[j], lla, llb, wd1, wd2,
                                            p
                                        ]) + "\n")
                                else:
                                    parts.append("\t".join([
                                        line + pos, colHeaders[i],
                                        colHeaders[j], lla, llb, wd1, wd2, ""
                                    ]) + "\n")
    return "".join(parts)
def formatFeatures(content): #Extrac format features from cell (content) bullets = 0 resources = 0 hasFormat = 0 multipleLine = 0 try: #print(content) bsoup = BeautifulSoup(content) #print(bsoup) if "<td" in content: cell = bsoup.find("td") else: cell = bsoup.find("th") #print(cell) links = readHTML.readTableCellLinks(cell) # count the list bullets += len(cell.find_all("ul")) # count the enumerations bullets += len(cell.find_all("ol")) # count font tags hasFormat += len(cell.find_all("font")) hasFormat += len(cell.find_all("b")) hasFormat += len(cell.find_all("i")) hasFormat += len(cell.find_all("th")) hasFormat += len(cell.find_all("small")) # count multiple - lines multipleLine += multipleLine + len(cell.find_all("br")) noLinksText = readHTML.getTagTextNoLinks(cell) cspan = cell.get('colspan') rspan = cell.get('rowspan') if cspan != None: cspan = 1 else: cspan = 0 if rspan != None: rspan = 1 else: rspan = 0 cell.attrs = {} text = str(cell) length = len(text) noLinksText = [s for s in noLinksText.strings if s.strip('\n ') != ''] noLinksText = " ".join(noLinksText) if cspan == 1 or rspan == 1: hasSpan = 1 else: hasSpan = 0 return { 'length': length, 'bullets': bullets, 'hasFormat': hasFormat, 'multipleLine': multipleLine, 'noLinksText': len(noLinksText), "links": len(links), "hasSpan": hasSpan } except Exception as ex: raise Exception("Error html cell")
def extractLinksFromColumns(fileName):
    """Extract cross-column link pair lines from every table of an article JSON.

    For each table, pairs the article protagonist with each cell link, then
    pairs links across column pairs of the same row.  Tables producing no
    pair, and header-only tables, contribute a single empty-fields line.
    Returns the concatenated tab-separated lines, or None when `fileName`
    is not a .json file.
    """
    filenamesplit = fileName.split("/")
    file, file_extension = os.path.splitext(
        filenamesplit[len(filenamesplit) - 1])
    out = ""
    try:
        if file_extension != ".json":
            # Preserved early exit: returns None, not "".
            return
        # Fix: use a context manager and a dedicated name for the handle
        # (the original rebound `file` and leaked the descriptor).
        with open(fileName, "r") as jsonHandle:
            obj = jsonHandle.read()
        obj = json.loads(obj)
        article = ComplexDecoder().default(obj)
        prot = getTableProtagonist(article.title)
        for table in article.tables:
            tarray = np.array(table.htmlMatrix)
            colHeaders = ["protag_article@3"]
            colHeaders.extend(table.colHeaders)
            rowHeaders = table.rowHeaders
            setrH = set(rowHeaders)
            # Common prefix of every emitted line (ends with a tab).
            line = table.tableId + "\t" + str(colHeaders) + "\t" + str(len(table.htmlMatrix[0])) + \
                   "\t" + str(len(table.htmlMatrix) - table.startRows) + "\t"
            if len(colHeaders) > 1:
                setcH = set(colHeaders)
                # Skip tables whose only header is a span placeholder.
                if len(setcH) == 1 and "spancol" in colHeaders[0]:
                    continue
                start = table.startRows
                # Per data row, per column: links found in that cell.
                tlinks = [[[] for x in range(tarray.shape[1])]
                          for y in range(len(tarray) - start)]
                rowLink = 0
                for row in range(start, tarray.shape[0]):
                    for col in range(tarray.shape[1]):
                        contentA = tarray[row][col]
                        bscell = BeautifulSoup(contentA, "html.parser")
                        tlinks[rowLink][col] = readHTML.readTableCellLinks(
                            bscell)
                    rowLink += 1
                write = False
                # Pass 1: protagonist vs. every cell link.
                for row in range(len(tlinks)):
                    for i in range(len(tlinks[0])):
                        linksR = tlinks[row][i]
                        if len(linksR) == 0:
                            continue
                        pos = str(row) + ":" + str(-1) + ":" + str(i)
                        for link in linksR:
                            _link = wikiLink(link)
                            if _link is not None and _link != "" and _link != prot:
                                out += line + pos + "\t" + colHeaders[0] + "\t" + colHeaders[i + 1] + \
                                       "\t" + prot + "\t" + _link + "\n"
                                write = True
                # Pass 2: column i links vs. column j links (i < j), same row.
                for row in range(len(tlinks)):
                    for i in range(len(tlinks[0])):
                        for j in range(i + 1, len(tlinks[0])):
                            linksL = tlinks[row][i]
                            linksR = tlinks[row][j]
                            # Skip identical link sets (duplicated span cells).
                            if set(linksL) == set(linksR):
                                continue
                            if len(linksL) == 0 or len(linksR) == 0:
                                continue
                            pos = str(row) + ":" + str(i) + ":" + str(j)
                            for ll in linksL:
                                for lr in linksR:
                                    lla = wikiLink(ll)
                                    llb = wikiLink(lr)
                                    if lla != "" and llb != "" and lla != llb:
                                        out += line + pos + "\t" + colHeaders[i + 1] + "\t" + colHeaders[j + 1] + \
                                               "\t" + lla + "\t" + llb + "\n"
                                        write = True
                if not write:
                    # No pair produced: emit one line with empty fields.
                    out += line + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\n"
            else:
                if len(setrH) > 0:
                    if len(setrH) == 1 and "spancol" in table.rowHeaders[0]:
                        continue
                    out += line + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\n"
    except Exception:
        # Fix: was a bare `except:` (also swallowed KeyboardInterrupt).
        print("Error file: ", fileName)
        traceback.print_exc()
    return out
def normalizeTables(filename):
    """Read one bz2-compressed HTML dump, normalize its tables to 2D form,
    write the resulting article JSON to FOLDER_OUT, and return a
    tab-separated per-type statistics line.

    `filename` encodes two values joined by "##$##": the bz2 file path and
    an integer article counter used as the article/table id prefix.
    """
    file = filename.split("##$##")[0]
    cont = int(filename.split("##$##")[1])
    print("cont: ", cont)
    try:
        bzFile = bz2.BZ2File(file, "rb")
        soup = BeautifulSoup(bzFile.read(), 'html.parser')
        bzFile.close()
    except:
        print("Error reading file: ", filename)
        # NOTE(review): there is no tab between the counter and the first
        # zero here ("<cont>0\t..."), unlike the success return below —
        # looks like a missing "\t"; confirm before relying on this format.
        return str(cont) + "0\t0\t0\t0\t0\t0\n"
    # Per-table-type counters for the statistics line.
    dictStat = {}
    dictStat[TableType.ILL_FORMED.value] = 0
    dictStat["NO_PROCESSED"] = 0
    dictStat[TableType.WELL_FORMED.value] = 0
    dictStat[TableType.SMALLTABLE.value] = 0
    dictStat[TableType.WITH_INNER_TABLE.value] = 0
    dictStat[TableType.FORMAT_BOX.value] = 0
    try:
        title = readHTML.readTitle(soup)
        tables = readHTML.readTables(soup)
        tables2d = []
        contTables = 1
        formatTables = 0
        for it, t in enumerate(tables):
            try:
                # Skip tables nested inside another table's cell/row:
                # the outer table is processed instead.
                parents = [p.name for p in t.findParents()]
                if t.parent != None and ("th" in parents or "td" in parents
                                         or "tr" in parents):
                    continue
                start = time.time()
                listt2d = readHTML.tableTo2d(t)
                logging.debug("Time reading table: " +
                              str(time.time() - start))
                validTables = []
                if listt2d == None or len(listt2d) == 0:
                    # Conversion failed: keep an ill-formed placeholder if
                    # one can be built, otherwise count as not processed.
                    newTable = readHTML.saveIllTable(
                        t, TableType.ILL_FORMED.value)
                    if newTable != None:
                        validTables.append(newTable)
                        dictStat[TableType.ILL_FORMED.value] += 1
                    else:
                        dictStat["NO_PROCESSED"] += 1
                else:
                    if len(listt2d) > 10:
                        # NOTE(review): `newTable` is NOT defined on this
                        # branch — this raises NameError (caught by the
                        # except below) unless a previous iteration left it
                        # bound. Probably meant to build/append an
                        # ill-formed placeholder; confirm intent.
                        validTables.append(newTable)
                        dictStat[TableType.ILL_FORMED.value] += 1
                        continue
                    for t2d in listt2d:
                        # FORMAT_BOX and SMALLTABLE are counted but dropped;
                        # ILL_FORMED and WITH_INNER_TABLE are counted and kept.
                        if t2d.tableType == TableType.FORMAT_BOX.value:
                            dictStat[TableType.FORMAT_BOX.value] += 1
                            formatTables += 1
                            continue
                        if t2d.tableType == TableType.SMALLTABLE.value:
                            dictStat[TableType.SMALLTABLE.value] += 1
                            continue
                        if t2d.tableType == TableType.ILL_FORMED.value:
                            dictStat[TableType.ILL_FORMED.value] += 1
                            validTables.append(t2d)
                            continue
                        if t2d.tableType == TableType.WITH_INNER_TABLE.value:
                            dictStat[TableType.WITH_INNER_TABLE.value] += 1
                            validTables.append(t2d)
                            continue
                        #print(t2d.toHTML())
                        validTables.append(t2d)
                        dictStat[TableType.WELL_FORMED.value] += 1
                # Assign "<articleCounter>.<tableCounter>" ids to kept tables.
                for t2d in validTables:
                    tableId = str(cont) + "." + str(contTables)
                    t2d.setTableId(tableId)
                    tables2d.append(t2d)
                    contTables += 1
            except:
                traceback.print_exc()
                print("Error: ", filename, it)
                continue
        if len(tables2d) > 0:
            article = Article(articleId=str(cont),
                              title=title,
                              tables=tables2d)
            f = open(FOLDER_OUT + "/" + str(cont) + ".json", "w")
            f.write(
                json.dumps(article.reprJSON(), cls=ComplexEncoder,
                           skipkeys=True))
            f.close()
        else:
            # Nothing kept: distinguish "all tables were format boxes"
            # from a genuine failure, for debugging.
            if len(tables) == formatTables:
                logging.debug("Format table: " + filename)
            else:
                logging.debug("Error none useful table: " + filename)
            logging.debug(dictStat)
    except:
        traceback.print_exc()
        # NOTE(review): this passes `filename` as a %-format argument but
        # the message has no placeholder — the arg is dropped (and the
        # logging module reports a formatting error when the record is
        # emitted). Likely meant "Error file %s".
        logging.debug("Error file ", filename)
    # Statistics line: counter, then per-type counts, tab-separated.
    return str(cont)+"\t"+ str(dictStat[TableType.ILL_FORMED.value])+"\t"+ \
        str(dictStat["NO_PROCESSED"])+"\t"+ \
        str(dictStat[TableType.WELL_FORMED.value])+"\t"+ \
        str(dictStat[TableType.SMALLTABLE.value])+"\t" + \
        str(dictStat[TableType.WITH_INNER_TABLE.value])+"\t" + \
        str(dictStat[TableType.FORMAT_BOX.value])+"\n"