def extractCellResources(content):
    """Extract the links found in a table cell and resolve their Wikidata IDs.

    content: HTML fragment of a single table cell.
    Returns a list of Resource objects, one per distinct link; each Resource
    carries its Wikidata ID when one could be resolved.  Links that wikiLink
    cannot normalize are kept under an "ex: "-prefixed key.
    """
    bscell = BeautifulSoup(content, "html.parser")
    linksCell = readHTML.readTableCellLinks(bscell)
    if not linksCell:  # covers both None and empty list
        return []
    resources = {}
    for link in linksCell:
        _link = wikiLink(link)
        if _link is not None and _link != "":
            resource = Resource(_link)
            wd = wikidataDAO.getWikidataID(_link)
            # attach the Wikidata ID only when the lookup succeeded
            if wd is not None and wd != "":
                resource.setId(wd)
            resources[_link] = resource
        else:
            # bug fix: the original did `"ex: " + _link` here, which raises
            # TypeError whenever wikiLink returned None (the very case this
            # branch handles); fall back to an empty suffix instead
            key = "ex: " + (_link if _link is not None else "")
            resources[key] = Resource(key)
    return list(resources.values())
def formatFeatures(content):
    """Extract formatting features from an HTML table cell.

    content: HTML fragment containing a <td> or <th> element.
    Returns a dict of numeric features:
      length       - length of the cell's HTML with attributes stripped
      bullets      - number of <ul>/<ol> lists
      hasFormat    - number of font/b/i/th/small formatting tags
      multipleLine - number of explicit <br> line breaks
      noLinksText  - length of the cell text outside links
      links        - number of links in the cell
      hasSpan      - 1 if the cell declares colspan or rowspan, else 0
    Raises Exception("Error html cell") on any parsing failure.
    """
    try:
        # explicit parser for deterministic behavior, consistent with the
        # other functions in this file (the original omitted it)
        bsoup = BeautifulSoup(content, "html.parser")
        cell = bsoup.find("td") if "<td" in content else bsoup.find("th")
        links = readHTML.readTableCellLinks(cell)
        # lists and enumerations
        bullets = len(cell.find_all("ul")) + len(cell.find_all("ol"))
        # formatting tags
        hasFormat = sum(len(cell.find_all(tag))
                        for tag in ("font", "b", "i", "th", "small"))
        # bug fix: the original wrote `multipleLine += multipleLine + len(...)`,
        # a self-doubling accumulator; it only produced the right value because
        # multipleLine started at 0 and the statement ran once
        multipleLine = len(cell.find_all("br"))
        noLinksTag = readHTML.getTagTextNoLinks(cell)
        hasSpan = 1 if (cell.get('colspan') is not None
                        or cell.get('rowspan') is not None) else 0
        # strip attributes so `length` measures the bare markup
        cell.attrs = {}
        length = len(str(cell))
        noLinksText = " ".join(s for s in noLinksTag.strings
                               if s.strip('\n ') != '')
        return {
            'length': length,
            'bullets': bullets,
            'hasFormat': hasFormat,
            'multipleLine': multipleLine,
            'noLinksText': len(noLinksText),
            "links": len(links),
            "hasSpan": hasSpan
        }
    except Exception as ex:
        # chain the cause instead of discarding it (original lost the traceback)
        raise Exception("Error html cell") from ex
def extractLinks(articleTitle, table):
    """Extract Wikidata relations between the article protagonist and the
    entities linked from a table, and between entities in distinct columns.

    articleTitle: title of the article that owns the table.
    table: parsed table exposing htmlMatrix, startRows, colHeaders, tableId.
    Returns a TSV string with one row per (entity pair, Wikidata property);
    pairs with no known property get a single row with an empty property.
    """
    out = ""
    tarray = np.array(table.htmlMatrix)
    start = table.startRows
    colHeaders = table.colHeaders
    line = table.tableId + "\t" + str(colHeaders) + "\t" + str(len(tarray[0])) + \
        "\t" + str(len(tarray) - table.startRows) + "\t"
    prot = wikiLink(articleTitle)
    pwd = wikidataDAO.getWikidataID(prot)
    if pwd is None:
        pwd = ""
    if len(colHeaders) > 1:
        # per-cell link lists for the data rows only (header rows skipped)
        tlinks = [[[] for _ in range(tarray.shape[1])]
                  for _ in range(len(tarray) - start)]
        rowLink = 0
        for row in range(start, tarray.shape[0]):
            for col in range(tarray.shape[1]):
                bscell = BeautifulSoup(tarray[row][col], "html.parser")
                tlinks[rowLink][col] = readHTML.readTableCellLinks(bscell)
            rowLink += 1
        # protagonist vs. every linked entity, column by column
        for row in range(len(tlinks)):
            for i in range(len(tlinks[0])):
                nameCol2 = colHeaders[i]
                linksR = tlinks[row][i]
                if len(linksR) == 0:
                    continue
                pos = str(start) + ":" + str(row + start) + ":" + str(-1) + \
                    ":" + str(i)
                for link in linksR:
                    _link = wikiLink(link)
                    if _link is not None and _link != "" and _link != prot:
                        wd = wikidataDAO.getWikidataID(_link)
                        if wd is None:
                            wd = ""
                        props = []
                        if pwd != "" and wd != "":
                            props = wikidataDAO.getRelations(pwd, wd)
                        # one row per property; a single empty-property row
                        # when none were found (the original duplicated the
                        # whole emission statement across two branches)
                        for p in (props if len(props) > 0 else [""]):
                            out += line + pos + "\t" + "protag_article@3" + \
                                "\t" + nameCol2 + "\t" + prot + "\t" + _link + \
                                "\t" + pwd + "\t" + wd + "\t" + p + "\n"
        # entity pairs across every pair of distinct columns
        for row in range(len(tlinks)):
            for i in range(len(tlinks[0])):
                for j in range(i + 1, len(tlinks[0])):
                    linksL = tlinks[row][i]
                    linksR = tlinks[row][j]
                    # identical link sets carry no cross-column information
                    if set(linksL) == set(linksR):
                        continue
                    if len(linksL) == 0 or len(linksR) == 0:
                        continue
                    pos = str(start) + ":" + str(row + start) + ":" + str(i) + \
                        ":" + str(j)
                    for ll in linksL:
                        for lr in linksR:
                            lla = wikiLink(ll)
                            llb = wikiLink(lr)
                            if lla != "" and llb != "" and lla != llb:
                                wd1 = wikidataDAO.getWikidataID(lla)
                                if wd1 is None:
                                    wd1 = ""
                                wd2 = wikidataDAO.getWikidataID(llb)
                                if wd2 is None:
                                    wd2 = ""
                                props = []
                                if wd1 != "" and wd2 != "":
                                    props = wikidataDAO.getRelations(wd1, wd2)
                                for p in (props if len(props) > 0 else [""]):
                                    out += line + pos + "\t" + colHeaders[i] + \
                                        "\t" + colHeaders[j] + "\t" + lla + \
                                        "\t" + llb + "\t" + wd1 + "\t" + wd2 + \
                                        "\t" + p + "\n"
    return out
def extractLinksGenerator(articleTitle, table):
    """Generator variant of extractLinks: yield related entity pairs.

    articleTitle: title of the article that owns the table.
    table: parsed table exposing htmlMatrix, startRows, colHeaders, tableId.
    Yields one dict per entity pair that has at least one Wikidata relation,
    keyed by the module-level constants cols / entity1 / entity2 / relations.
    When exhausted, returns ("", dictRelByTable) as the generator's
    StopIteration value, where dictRelByTable maps a "col1##col2" label to
    {property: occurrence count}.
    """
    out = ""  # kept only so the generator's return value matches the original
    tarray = np.array(table.htmlMatrix)
    start = table.startRows
    colHeaders = table.colHeaders
    prot = wikiLink(articleTitle)
    pwd = wikidataDAO.getWikidataID(prot)
    if pwd is None:
        pwd = ""
    dictRelByTable = {}
    if len(colHeaders) > 1:
        # per-cell link lists for the data rows only
        tlinks = [[[] for _ in range(tarray.shape[1])]
                  for _ in range(len(tarray) - start)]
        rowLink = 0
        for row in range(start, tarray.shape[0]):
            for col in range(tarray.shape[1]):
                bscell = BeautifulSoup(tarray[row][col], "html.parser")
                tlinks[rowLink][col] = readHTML.readTableCellLinks(bscell)
            rowLink += 1
        # protagonist vs. every linked entity, column by column
        for i in range(len(tlinks[0])):
            nameCol2 = colHeaders[i]
            dictRelCount = {}
            for row in range(len(tlinks)):
                for link in tlinks[row][i]:
                    _link = wikiLink(link)
                    if _link is not None and _link != "" and _link != prot:
                        wd = wikidataDAO.getWikidataID(_link)
                        if wd is None:
                            wd = ""
                        props = []
                        if pwd != "" and wd != "":
                            props = wikidataDAO.getRelations(pwd, wd)
                        if len(props) > 0:
                            for p in props:
                                dictRelCount[p] = dictRelCount.get(p, 0) + 1
                            # NOTE(review): assumed one yield per entity pair
                            # (after counting all its properties) — confirm
                            # against the original formatting
                            yield {
                                cols: "protag_article@3##" + nameCol2,
                                entity1: prot + " :" + pwd,
                                entity2: _link + " :" + wd,
                                relations: props
                            }
            dictRelByTable['protag_article@3##' + nameCol2] = dictRelCount
        # entity pairs across every pair of distinct columns
        for i in range(len(tlinks[0])):
            for j in range(i + 1, len(tlinks[0])):
                nameCol1 = colHeaders[i]
                nameCol2 = colHeaders[j]
                dictRelCount = {}
                for row in range(len(tlinks)):
                    linksL = tlinks[row][i]
                    linksR = tlinks[row][j]
                    if set(linksL) == set(linksR):
                        continue
                    if len(linksL) == 0 or len(linksR) == 0:
                        continue
                    for ll in linksL:
                        for lr in linksR:
                            lla = wikiLink(ll)
                            llb = wikiLink(lr)
                            if lla != "" and llb != "" and lla != llb:
                                wd1 = wikidataDAO.getWikidataID(lla)
                                if wd1 is None:
                                    wd1 = ""
                                wd2 = wikidataDAO.getWikidataID(llb)
                                if wd2 is None:
                                    wd2 = ""
                                props = []
                                if wd1 != "" and wd2 != "":
                                    props = wikidataDAO.getRelations(wd1, wd2)
                                if len(props) > 0:
                                    for p in props:
                                        dictRelCount[p] = \
                                            dictRelCount.get(p, 0) + 1
                                    # bug fix: the original labelled these
                                    # column-pair records "protag_article@3##
                                    # <col2>" (copy-paste from the loop above);
                                    # the label should name both columns, as
                                    # the dictRelByTable key below does
                                    yield {
                                        cols: nameCol1 + "##" + nameCol2,
                                        entity1: lla + " :" + wd1,
                                        entity2: llb + " :" + wd2,
                                        relations: props
                                    }
                dictRelByTable[nameCol1 + '##' + nameCol2] = dictRelCount
    return out, dictRelByTable
def extractLinksFromColumns(fileName):
    """Extract link pairs from the columns of every table in a JSON article.

    fileName: path to a .json file containing a serialized article.
    Returns a TSV string (one row per link pair, empty columns for tables
    that produced no pair), or None when the file is not a .json file.
    On any error, logs the traceback and returns whatever was accumulated.
    """
    filenamesplit = fileName.split("/")
    # only the extension matters; avoid rebinding/shadowing the name `file`
    _, file_extension = os.path.splitext(filenamesplit[len(filenamesplit) - 1])
    out = ""
    try:
        if file_extension != ".json":
            return  # original behavior: None for non-JSON input
        # bug fix: the original opened the file without ever closing it
        with open(fileName, "r") as fh:
            obj = json.loads(fh.read())
        article = ComplexDecoder().default(obj)
        prot = getTableProtagonist(article.title)
        for table in article.tables:
            tarray = np.array(table.htmlMatrix)
            # synthetic first header for protagonist-vs-column pairs
            colHeaders = ["protag_article@3"]
            colHeaders.extend(table.colHeaders)
            rowHeaders = table.rowHeaders
            setrH = set(rowHeaders)
            line = table.tableId + "\t" + str(colHeaders) + "\t" + \
                str(len(table.htmlMatrix[0])) + "\t" + \
                str(len(table.htmlMatrix) - table.startRows) + "\t"
            if len(colHeaders) > 1:
                setcH = set(colHeaders)
                # skip layout-only tables whose single header is a span column
                if len(setcH) == 1 and "spancol" in colHeaders[0]:
                    continue
                start = table.startRows
                tlinks = [[[] for _ in range(tarray.shape[1])]
                          for _ in range(len(tarray) - start)]
                rowLink = 0
                for row in range(start, tarray.shape[0]):
                    for col in range(tarray.shape[1]):
                        bscell = BeautifulSoup(tarray[row][col], "html.parser")
                        tlinks[rowLink][col] = \
                            readHTML.readTableCellLinks(bscell)
                    rowLink += 1
                write = False
                # protagonist vs. every linked entity, column by column
                for row in range(len(tlinks)):
                    for i in range(len(tlinks[0])):
                        linksR = tlinks[row][i]
                        if len(linksR) == 0:
                            continue
                        pos = str(row) + ":" + str(-1) + ":" + str(i)
                        for link in linksR:
                            _link = wikiLink(link)
                            if _link is not None and _link != "" \
                                    and _link != prot:
                                out += line + pos + "\t" + colHeaders[0] + \
                                    "\t" + colHeaders[i + 1] + "\t" + prot + \
                                    "\t" + _link + "\n"
                                write = True
                # entity pairs across every pair of distinct columns
                for row in range(len(tlinks)):
                    for i in range(len(tlinks[0])):
                        for j in range(i + 1, len(tlinks[0])):
                            linksL = tlinks[row][i]
                            linksR = tlinks[row][j]
                            if set(linksL) == set(linksR):
                                continue
                            if len(linksL) == 0 or len(linksR) == 0:
                                continue
                            pos = str(row) + ":" + str(i) + ":" + str(j)
                            for ll in linksL:
                                for lr in linksR:
                                    lla = wikiLink(ll)
                                    llb = wikiLink(lr)
                                    if lla != "" and llb != "" and lla != llb:
                                        out += line + pos + "\t" + \
                                            colHeaders[i + 1] + "\t" + \
                                            colHeaders[j + 1] + "\t" + lla + \
                                            "\t" + llb + "\n"
                                        write = True
                if not write:
                    # emit an empty record so the table still appears
                    out += line + "" + "\t" + "" + "\t" + "" + "\t" + "" + \
                        "\t" + "" + "\n"
            else:
                if len(setrH) > 0:
                    if len(setrH) == 1 and "spancol" in table.rowHeaders[0]:
                        continue
                out += line + "" + "\t" + "" + "\t" + "" + "\t" + "" + \
                    "\t" + "" + "\n"
    except Exception:
        # best-effort: log and return what was collected (original used a
        # bare except; narrowed so KeyboardInterrupt/SystemExit propagate)
        print("Error file: ", fileName)
        traceback.print_exc()
    return out