def extractCellResources(content):
    """Extract wiki links from one HTML table cell and build Resource objects.

    Parses *content* (the HTML fragment of a single cell), collects its
    anchor links, normalizes each with wikiLink(), and attaches a Wikidata
    ID where one can be resolved. Links that do not normalize are still kept,
    under an "ex: "-prefixed key, so they remain visible downstream.

    Parameters
    ----------
    content : str
        Raw HTML of a single table cell.

    Returns
    -------
    list
        Resource objects, deduplicated by link (last occurrence wins).
    """
    bscell = BeautifulSoup(content, "html.parser")
    linksCell = readHTML.readTableCellLinks(bscell)

    # No anchors in the cell -> nothing to resolve.
    if not linksCell:
        return []

    resources = {}
    for link in linksCell:
        _link = wikiLink(link)
        if _link:
            resource = Resource(_link)
            wd = wikidataDAO.getWikidataID(_link)
            # Attach the Wikidata id only when the lookup returned one.
            if wd:
                resource.setId(wd)
            resources[_link] = resource
        else:
            # BUG FIX: the original did "ex: " + _link, which raises
            # TypeError when wikiLink() returns None. Coerce None to "".
            key = "ex: " + (_link or "")
            resources[key] = Resource(key)
    return list(resources.values())
def formatFeatures(content):
    """Extract formatting features from a table cell's HTML.

    Counts structural markers (lists, inline formatting tags, line breaks,
    row/col spans) in the <td>/<th> fragment *content* and returns them as
    a dict of numeric features.

    Parameters
    ----------
    content : str
        Raw HTML of a single table cell (<td> or <th>).

    Returns
    -------
    dict
        Keys: 'length', 'bullets', 'hasFormat', 'multipleLine',
        'noLinksText', 'links', 'hasSpan'.

    Raises
    ------
    Exception
        "Error html cell" when parsing fails; the original error is chained.
    """
    try:
        # Use the same explicit parser as the rest of this module so the
        # result does not depend on which optional bs4 backend is installed.
        bsoup = BeautifulSoup(content, "html.parser")
        cell = bsoup.find("td") if "<td" in content else bsoup.find("th")
        links = readHTML.readTableCellLinks(cell)

        # Bulleted and enumerated lists.
        bullets = len(cell.find_all("ul")) + len(cell.find_all("ol"))

        # Inline formatting tags.
        hasFormat = sum(
            len(cell.find_all(tag)) for tag in ("font", "b", "i", "th", "small")
        )

        # Explicit line breaks mark a multi-line cell.
        # BUG FIX: original wrote "multipleLine += multipleLine + len(...)",
        # which only worked because the counter started at 0.
        multipleLine = len(cell.find_all("br"))

        noLinksText = readHTML.getTagTextNoLinks(cell)

        # Presence (not value) of colspan/rowspan marks a spanning cell.
        hasSpan = 1 if (cell.get('colspan') is not None
                        or cell.get('rowspan') is not None) else 0

        # Strip attributes so 'length' reflects only tag structure + text.
        cell.attrs = {}
        length = len(str(cell))

        noLinksText = " ".join(
            s for s in noLinksText.strings if s.strip('\n ') != ''
        )

        return {
            'length': length,
            'bullets': bullets,
            'hasFormat': hasFormat,
            'multipleLine': multipleLine,
            'noLinksText': len(noLinksText),
            "links": len(links),
            "hasSpan": hasSpan
        }
    except Exception as ex:
        # Keep the original message but chain the cause for debugging.
        raise Exception("Error html cell") from ex
# --- Example #3 (scraped snippet-listing separator; was "示例#3" / "0") ---
def extractLinks(articleTitle, table):
    """Build a TSV report of Wikidata relations found in a wiki table.

    Emits (a) protagonist-article vs. cell-link pairs and (b) same-row
    column-pair link pairs, each annotated with Wikidata IDs and any
    relations known between the two entities.

    Parameters
    ----------
    articleTitle : str
        Title of the article the table belongs to.
    table : object
        Parsed table exposing htmlMatrix, startRows, colHeaders, tableId.

    Returns
    -------
    str
        Tab-separated lines; "" for tables with <= 1 column header.
        (BUG FIX: the original returned None in that case because the
        final ``return out`` sat inside the ``if`` block.)
    """
    out = ""
    tarray = np.array(table.htmlMatrix)
    start = table.startRows
    colHeaders = table.colHeaders
    # Shared prefix of every emitted line: id, headers, width, body height.
    line = table.tableId + "\t" + str(colHeaders) + "\t" + str(len(tarray[0])) + \
           "\t" + str(len(tarray) - table.startRows) + "\t"
    prot = wikiLink(articleTitle)
    pwd = wikidataDAO.getWikidataID(prot)
    if pwd is None:
        pwd = ""
    if len(colHeaders) > 1:
        # Collect the anchor links of every body cell up front.
        tlinks = [[[] for x in range(tarray.shape[1])]
                  for y in range(len(tarray) - start)]
        rowLink = 0
        for row in range(start, tarray.shape[0]):
            for col in range(tarray.shape[1]):
                contentA = tarray[row][col]
                bscell = BeautifulSoup(contentA, "html.parser")
                linksCell = readHTML.readTableCellLinks(bscell)
                tlinks[rowLink][col] = linksCell
            rowLink += 1

        # Pass 1: protagonist article vs. every cell link.
        for row in range(len(tlinks)):
            for i in range(len(tlinks[0])):
                nameCol2 = colHeaders[i]
                linksR = tlinks[row][i]
                pos = str(start) + ":" + str(row + start) + ":" + str(
                    -1) + ":" + str(i)
                if len(linksR) == 0:
                    continue
                for link in linksR:
                    _link = wikiLink(link)
                    if _link is not None and _link != "" and _link != prot:
                        wd = wikidataDAO.getWikidataID(_link)
                        if wd is None:
                            wd = ""
                        props = []
                        if pwd != "" and wd != "":
                            props = wikidataDAO.getRelations(pwd, wd)
                        if len(props) > 0:
                            # One line per known relation.
                            for p in props:
                                out += line + pos + "\t" + "protag_article@3" + "\t" + nameCol2 + "\t" + prot + "\t" + _link + "\t" + \
                                    pwd + "\t" + wd + "\t" + p + "\n"
                        else:
                            # No relation found: emit the pair with an empty
                            # relation column.
                            out += line + pos + "\t" + "protag_article@3" + "\t" + nameCol2 + "\t" + prot + "\t" + _link + "\t" + \
                                pwd + "\t" + wd + "\t" + "" + "\n"

        # Pass 2: every pair of distinct columns within the same row.
        for row in range(len(tlinks)):
            for i in range(len(tlinks[0])):
                for j in range(i + 1, len(tlinks[0])):
                    pos = str(start) + ":" + str(row + start) + ":" + str(
                        i) + ":" + str(j)
                    linksL = tlinks[row][i]
                    linksR = tlinks[row][j]
                    # Skip identical link sets and empty cells.
                    if set(linksL) == set(linksR):
                        continue
                    if len(linksL) == 0 or len(linksR) == 0:
                        continue
                    for ll in linksL:
                        for lr in linksR:
                            lla = wikiLink(ll)
                            llb = wikiLink(lr)
                            if lla != "" and llb != "" and lla != llb:
                                wd1 = wikidataDAO.getWikidataID(lla)
                                if wd1 is None:
                                    wd1 = ""
                                wd2 = wikidataDAO.getWikidataID(llb)
                                if wd2 is None:
                                    wd2 = ""
                                props = []
                                if wd1 != "" and wd2 != "":
                                    props = wikidataDAO.getRelations(wd1, wd2)
                                if len(props) > 0:
                                    for p in props:
                                        out += line + pos + "\t" + colHeaders[i] + "\t" + colHeaders[j] + \
                                            "\t" + lla + "\t" + llb + "\t" + wd1 + "\t" + wd2 + "\t" + p + "\n"
                                else:
                                    out += line + pos + "\t" + colHeaders[i] + "\t" + colHeaders[j] + \
                                        "\t" + lla + "\t" + llb + "\t" + wd1 + "\t" + wd2 + "\t" + "" + "\n"
    # BUG FIX: return at function level so single-column tables yield ""
    # instead of None.
    return out
# --- Example #4 (scraped snippet-listing separator; was "示例#4" / "0") ---
def extractLinksGenerator(articleTitle, table):
    """Yield Wikidata relation records for the links found in a wiki table.

    Generator counterpart of extractLinks(): instead of accumulating TSV
    text it yields one dict per related entity pair, and tallies per-column
    relation counts in dictRelByTable.

    Parameters
    ----------
    articleTitle : str
        Title of the article the table belongs to.
    table : object
        Parsed table exposing htmlMatrix, startRows, colHeaders, tableId.

    Yields
    ------
    dict
        A record describing one related entity pair (see NOTE below about
        the key names).

    Notes
    -----
    The final ``return out, dictRelByTable`` runs inside a generator, so
    per PEP 380 the value is attached to StopIteration; plain ``for``
    iteration never sees it.
    """
    out = ""

    tarray = np.array(table.htmlMatrix)
    start = table.startRows
    colHeaders = table.colHeaders
    #colHeaders = ["protag_article@3"]
    #colHeaders.extend(table.colHeaders)
    # Shared prefix: table id, headers, width, body-row count (kept for
    # parity with extractLinks; unused because nothing is appended to out).
    line = table.tableId + "\t" + str(colHeaders) + "\t" + str(len(tarray[0])) + \
           "\t" + str(len(tarray) - table.startRows) + "\t"
    prot = wikiLink(articleTitle)
    pwd = wikidataDAO.getWikidataID(prot)
    if pwd == None:
        pwd = ""
    if len(colHeaders) > 1:
        pairLink = {}
        # tlinks[r][c] holds the list of anchor links of body cell (r, c).
        tlinks = [[[] for x in range(tarray.shape[1])]
                  for y in range(len(tarray) - start)]
        rowLink = 0
        for row in range(start, tarray.shape[0]):
            for col in range(tarray.shape[1]):
                contentA = tarray[row][col]
                bscell = BeautifulSoup(contentA, "html.parser")
                linksCell = readHTML.readTableCellLinks(bscell)
                tlinks[rowLink][col] = linksCell
            rowLink += 1
        write = False

        # Pass 1: protagonist article vs. every cell link, counting how
        # often each relation appears per column.
        dictRelByTable = {}
        for i in range(len(tlinks[0])):
            nameCol2 = colHeaders[i]
            dictRelCount = {}
            for row in range(len(tlinks)):
                linksR = tlinks[row][i]
                pos = str(start) + ":" + str(row + start) + ":" + str(
                    -1) + ":" + str(i)
                if len(linksR) == 0:
                    continue
                else:
                    for link in linksR:
                        _link = wikiLink(link)
                        if _link != None and _link != "" and _link != prot:
                            wd = wikidataDAO.getWikidataID(_link)
                            if wd == None:
                                wd = ""
                            props = []
                            if pwd != "" and wd != "":
                                props = wikidataDAO.getRelations(pwd, wd)
                            if len(props) > 0:
                                for p in props:
                                    v = dictRelCount.get(p)
                                    if v == None:
                                        dictRelCount[p] = 1
                                    else:
                                        dictRelCount[p] += 1
                                # NOTE(review): cols/entity1/entity2/relations
                                # are not defined anywhere in this file —
                                # presumably module-level constants used as
                                # dict keys; if they are not, this line
                                # raises NameError. Verify against the
                                # module's imports/globals.
                                yield {
                                    cols: "protag_article@3##" + nameCol2,
                                    entity1: prot + " :" + pwd,
                                    entity2: _link + " :" + wd,
                                    relations: props
                                }
            dictRelByTable['protag_article@3##' + nameCol2] = dictRelCount
        # Pass 2: every pair of distinct columns within the same row.
        for i in range(len(tlinks[0])):
            for j in range(i + 1, len(tlinks[0])):
                nameCol1 = colHeaders[i]
                nameCol2 = colHeaders[j]
                dictRelCount = {}
                for row in range(len(tlinks)):
                    pos = str(start) + ":" + str(row + start) + ":" + str(
                        i) + ":" + str(j)
                    linksL = tlinks[row][i]
                    linksR = tlinks[row][j]
                    # Skip identical link sets and empty cells.
                    if set(linksL) == set(linksR):
                        continue
                    if len(linksL) == 0 or len(linksR) == 0:
                        continue
                    for ll in linksL:
                        for lr in linksR:
                            lla = wikiLink(ll)
                            llb = wikiLink(lr)
                            if lla != "" and llb != "" and lla != llb:
                                wd1 = wikidataDAO.getWikidataID(lla)
                                if wd1 == None:
                                    wd1 = ""
                                wd2 = wikidataDAO.getWikidataID(llb)
                                if wd2 == None:
                                    wd2 = ""
                                props = []
                                if wd1 != "" and wd2 != "":
                                    props = wikidataDAO.getRelations(wd1, wd2)
                                if len(props) > 0:
                                    for p in props:
                                        v = dictRelCount.get(p)
                                        if v == None:
                                            dictRelCount[p] = 1
                                        else:
                                            dictRelCount[p] += 1
                                    # NOTE(review): the label still says
                                    # "protag_article@3##" here even though
                                    # this pass compares two table columns
                                    # (nameCol1 vs nameCol2) — looks like a
                                    # copy-paste from pass 1; confirm intent.
                                    yield {
                                        cols: "protag_article@3##" + nameCol2,
                                        entity1: lla + " :" + wd1,
                                        entity2: llb + " :" + wd2,
                                        relations: props
                                    }
                dictRelByTable[nameCol1 + '##' + nameCol2] = dictRelCount
        return out, dictRelByTable
# --- Example #5 (scraped snippet-listing separator; was "示例#5" / "0") ---
def extractLinksFromColumns(fileName):
    """Read a JSON article file and dump its tables' link pairs as TSV.

    Loads the article, then for every table emits protagonist-vs-link and
    column-pair link lines. Tables with no usable links still produce one
    line with empty fields so every table is represented.

    Parameters
    ----------
    fileName : str
        Path to a ".json" article file ("/"-separated).

    Returns
    -------
    str
        Tab-separated lines; "" for non-JSON files or on error (errors are
        logged with a traceback, best-effort).
    """
    filenamesplit = fileName.split("/")
    # Only the extension is needed; avoid reusing the name 'file' (the
    # original shadowed it with the open file handle below).
    _, file_extension = os.path.splitext(filenamesplit[len(filenamesplit) - 1])
    out = ""
    try:
        if file_extension != ".json":
            # BUG FIX: return the (empty) accumulator instead of the
            # original bare `return` (None) so callers can always
            # concatenate the result.
            return out

        # BUG FIX: the original opened the file without ever closing it;
        # use a context manager so the handle is released.
        with open(fileName, "r") as fh:
            obj = fh.read()

        obj = json.loads(obj)
        article = ComplexDecoder().default(obj)
        prot = getTableProtagonist(article.title)
        for table in article.tables:
            tarray = np.array(table.htmlMatrix)
            # Synthetic first header for the article protagonist column.
            colHeaders = ["protag_article@3"]
            colHeaders.extend(table.colHeaders)
            rowHeaders = table.rowHeaders
            setrH = set(rowHeaders)

            # Shared prefix: table id, headers, width, body-row count.
            line = table.tableId + "\t" + str(colHeaders) + "\t" + str(len(table.htmlMatrix[0])) + \
                   "\t" + str(len(table.htmlMatrix) - table.startRows) + "\t"

            if len(colHeaders) > 1:
                setcH = set(colHeaders)
                # Skip tables whose only header is a span placeholder.
                if len(setcH) == 1 and "spancol" in colHeaders[0]:
                    continue
                pairLink = {}
                start = table.startRows
                # tlinks[r][c]: anchor links of body cell (r, c).
                tlinks = [[[] for x in range(tarray.shape[1])] for y in range(len(tarray) - start)]
                rowLink = 0
                for row in range(start, tarray.shape[0]):
                    for col in range(tarray.shape[1]):
                        contentA = tarray[row][col]
                        bscell = BeautifulSoup(contentA, "html.parser")
                        linksCell = readHTML.readTableCellLinks(bscell)
                        tlinks[rowLink][col] = linksCell
                    rowLink += 1
                write = False
                # Pass 1: protagonist vs. every cell link.
                for row in range(len(tlinks)):
                    for i in range(len(tlinks[0])):
                        linksR = tlinks[row][i]
                        pos = str(row) + ":" + str(-1) + ":" + str(i)
                        if len(linksR) == 0:
                            continue
                        for link in linksR:
                            _link = wikiLink(link)
                            if _link is not None and _link != "" and _link != prot:
                                out += line + pos + "\t" + colHeaders[0] + "\t" + colHeaders[i + 1] + "\t" + prot + "\t" + _link + "\n"
                                write = True
                # Pass 2: every pair of distinct columns in the same row.
                for row in range(len(tlinks)):
                    for i in range(len(tlinks[0])):
                        for j in range(i + 1, len(tlinks[0])):
                            pos = str(row) + ":" + str(i) + ":" + str(j)
                            linksL = tlinks[row][i]
                            linksR = tlinks[row][j]
                            if set(linksL) == set(linksR):
                                continue
                            if len(linksL) == 0 or len(linksR) == 0:
                                continue
                            for ll in linksL:
                                for lr in linksR:
                                    lla = wikiLink(ll)
                                    llb = wikiLink(lr)
                                    if lla != "" and llb != "" and lla != llb:
                                        out += line + pos + "\t" + colHeaders[i + 1] + "\t" + colHeaders[j + 1] + "\t" + lla + "\t" + llb + "\n"
                                        write = True

                # No usable links: still emit one empty-field line.
                if not write:
                    out += line + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\n"
            else:
                if len(setrH) > 0:
                    if len(setrH) == 1 and "spancol" in table.rowHeaders[0]:
                        continue
                    out += line + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\n"
    except Exception:
        # Best-effort: log the failing file and keep whatever was built.
        # (Narrowed from a bare `except:` so KeyboardInterrupt still works.)
        print("Error file: ", fileName)
        traceback.print_exc()
    return out