예제 #1
0
def updateJsonFile(fileName):
    """Reload a table JSON file, recompute its column headers and rewrite it.

    The table id is taken from the last ``/``-separated component of
    ``fileName`` with ``.json`` removed and every ``_`` replaced by ``.``.
    The decoded table is then updated and handed to ``writeTable``:

    * no table type (``None`` or empty value): headers are cleared and the
      start row is reset to 0; the table type is left untouched;
    * any type other than ``TableType.WELL_FORMED``, or a well-formed table
      whose ``startRows`` is 0: the type is re-set from its value, headers
      are cleared and the start row is set to 0;
    * well-formed table with ``startRows`` != 0: the type is re-set and the
      headers are recomputed with ``readHTML.getColHeaderAllLevels``.

    Args:
        fileName: Path of the JSON file holding one serialized table.
    """
    print("filename: ", fileName)
    tableId = fileName.split("/")[-1].replace(".json", "").replace("_", ".")

    # Context manager so the handle is released even if parsing fails.
    with open(fileName, "r") as file:
        obj = json.loads(file.read())

    # Converting json to Table object
    table = ComplexDecoderTable().default(obj)

    if table.tableType is None or table.tableType.value == "":
        table.setColHeaders([])
        table.setStartRows(0)
        writeTable(table, tableId)
        return

    # Capture everything that depends on the current type *before* calling
    # setTableType, mirroring the evaluation order of the original branches.
    typeValue = table.tableType.value
    isWellFormed = typeValue == TableType.WELL_FORMED.value
    startRow = table.startRows if isWellFormed else 0

    table.setTableType(typeValue)
    if isWellFormed and startRow != 0:
        # NOTE(review): the start row returned here is discarded, so the
        # stored startRows is not refreshed -- confirm this is intended.
        _, colHeadersType = readHTML.getColHeaderAllLevels(
            table.htmlMatrix, table.startRows, textProcessing)
        table.setColHeaders(colHeadersType)
    else:
        table.setColHeaders([])
        table.setStartRows(startRow)
    writeTable(table, tableId)
def createTables(filename):
    """Extract the tables of one bz2-compressed HTML article and save them.

    Each table that is not nested inside another table's ``th``/``td``/``tr``
    is converted with ``readHTML.tableTo2d``. Tables kept as valid are written
    as ``<cont>_<n>.json`` under ``FOLDER_TABLES_OUT``; if at least one table
    was kept, the whole article is written as ``<cont>.json`` under
    ``FOLDER_OUT``. Errors on a single table are printed and that table is
    skipped; errors on the article as a whole are printed and logged.

    Args:
        filename: ``"<path to .bz2 file>##$##<article counter>"``.

    Returns:
        One tab-separated, newline-terminated line with seven fields: the
        article counter followed by the counts of ILL_FORMED, NO_PROCESSED,
        WELL_FORMED, SMALLTABLE, WITH_INNER_TABLE and FORMAT_BOX tables.
        When the file cannot be read, all six counts are 0.

    Raises:
        ValueError, IndexError: if ``filename`` lacks the ``##$##`` separator
            or the counter part is not an integer.
    """
    parts = filename.split("##$##")
    file = parts[0]
    fileNamebz = file.split("/")[-1]
    cont = int(parts[1])
    print("cont: ", cont)
    try:
        with bz2.BZ2File(file, "rb") as bzFile:
            soup = BeautifulSoup(bzFile.read(), 'html.parser')
    except Exception:
        print("Error reading file: ", filename)
        # Same seven-column layout as the regular return value (the counter
        # must be separated from the first count by a tab).
        return str(cont) + "\t0\t0\t0\t0\t0\t0\n"

    dictStat = {}
    dictStat[TableType.ILL_FORMED.value] = 0
    dictStat["NO_PROCESSED"] = 0
    dictStat[TableType.WELL_FORMED.value] = 0
    dictStat[TableType.SMALLTABLE.value] = 0
    dictStat[TableType.WITH_INNER_TABLE.value] = 0
    dictStat[TableType.FORMAT_BOX.value] = 0
    try:
        title = readHTML.readTitle(soup)
        tables = readHTML.readTables(soup)
        tables2d = []
        contTables = 1
        formatTables = 0
        for it, t in enumerate(tables):
            try:
                # Skip tables nested inside a cell/row of another table.
                parents = [p.name for p in t.findParents()]
                if t.parent is not None and ("th" in parents or "td" in parents or "tr" in parents):
                    continue
                start = time.time()
                listt2d = readHTML.tableTo2d(t)
                logging.debug("Time reading table: " + str(time.time() - start))
                validTables = []
                if listt2d is None or len(listt2d) == 0 or len(listt2d) > 10:
                    # No usable 2D form, or the table was split into more
                    # than 10 pieces: keep the raw table as ill-formed.
                    # NOTE(review): the >10 case used to append an
                    # unassigned/stale `newTable` and then `continue` past
                    # the save loop; it now follows the ill-formed path.
                    newTable = readHTML.saveIllTable(t, TableType.ILL_FORMED.value)
                    if newTable is not None:
                        validTables.append(newTable)
                        dictStat[TableType.ILL_FORMED.value] += 1
                    else:
                        dictStat["NO_PROCESSED"] += 1
                else:
                    for t2d in listt2d:
                        if t2d.tableType == TableType.FORMAT_BOX.value:
                            dictStat[TableType.FORMAT_BOX.value] += 1
                            formatTables += 1
                            continue

                        if t2d.tableType == TableType.SMALLTABLE.value:
                            dictStat[TableType.SMALLTABLE.value] += 1
                            continue

                        if t2d.tableType == TableType.ILL_FORMED.value:
                            dictStat[TableType.ILL_FORMED.value] += 1
                            validTables.append(t2d)
                            continue

                        if t2d.tableType == TableType.WITH_INNER_TABLE.value:
                            dictStat[TableType.WITH_INNER_TABLE.value] += 1
                            validTables.append(t2d)
                            continue

                        # Remaining tables are counted as well-formed; the
                        # headers are refined only if a header row was found.
                        startRows, _ = readHTML.getMainColHeaders(t2d.htmlMatrix)
                        if startRows > 0:
                            startRows, colHeadersType = readHTML.getColHeaderAllLevels(
                                t2d.htmlMatrix, startRows, textProcessing)
                            t2d.setStartRows(startRows)
                            t2d.setColHeaders(colHeadersType)
                            t2d.ncols = len(colHeadersType)
                        validTables.append(t2d)
                        dictStat[TableType.WELL_FORMED.value] += 1

                for t2d in validTables:
                    tableId = str(cont) + "." + str(contTables)
                    t2d.setTableId(tableId)
                    t2d.setArticleId(str(cont))
                    t2d.setArticleTitle(title)
                    t2d.setArticlePath(fileNamebz)
                    tables2d.append(t2d)
                    tablePath = os.path.join(
                        FOLDER_TABLES_OUT, tableId.replace(".", "_") + ".json")
                    with open(tablePath, "w") as ft:
                        ft.write(json.dumps(t2d.reprJSON(), cls=ComplexEncoder, skipkeys=True))

                    contTables += 1
            except Exception:
                # Best effort: report the failing table and go on.
                traceback.print_exc()
                print("Error: ", filename, it)
                continue
        if len(tables2d) > 0:
            article = Article(articleId=str(cont), title=title, tables=tables2d)
            with open(FOLDER_OUT + "/" + str(cont) + ".json", "w") as f:
                f.write(json.dumps(article.reprJSON(), cls=ComplexEncoder, skipkeys=True))
        else:
            if len(tables) == formatTables:
                logging.debug("Format table: " + filename)
            else:
                logging.debug("Error none useful table: " + filename)
        logging.debug(dictStat)
    except Exception:
        traceback.print_exc()
        # A placeholder is required for the extra logging argument.
        logging.debug("Error file %s", filename)

    return "\t".join([
        str(cont),
        str(dictStat[TableType.ILL_FORMED.value]),
        str(dictStat["NO_PROCESSED"]),
        str(dictStat[TableType.WELL_FORMED.value]),
        str(dictStat[TableType.SMALLTABLE.value]),
        str(dictStat[TableType.WITH_INNER_TABLE.value]),
        str(dictStat[TableType.FORMAT_BOX.value]),
    ]) + "\n"