Example #1
def TrainUsingCRF(xmls, preprocessor, trainer):
    CRFImpl = CRF()
    annotatedxmllist = list()
    for xmlname in xmls:
        fontdict = preprocessor.getFontDictionary(ET.parse("../TrainingData/xmls/cs/" + xmlname + ".xml")) #list(pages), pages -> list(cols), col -> list(<Sparse/NonSparse, tag>)
        annotatedxml = trainer.readAnnotatedXml('../TrainingData/annotated/' + xmlname + "_annotated")
        annotatedxmllist.append([annotatedxml, fontdict])
    CRFImpl.domaintrain(annotatedxmllist)
    with open("TrainedWeightsCRF", 'w') as f:
        for weight in CRFImpl.trainedweights:
            f.write(str(weight) + "\n")
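A minimal usage sketch for this training entry point. Preprocessor and Trainer stand for whatever project classes are normally passed in as preprocessor and trainer; their names, like the document base names below, are assumptions rather than something shown in the snippet above.

# Hypothetical driver; Preprocessor() and Trainer() are assumed project
# classes, and "paper1"/"paper2" are placeholder document base names.
preprocessor = Preprocessor()
trainer = Trainer()
xmls = ["paper1", "paper2"]        # ".xml" and "_annotated" suffixes are added inside
TrainUsingCRF(xmls, preprocessor, trainer)
# the learned weights are written one per line to "TrainedWeightsCRF"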
Example #2
def TestUsingCRF(predictxmlname, location):
    CRF = getModelwithTrainedWeights()
    fontdict = preprocessor.getFontDictionary(ET.parse(location + predictxmlname + ".xml"))                  
    preprocessedxml = preprocessor.preprocessxml(location + predictxmlname + ".xml") #list(pages), pages -> list(cols), col -> list(<Sparse/NonSparse, tag>)
    alltables = list()
    for page in preprocessedxml:
        for col in page:
            if (len(col) < 2):
                continue
            for lineno in xrange(len(col)):
                col[lineno].append(lineno)
            predicted = CRF.predict(col, fontdict)
            for r in predicted:
                # a filter such as r[0] == SparseType.OTHERSPARSE can be re-enabled here
                print r[1].text + " *** Line no *** " + str(r[2]) + " --  " + str(r[0])
            tables = postprocessor.findTables(predicted)
            if (len(tables) == 0):
                continue
            for t in tables:
                alltables.append(t)
    
    for table in alltables:
        print "============================================="
        for row in table:
            print row[1].text + " " + str(row[0]) 
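For orientation, a rough, self-contained sketch of the structure the loops above walk over, following the "list(pages), pages -> list(cols), col -> list(<Sparse/NonSparse, tag>)" comment. The element class and the label values are stand-ins for the project's real XML elements and SparseType members, not the actual types.

class _Line(object):                       # stand-in for an XML line element with .text
    def __init__(self, text):
        self.text = text

OTHERSPARSE, NONTABLELINE = 0, 1           # stand-ins for SparseType members
preprocessedxml = [                        # document -> list of pages
    [                                      # page -> list of columns
        [                                  # column -> list of [label, element] pairs
            [OTHERSPARSE, _Line("Table 1: Results")],
            [NONTABLELINE, _Line("Ordinary body text")],
        ],
    ],
]
for page in preprocessedxml:               # same traversal as TestUsingCRF
    for col in page:
        for lineno in xrange(len(col)):    # a line number is appended to each entry
            col[lineno].append(lineno)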
def TestUsingCRF(predictxmlname, location, TDsvm=None):
    CRF = getModelwithTrainedWeights()
    fontdict = preprocessor.getFontDictionary(
        ET.parse(location + predictxmlname + ".xml"))
    preprocessedxml = preprocessor.preprocessxml(
        location + predictxmlname + ".xml"
    )  #list(pages), pages -> list(cols), col -> list(<Sparse/NonSparse, tag>)

    alltables = list()
    errorcount = 0
    sparseerror = 0
    ntlafterpostproc = 0
    for page in preprocessedxml:
        for col in page:
            if (len(col) < 2):
                continue
            # drop lines with empty text; rebuilding the list avoids the
            # skipped-element bug of removing items while iterating
            col[:] = [
                tup for tup in col
                if tup[1].text is not None and tup[1].text.strip() != ''
            ]
            for lineno in xrange(len(col)):
                col[lineno].append(lineno)

            result = CRF.predict(col, fontdict)
            predicted = result[0]
            errorcount += result[1]
            sparseerror += result[2]
            #            for r in predicted:
            #                if(r[0] == SparseType.OTHERSPARSE):
            #                    print r[1].text.encode('ascii','ignore') + " *** Line no *** " + str(r[2])
            tables = postprocessor.findTables(predicted)
            if (len(tables) == 0):
                continue
            for t in tables:
                alltables.append(t)

    if TDsvm is None:
        for table in alltables:
            print "============================================="
            for row in table:
                if (int(row[0]) == SparseType.NONTABLELINE):
                    ntlafterpostproc += 1
                print row[1].text.encode('ascii', 'ignore')
            print "=============================================="

    else:
        for t in alltables:
            predicted = TDsvm.domainpredictforTableDecomposition(t)
            print "=============================================="
            for r in predicted[0]:
                if (r[0] == SparseType.HEADER):
                    print r[1].text + "  ---> HEADER "
                else:
                    print r[1].text + "  ---> DATA "
            print "=============================================="

    return [errorcount, sparseerror, ntlafterpostproc]
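A short usage note for this variant, assuming the module-level preprocessor and postprocessor objects it relies on are already set up: it returns the [errorcount, sparseerror, ntlafterpostproc] counters accumulated above, and TDsvm is an optional table-decomposition model from the project. The document name and location below are placeholders.

# Hypothetical call; the name and location are placeholders.
counters = TestUsingCRF("paper1", "../TrainingData/xmls/cs/")
errorcount, sparseerror, ntlafterpostproc = counters
print "CRF prediction errors:", errorcount
print "sparse-line errors:", sparseerror
print "non-table lines left after postprocessing:", ntlafterpostproc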
def TrainUsingCRF(xmls, preprocessor, trainer, xmlloc, annotatedxmlloc):
    CRFImpl = CRF()
    annotatedxmllist = list()
    for xmlname in xmls:
        fontdict = preprocessor.getFontDictionary(
            ET.parse(xmlloc + xmlname + ".xml")
        )  #list(pages), pages -> list(cols), col -> list(<Sparse/NonSparse, tag>)
        annotatedxml = trainer.readAnnotatedXml(annotatedxmlloc + xmlname +
                                                "_annotated")
        annotatedxmllist.append([annotatedxml, fontdict])

    CRFImpl.domaintrain(annotatedxmllist)
    print CRFImpl.trainedweights
    with open("TrainedWeightsCRF", 'w') as f:
        for weight in CRFImpl.trainedweights:
            f.write(str(weight) + "\n")
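A minimal sketch of calling this parameterised variant. The directory arguments mirror the hard-coded paths of the first example; they, and the Preprocessor/Trainer constructors, are assumptions here.

# Hypothetical call with explicit input locations (placeholder paths).
TrainUsingCRF(["paper1", "paper2"], Preprocessor(), Trainer(),
              "../TrainingData/xmls/cs/", "../TrainingData/annotated/")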
def getModelwithTrainedWeights(isCRF=True):
    # Rebuild a model from the weights file written at training time.
    trainedweights = list()
    weightsfile = "TrainedWeightsCRF" if isCRF else "TrainedWeightsLR"
    with open(weightsfile, "r") as f:
        for weight in f:
            trainedweights.append(float(weight))
    if (isCRF):
        return CRF(trainedweights)
    else:
        return LogisticRegressor(trainedweights)
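Usage note: with isCRF left at its default the weights are read back from "TrainedWeightsCRF" into a CRF model; passing isCRF=False loads "TrainedWeightsLR" into a LogisticRegressor instead.

crfmodel = getModelwithTrainedWeights()             # reads "TrainedWeightsCRF"
lrmodel = getModelwithTrainedWeights(isCRF=False)   # reads "TrainedWeightsLR"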