Python StructureExtractor 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: drawFeature

클래스/타입: StructureExtractor

hotexamples.com에서의 예제들: 4

Python StructureExtractor 예제 4개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python drawFeature.StructureExtractor의 실제 사용 예제 중 평점이 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

drawFeature(4)

예제 #1

파일 보기

파일: Analyser.py 프로젝트: cheungzq/WebParser

    def analyse(self, xmlfilelist):
        """Run the full pipeline over every XML file and print accuracy tallies.

        Each file is parsed with a recovering XML parser; the first child of
        the document root is passed through ``StructureExtractor.drawFeature``,
        ``Extractor.process`` and ``self.detector.detect``.  The results of
        ``calcAccTitleLine`` and ``calcAccColumn`` are then accumulated.

        A progress ratio and the running tallies are printed before every
        200th file (including the very first), and the final tallies are
        printed once all files are done.  Nothing is returned.

        Args:
            xmlfilelist: Sized iterable of XML file paths (``len()`` is used
                for the progress ratio).
        """
        xml_parser = etree.XMLParser(recover=True)
        total_tp = 0
        total_fp = 0
        total_fn = 0
        # Per-fid counts of files with at least one fn / fp, keyed by the
        # root element's 'fid' attribute.
        fnfid = {}
        fpfid = {}
        # Number of files in which each column key was reported by
        # calcAccColumn.
        cerrs = {
            'AUTHOR': 0,
            'COUNT_READ': 0,
            'COUNT_REPLY': 0,
            'TITLE': 0,
            'TM_POST': 0,
            'TM_REPLY': 0,
        }
        # basename -> (column result, fid) for files with a non-empty result.
        cerrfiles = {}

        def report():
            # Shared by the periodic progress dump and the final summary.
            print('REV.150')
            print('total_tp:', total_tp)
            print('total_fp:', total_fp)
            print('total_fn:', total_fn)
            print('fnfid:', fnfid)
            print('fpfid:', fpfid)
            print('cerrs:',cerrs)
            print('cerrfiles:', cerrfiles)

        for index, xmlfile in enumerate(xmlfilelist):
            if index % 200 == 0:
                print(index/len(xmlfilelist))
                report()

            root = etree.parse(xmlfile, xml_parser).getroot()
            htmlnode = root[0]

            # The pipeline stages run in this fixed order on the same node.
            StructureExtractor().drawFeature(htmlnode)
            Extractor(htmlnode).process()
            self.detector.detect(htmlnode)

            tp, fp, fn = self.calcAccTitleLine(htmlnode)
            total_tp += tp
            total_fp += fp
            total_fn += fn

            # Column accuracy is only evaluated when calcAccTitleLine
            # reported fn == 0 for this file.
            if fn == 0:
                cerr = self.calcAccColumn(htmlnode)
                for column in cerr:
                    cerrs[column] += 1
                if len(cerr) > 0:
                    cerrfiles[path.basename(xmlfile)] = (cerr, root.attrib['fid'])
            if fp > 0:
                fid = root.attrib['fid']
                fpfid[fid] = fpfid.get(fid, 0) + 1
            if fn > 0:
                print(xmlfile)
                fid = root.attrib['fid']
                fnfid[fid] = fnfid.get(fid, 0) + 1

        report()

예제 #2

파일 보기

파일: visualizer.py 프로젝트: cheungzq/WebParser

 def process(self,root):
     """Run feature drawing, extraction and detection on ``root``.

     The ``Extractor`` built for ``root`` is kept on ``self.extractor``;
     the ``Detector`` that processed ``root`` is handed to
     ``self.toolbox.setDetector``.
     """
     StructureExtractor().drawFeature(root)

     self.extractor = Extractor(root)
     self.extractor.process()

     # Config.init() is called before the Detector is constructed.
     Config.init()
     new_detector = Detector()
     new_detector.detect(root)
     self.toolbox.setDetector(new_detector)

예제 #3

파일 보기

파일: Analyser.py 프로젝트: cheungzq/WebParser

 def performAnalyse(self, htmlnode):
     """Run the pipeline on one node and return its accuracy figures.

     ``htmlnode`` is passed through ``StructureExtractor.drawFeature``,
     ``Extractor.process`` and a fresh ``Detector``; the node is then
     scored with ``calcAccTitleLine`` and ``calcAccColumn``.

     Returns:
         ``(tp, fp, fn, cerr)``: the three values from
         ``calcAccTitleLine`` (presumably true-positive, false-positive
         and false-negative counts -- confirm against that method)
         followed by the result of ``calcAccColumn``.
     """
     StructureExtractor().drawFeature(htmlnode)
     Extractor(htmlnode).process()
     Detector().detect(htmlnode)

     title_tp, title_fp, title_fn = self.calcAccTitleLine(htmlnode)
     column_errors = self.calcAccColumn(htmlnode)
     return (title_tp, title_fp, title_fn, column_errors)

예제 #4

파일 보기

파일: Detector.py 프로젝트: cheungzq/WebParser

    def train(self, xmlfile_list):
        """Fit the naive Bayes title-line confirmation model from XML files.

        Every file is parsed with a recovering XML parser and its first
        child node is run through ``StructureExtractor.drawFeature``,
        ``Extractor.process`` and ``self.detect``.  Each node whose
        ``predict`` attribute equals ``LABEL['TITLE_LINE']`` is a training
        sample: class 1 if its ``label`` attribute also equals
        ``LABEL['TITLE_LINE']``, class 0 otherwise.

        The fitted parameters are stored on ``self`` and returned; in all of
        them index 0 is class 0 and index 1 is class 1:

        * ``mtagid``  -- two dicts mapping tag name to relative frequency.
        * ``mpos``    -- two ``[p(position == '0'), p(position != '0')]`` pairs.
        * ``mlen``, ``msize``, ``mleft``, ``mwidth``, ``mheight`` -- two
          ``[mean, std]`` pairs each, computed from the integer ``length``,
          ``size``, ``left``, ``width`` and ``height`` attributes.

        Args:
            xmlfile_list: Iterable of XML file paths to train on.

        Returns:
            ``(mtagid, mpos, mlen, msize, mleft, mwidth, mheight)``.

        Raises:
            ZeroDivisionError: If either class ends up with no samples (the
                position frequencies cannot be normalised).
        """
        # scipy.array was only a deprecated alias of numpy.array; use NumPy
        # directly so the method keeps working with current SciPy.
        import numpy as np

        parser = etree.XMLParser(recover=True)

        # model for title line confirmation, using naive bayes
        gaussian_attrs = ('length', 'size', 'left', 'width', 'height')
        tag_counts = [{}, {}]            # tagid feature: discrete
        pos_counts = [[0, 0], [0, 0]]    # position: first or not first
        samples = {attr: [[], []] for attr in gaussian_attrs}

        title_label = LABEL['TITLE_LINE']
        candidate_xpath = './/*[@predict="{}"]'.format(title_label)

        for xmlfile in xmlfile_list:
            print(xmlfile)
            root = etree.parse(xmlfile, parser).getroot()
            htmlnode = root[0]
            se = StructureExtractor()
            se.drawFeature(htmlnode)
            extractor = Extractor(htmlnode)
            extractor.process()
            self.detect(htmlnode)

            for node in htmlnode.findall(candidate_xpath):
                is_title = ('label' in node.attrib
                            and node.attrib['label'] == title_label)
                cls = 1 if is_title else 0

                tag_counts[cls][node.tag] = tag_counts[cls].get(node.tag, 0) + 1
                slot = 0 if node.attrib['position'] == '0' else 1
                pos_counts[cls][slot] += 1
                for attr in gaussian_attrs:
                    samples[attr][cls].append(int(node.attrib[attr]))

        # tagid feature: discrete
        self.mtagid = []
        for counts in tag_counts:
            total = sum(counts.values())
            self.mtagid.append({tag: n / total for tag, n in counts.items()})

        # position: first or not first
        self.mpos = [[n / sum(counts) for n in counts] for counts in pos_counts]

        def _mean_std(values):
            # Gaussian parameters of one class for one feature.
            arr = np.array(values)
            return [arr.mean(), arr.std()]

        # length / size / left / width / height: Gaussian
        self.mlen = [_mean_std(v) for v in samples['length']]
        self.msize = [_mean_std(v) for v in samples['size']]
        self.mleft = [_mean_std(v) for v in samples['left']]
        self.mwidth = [_mean_std(v) for v in samples['width']]
        self.mheight = [_mean_std(v) for v in samples['height']]

        return (self.mtagid, self.mpos, self.mlen, self.msize, self.mleft, self.mwidth, self.mheight)