def analyse(self, xmlfilelist):
    """Evaluate title-line and column detection over a list of XML files.

    For every file the tree is parsed (with a recovering parser), the
    structure/feature extraction steps are run on the first child of the
    root, ``self.detector`` is applied, and the result is scored with
    ``self.calcAccTitleLine``.  Column accuracy (``self.calcAccColumn``)
    is only evaluated for files without title-line false negatives.

    Running totals are printed every 200 files and once more at the end.
    Paths of files with false negatives are printed as they are found.

    :param xmlfilelist: sized sequence of XML file paths (``len`` is
        called on it for the progress ratio).
    :return: None; all results are written to stdout.
    """
    parser = etree.XMLParser(recover=True)

    total_tp = total_fp = total_fn = 0
    # Per-``fid`` counts of files with at least one false negative / positive.
    fnfid = {}
    fpfid = {}
    # Number of files in which each column type was reported as erroneous.
    cerrs = {
        column: 0
        for column in ('AUTHOR', 'COUNT_READ', 'COUNT_REPLY',
                       'TITLE', 'TM_POST', 'TM_REPLY')
    }
    # file basename -> (column errors, fid) for files with column errors.
    cerrfiles = {}

    def report():
        # Dump the current running totals; reads the enclosing locals.
        print('REV.150')
        print('total_tp:', total_tp)
        print('total_fp:', total_fp)
        print('total_fn:', total_fn)
        print('fnfid:', fnfid)
        print('fpfid:', fpfid)
        print('cerrs:', cerrs)
        print('cerrfiles:', cerrfiles)

    for num, xmlfile in enumerate(xmlfilelist):
        if num % 200 == 0:
            # Fraction of files processed so far, followed by the totals.
            print(num / len(xmlfilelist))
            report()

        root = etree.parse(xmlfile, parser).getroot()
        htmlnode = root[0]

        structure = StructureExtractor()
        structure.drawFeature(htmlnode)
        features = Extractor(htmlnode)
        features.process()
        self.detector.detect(htmlnode)

        tp, fp, fn = self.calcAccTitleLine(htmlnode)
        total_tp += tp
        total_fp += fp
        total_fn += fn

        if fn == 0:
            # Columns are only scored when no title line was missed.
            cerr = self.calcAccColumn(htmlnode)
            for column in cerr:
                cerrs[column] += 1
            if len(cerr) > 0:
                cerrfiles[path.basename(xmlfile)] = (cerr, root.attrib['fid'])

        if fp > 0:
            fid = root.attrib['fid']
            fpfid[fid] = fpfid.get(fid, 0) + 1

        if fn > 0:
            print(xmlfile)
            fid = root.attrib['fid']
            fnfid[fid] = fnfid.get(fid, 0) + 1

    report()
def process(self, root):
    """Run feature extraction and detection on ``root`` and publish the detector.

    Steps, in order: draw structural features on the tree, build and run
    an ``Extractor`` (kept on ``self.extractor``), initialise ``Config``,
    run a fresh ``Detector`` on the tree, and hand that detector to
    ``self.toolbox``.

    :param root: tree node passed to the extractors and the detector.
    :return: None.
    """
    StructureExtractor().drawFeature(root)

    self.extractor = Extractor(root)
    self.extractor.process()
    # NOTE: a ``self.crossP`` reconstruction via ``string2sparse`` used to
    # follow here; it is disabled in the original code.

    # Config is initialised before the detector is constructed.
    Config.init()
    new_detector = Detector()
    new_detector.detect(root)
    self.toolbox.setDetector(new_detector)
def performAnalyse(self, htmlnode):
    """Run the full pipeline on one node and score the result.

    The node is passed through ``StructureExtractor.drawFeature``,
    ``Extractor.process`` and a fresh ``Detector`` before being scored.

    :param htmlnode: tree node to annotate and evaluate.
    :return: ``(tp, fp, fn, cerr)`` where the first three values come from
        ``self.calcAccTitleLine`` and ``cerr`` is the result of
        ``self.calcAccColumn``.
    """
    structure = StructureExtractor()
    structure.drawFeature(htmlnode)

    features = Extractor(htmlnode)
    features.process()

    title_detector = Detector()
    title_detector.detect(htmlnode)

    tp, fp, fn = self.calcAccTitleLine(htmlnode)
    column_errors = self.calcAccColumn(htmlnode)
    return (tp, fp, fn, column_errors)
def train(self, xmlfile_list):
    """Fit the naive-Bayes model used for title-line confirmation.

    Every file is parsed with a recovering parser, run through
    ``StructureExtractor.drawFeature``, ``Extractor.process`` and
    ``self.detect``.  Each node whose ``predict`` attribute equals
    ``LABEL['TITLE_LINE']`` becomes one training sample: class 1 when its
    ``label`` attribute also equals ``LABEL['TITLE_LINE']``, class 0
    otherwise.

    Per class the following are estimated and stored on ``self``:

    * ``mtagid``  -- relative frequency of each element tag (discrete).
    * ``mpos``    -- ``[P(position == '0'), P(position != '0')]``.
    * ``mlen``, ``msize``, ``mleft``, ``mwidth``, ``mheight`` --
      ``[mean, std]`` of the integer ``length`` / ``size`` / ``left`` /
      ``width`` / ``height`` attributes (Gaussian features).

    Every model is a two-element list indexed by class (0, then 1).

    :param xmlfile_list: iterable of XML file paths to train on.
    :return: ``(mtagid, mpos, mlen, msize, mleft, mwidth, mheight)``.
    :raises ZeroDivisionError: if either class ends up with no samples
        (the position counts of that class sum to zero).
    :raises KeyError: if a sample node lacks one of the feature attributes.
    """
    # ``scipy.array`` was only a re-export of ``numpy.array`` and is
    # deprecated in the SciPy root namespace, so NumPy is used directly.
    # Imported locally so the method does not depend on a file-level import.
    import numpy

    # Attribute names of the Gaussian features, in extraction order.
    gaussian_features = ('length', 'size', 'left', 'width', 'height')

    parser = etree.XMLParser(recover=True)

    # All accumulators are indexed by class: 0 = not a title line, 1 = title line.
    tag_counts = [{}, {}]
    position_counts = [[0, 0], [0, 0]]  # [first, not first] per class
    samples = {name: [[], []] for name in gaussian_features}

    for xmlfile in xmlfile_list:
        print(xmlfile)
        root = etree.parse(xmlfile, parser).getroot()
        htmlnode = root[0]

        structure = StructureExtractor()
        structure.drawFeature(htmlnode)
        features = Extractor(htmlnode)
        features.process()
        self.detect(htmlnode)

        title_label = LABEL['TITLE_LINE']
        candidates = htmlnode.findall('.//*[@predict="{}"]'.format(title_label))
        for node in candidates:
            attrs = node.attrib
            is_title = 'label' in attrs and attrs['label'] == title_label
            cls = 1 if is_title else 0

            # tagid feature: discrete
            per_tag = tag_counts[cls]
            per_tag[node.tag] = per_tag.get(node.tag, 0) + 1

            # position: first or not first
            slot = 0 if attrs['position'] == '0' else 1
            position_counts[cls][slot] += 1

            # length / size / left / width / height: Gaussian
            for name in gaussian_features:
                samples[name][cls].append(int(attrs[name]))

    # tagid feature: normalise the counts of each class to frequencies.
    self.mtagid = []
    for per_tag in tag_counts:
        total = sum(per_tag.values())
        self.mtagid.append({tag: count / total for tag, count in per_tag.items()})

    # position: normalise [first, not first] within each class.
    self.mpos = [
        [count / sum(pair) for count in pair]
        for pair in position_counts
    ]

    def gaussian(per_class):
        # [mean, std] for class 0 and class 1 of one numeric feature.
        arrays = [numpy.array(values) for values in per_class]
        return [[arr.mean(), arr.std()] for arr in arrays]

    self.mlen = gaussian(samples['length'])
    self.msize = gaussian(samples['size'])
    self.mleft = gaussian(samples['left'])
    self.mwidth = gaussian(samples['width'])
    self.mheight = gaussian(samples['height'])

    return (self.mtagid, self.mpos, self.mlen, self.msize,
            self.mleft, self.mwidth, self.mheight)