def nodedist(self,n1,n2):
    """Build the pairwise distance features between two tree nodes.

    Each node is walked upwards until an element carrying a ``'predict'``
    attribute is reached (the node itself qualifies).  The two resulting
    ancestor chains, ordered from that element down to the node, are then
    compared to derive structural features; geometric features come from the
    nodes' ``'left'`` and ``'width'`` attributes.

    Args:
        n1: First element.  Must expose ``attrib`` (with integer-valued
            ``'size'``, ``'left'`` and ``'width'`` entries), ``tag`` and
            ``getparent()``.
        n2: Second element, same requirements as ``n1``.

    Returns:
        A ``Detector.DistComposer`` with these attributes set:
        ``local_position_dist``, ``local_down_dist``, ``nop_dist``,
        ``left_diff``, ``same_line_seperate`` and ``width_ratio``.

    Raises:
        AttributeError: If the upward walk runs out of parents before finding
            a ``'predict'`` attribute (assuming ``getparent()`` returns
            ``None`` at the root, as in lxml).
        KeyError: If a required attribute is missing.
        ZeroDivisionError: If the combined size is 0, both widths are 0, or
            the smaller width is 0.
    """
    # NOTE(review): this block's indentation was rebuilt from a
    # whitespace-collapsed source; confirm the nesting against the original.

    # Ancestor chains, top-down: 'predict' element first, the node itself last.
    chains = []
    for node in (n1, n2):
        chain = []
        while 'predict' not in node.attrib:
            chain.append(node)
            node = node.getparent()
        chain.append(node)
        chain.reverse()
        chains.append(chain)
    chain1, chain2 = chains

    # 1 when both nodes hang under the very same 'predict' element, else 0.
    same_line_seperate = 1 if chain1[0] == chain2[0] else 0

    # Deepest level up to which the two chains carry identical tags
    # (stays 0 when the chains already differ at the top).
    depth = 0
    for level, (elem1, elem2) in enumerate(zip(chain1, chain2)):
        if elem1.tag != elem2.tag:
            break
        depth = level

    totalsize = int(chain1[0].attrib['size']) + int(chain2[0].attrib['size'])
    coversize = int(chain1[depth].attrib['size']) + int(chain2[depth].attrib['size'])
    subsize = int(n1.attrib['size']) + int(n2.attrib['size'])

    # ``** 0.5`` replaces the deprecated ``scipy.sqrt`` alias; like that alias
    # it yields a complex value instead of raising for a negative ratio.
    local_position_dist = ((coversize - subsize) / (totalsize - subsize + 1)) ** 0.5
    # First item of the tree-edit-distance result, normalised by combined size.
    local_down_dist = findBackbone.tree_edit_distance(n1, n2)[0] / subsize

    left1 = int(n1.attrib['left'])
    left2 = int(n2.attrib['left'])
    width1 = int(n1.attrib['width'])
    width2 = int(n2.attrib['width'])

    # Horizontal overlap of the two [left, left + width] spans relative to
    # their mean width; negative when the spans are disjoint.
    maxleft = max(left1, left2)
    minright = min(left1 + width1, left2 + width2)
    overlap = 2 * (minright - maxleft) / (width1 + width2)

    dc = Detector.DistComposer()
    dc.local_position_dist = local_position_dist
    dc.local_down_dist = local_down_dist
    dc.nop_dist = 1 - overlap
    dc.left_diff = abs(left1 - left2)
    dc.same_line_seperate = same_line_seperate
    # 0 for equal widths, growing as the widths diverge.
    dc.width_ratio = max(width1, width2) / min(width1, width2) - 1
    return dc
def locateTitleLine(xmldir,max_runner_up,threshold):
    """Evaluate backbone-based title-line detection and print the tallies.

    Every ``*.xml`` file in ``xmldir`` is parsed (with a recovering parser)
    and its first element labelled ``LABEL['LIST']`` is taken as the list
    region; files without one are skipped.  A backbone is derived from the
    region via ``findBackbone.findPrinciple`` / ``findBackbone.findBackbone``.
    When the backbone's labels (per ``collectLabel``) equal those of the list
    region, every child of the region whose edit script against the backbone
    contains no ``'d'`` operation is counted as a predicted title line and
    compared with its actual ``label`` attribute.  Otherwise the file is
    printed and tallied as an error under the root element's ``fid``.

    Args:
        xmldir: Directory containing the ``*.xml`` files to evaluate.
        max_runner_up: Forwarded unchanged to ``findBackbone.findPrinciple``.
        threshold: Forwarded unchanged to ``findBackbone.findPrinciple``.

    Returns:
        None.  Progress (every 200 files) and the final counts are printed.
    """
    # NOTE(review): this block's indentation was rebuilt from a
    # whitespace-collapsed source; confirm the nesting (in particular which
    # ``if`` the error branch belongs to) against the original.
    filelist = glob.glob(path.join(xmldir,'*.xml'))
    parser = etree.XMLParser(recover=True)

    truepositive = 0
    falsepositive = 0
    truenegative = 0
    falsenegative = 0
    # NOTE(review): these three counters are reported at the end but nothing
    # in this function ever updates them, so they always print 0.
    num_err_fn = 0
    num_err_fp = 0
    num_err = 0
    num_total = 0
    num_backsufficient = 0
    errfids = {}

    for num, filename in enumerate(filelist):
        tree = etree.parse(filename, parser=parser)
        # './/' is the explicit spelling of what the tree-level find() turns
        # a leading '//' into; it avoids the FutureWarning for absolute paths.
        listregion = tree.find('.//*[@label="%s"]' % LABEL['LIST'])
        if num % 200 == 0:
            print(num/len(filelist))
        if listregion is None:
            continue

        lines = findBackbone.findPrinciple(listregion,max_runner_up,threshold)
        truelines = listregion.findall('*[@label="%s"]' % LABEL['TITLE_LINE'])
        tp = 0
        fp = 0
        try:
            backbone = findBackbone.findBackbone(lines)
        except Exception:
            # Best effort: report the file and still score it with tp = fp = 0.
            print('Error:',filename)
        else:
            if collectLabel(backbone) == collectLabel(listregion):
                num_backsufficient += 1
                for line in listregion:
                    _, ops = findBackbone.tree_edit_distance(backbone,line)
                    # An op starting with 'd' is presumably a deletion —
                    # confirm against findBackbone.tree_edit_distance.
                    if any(op[0] == 'd' for op in ops):
                        continue
                    if line.attrib['label'] == LABEL['TITLE_LINE']:
                        tp += 1
                    else:
                        fp += 1
            else:
                print(filename)
                fid = tree.getroot().attrib['fid']
                errfids[fid] = errfids.get(fid,0) + 1

        fn = len(truelines) - tp
        tn = len(listregion) - (tp+fp) - fn
        num_total += 1
        falsepositive += fp
        truepositive += tp
        truenegative += tn
        falsenegative += fn

    print('true positive:', truepositive)
    print('false positive:', falsepositive)
    print('true negative:', truenegative)
    print('false negative:', falsenegative)
    print('num of list false positive:', num_err_fp)
    print('num of list false negative:', num_err_fn)
    print('num of list with err:', num_err)
    print('total numer of list:', num_total)
    print('num of sufficent backbone:',num_backsufficient)
    print('num of err sites:',len(errfids))
    print('err num in each site:',errfids)