def _read_html(location):
    """Return the raw contents of an http URL or of a local file path."""
    if location[0:4] == "http":
        import urllib
        source = urllib.urlopen(location)
    else:
        source = open(location, 'r')
    try:
        return source.read()
    finally:
        # Release the handle right away instead of relying on garbage
        # collection; a benchmark run may go through many documents.
        source.close()


def file(files, extractor, configurator, output=None):
    """
    Run an extractor over a set of documents and print one benchmark row
    for each of them.

    Every entry of ``files`` is read (through ``urllib`` when it starts with
    "http", from the local filesystem otherwise), parsed with ``ParseDom``
    and handed to both the proof object and the extractor.  A tab-separated
    row is printed with the document number, precision, recall, the number
    of extracted 'table' labels, ``len()`` of the proof and the file name.

    @param files: iterable of file paths or http URLs to evaluate
    @param extractor: Extractor used to extract information from the dom tree
    @param configurator: Configurator providing the marker, metric and proof
        objects used to run the benchmark
    @param output: when truthy, the extractor result for document N is also
        written to 'out/N.html'
    """
    parser = ParseDom()
    marker = configurator.marker()
    metric = configurator.metric()
    proof = configurator.proof()

    print("Doc\tPre\tRec\tlext\tlpro\tfile_name")
    for index, filePath in enumerate(files):
        doc_number = index + 1
        fileName = filePath.split('/')[-1]
        marker.reset()

        dom = parser.parse(_read_html(filePath))
        expected = proof.getProof(dom)
        result = extractor.process(dom, marker)

        expected_count = len(expected)
        if 'table' in marker.labels:
            extracted_count = len(marker.labels['table'])
        else:
            extracted_count = 0

        if expected_count == 0:
            # Nothing to find: recall is trivially 1; precision is 1 only
            # when nothing was extracted either.
            if extracted_count == 0:
                scores = (1, 1)
            else:
                scores = (0, 1)
        elif extracted_count == 0:
            # Something was expected but nothing was extracted.  This case
            # used to leave the score unassigned (NameError on the first
            # document, a stale score from the previous one afterwards).
            # Following the conventions above, an empty extraction has
            # precision 1 and misses everything, hence recall 0.
            scores = (1, 0)
        else:
            scores = metric.process(marker.labels['table'], expected)

        precision = scores[0]
        recall = scores[1]
        print("%d\t%.02f\t%0.2f\t%d\t%d\t%s" %
              (doc_number, precision, recall, extracted_count,
               expected_count, fileName))

        if output:
            out = open('out/%d.html' % doc_number, 'w')
            try:
                print >>out, result
            finally:
                # Make sure the dump is flushed and the handle released
                # even if writing fails.
                out.close()
# print 'find productlist' self.labels['productlist'].append(node) elif node.hasAttribute('proof_product') and \ node.getAttribute('proof_product') == 'true': self.labels['product'].append(node) for child in node.childNodes: self.dfs(child) def getProof(self, dom): """ Get a set of nodes """ self.__resetLabels() self.dfs(dom) return self.labels if __name__ == "__main__": from eri.utils.parsedom import ParseDom import sys if len(sys.argv) < 2: print "erro" else: p = ParseDom() d = p.parse(sys.argv[1]) # c = CeProof() # c.getProof(d)
if __name__ == '__main__':
    # Command-line entry point: run the Coloring extractor on one document
    # and print the result to the given output file, or to stdout.
    from eri.utils.parsedom import ParseDom
    from eri.marker import Marker

    if len(sys.argv) < 2:
        raise SystemExit("use: %s <URI> [output_file]" % sys.argv[0])

    filePath = sys.argv[1]
    if len(sys.argv) > 2:
        out = open(sys.argv[2], 'w')
    else:
        out = sys.stdout

    try:
        # http URIs are downloaded; anything else is read as a local file.
        if filePath[0:4] == "http":
            import urllib
            source = urllib.urlopen(filePath)
        else:
            source = open(filePath, 'r')
        try:
            htmlString = source.read()
        finally:
            # Close the input handle explicitly instead of leaking it.
            source.close()

        marker = Marker()
        parser = ParseDom()
        dom = parser.parse(htmlString)
        extractor = Coloring()
        result = extractor.process(dom, marker)
        print >> out, result
    finally:
        # Flush and close a file we opened ourselves; never close stdout.
        if out is not sys.stdout:
            out.close()
if __name__ == '__main__':
    # Command-line entry point: run the Table extractor on one document
    # and print the result to the given output file, or to stdout.
    from eri.utils.parsedom import ParseDom
    from eri.markercoloring import MarkerColoring as Marker

    if len(sys.argv) < 2:
        raise SystemExit("use: %s <URI> [output_file]" % sys.argv[0])

    filePath = sys.argv[1]
    if len(sys.argv) > 2:
        out = open(sys.argv[2], 'w')
    else:
        out = sys.stdout

    try:
        # http URIs are downloaded; anything else is read as a local file.
        if filePath[0:4] == "http":
            import urllib
            source = urllib.urlopen(filePath)
        else:
            source = open(filePath, 'r')
        try:
            htmlString = source.read()
        finally:
            # Close the input handle explicitly instead of leaking it.
            source.close()

        marker = Marker()
        parser = ParseDom()
        # Parse the markup that was just read.  The URI string itself used
        # to be passed here, which left htmlString unused.
        dom = parser.parse(htmlString)
        extractor = Table()
        result = extractor.process(dom, marker)
        print >> out, result
    finally:
        # Flush and close a file we opened ourselves; never close stdout.
        if out is not sys.stdout:
            out.close()