# -*- coding: latin1 -*-
"""
A simple script that collects a list of links from a site, in this case
'http://www.cms.ba.gov.br/vereadores.aspx', visits each link and captures
some data from the page it points to.

Every visited page becomes one object in the "Vereadores" list, which is
saved as both JSON and XML once all links have been processed.
"""
from webcapture import WebCapture

# Root of the site; the hrefs found on the index page are appended to it.
SITE_ROOT = "http://www.cms.ba.gov.br/"

# (field name, CSS selector) pairs captured from each visited page,
# in the order they are written to the object.
FIELD_SELECTORS = (
    ("nome", ".nome_vereador"),
    ("cargo", ".cargo_ver"),
    ("partido", ".partido_vereador"),
    ("telefone", ".telefones_vereador"),
    ("email", ".email_vereador"),
)

wb = WebCapture()
wb.accessUrl(SITE_ROOT + "vereadores.aspx")
wb.creatList("Vereadores")

# The anchors are collected from the index page before any other page is
# loaded; each one is then followed in turn.
profile_links = wb.getTagList("#meio_conteudo .foto_vereador_int", "a")
for link in profile_links:
    wb.accessUrl(SITE_ROOT + link.attrib["href"])
    wb.creatObject()
    for field, selector in FIELD_SELECTORS:
        wb.capture(field, selector)
    wb.closeObject()

# Write the collected list in both supported output formats.
for output_format in ('json', 'xml'):
    wb.save(output_format)
# NOTE(review): this is the tail of a scraping script whose opening (import,
# WebCapture() construction, list creation, page loop and creatObject call)
# is not visible here, and whose original indentation was lost. The
# statements up to closeObject() presumably run once per visited page inside
# a loop -- confirm against the full script before re-indenting.
#
# Each capture() below reads one <li> of the "#content ul" list. Presumably
# `format` is a regular expression applied to the element text and `filter`
# is a backreference template choosing which groups to keep -- TODO confirm
# against the WebCapture implementation.

# Second list item: two digit groups separated by " / " are stored as
# "aniversario"; the text after the second ": " is stored as "profissao".
wb.capture("aniversario", "#content ul li:nth-child(2)", format = r".*: (\d*) / (\d*) .*", filter = "\\1/\\2")
wb.capture("profissao", "#content ul li:nth-child(2)", format = r".*: .*: (.*)", filter = "\\1")

# Third list item: the same three-group pattern is applied three times, each
# call keeping a different group of the " / "-separated value.
wb.capture("partido", "#content ul li:nth-child(3)", format = r".*: (.*?) / (.*?) / (.*)", filter = "\\1")
wb.capture("UF", "#content ul li:nth-child(3)", format = r".*: (.*?) / (.*?) / (.*)", filter = "\\2")
wb.capture("diplomacao", "#content ul li:nth-child(3)", format = r".*: (.*?) / (.*?) / (.*)", filter = "\\3")

# Fourth list item: group 1 is a parenthesised run of digits and group 2 the
# number that follows it. "fax" reuses group 1 and pairs it with the number
# found after "- Fax: " (group 3).
wb.capture("telefone", "#content ul li:nth-child(4)", format = r".*: (\(\d*?\)) ([\d-]*) .*", filter = "\\1 \\2")
wb.capture("fax", "#content ul li:nth-child(4)", format = r".*: (\(\d*?\)) ([\d-]*) - Fax: ([\d-]*)", filter = "\\1 \\3")

# Fifth list item: `find` is given a pattern for two-digit/two-digit tokens;
# presumably every match is collected -- TODO confirm.
wb.capture("legislaturas", "#content ul li:nth-child(5)", find = r"\d{2}/\d{2}")

# The photo URL is taken from the first <img> under "#content .clearedBox"
# and stored with put() instead of capture().
# NOTE(review): the [0] index will raise if no <img> is returned for a page.
img = wb.getTagList("#content .clearedBox", "img")[0]
# Earlier capture()-based attempt, left disabled:
#wb.capture(""#content .clearedBox img[0][src]"
wb.put("fotolink", img.attrib["src"])
wb.closeObject()

# Write the collected data in both output formats.
wb.save('json')
wb.save('xml')