예제 #1
0
# -*- coding: latin1 -*-

"""
A simple script that collects the list of links on a site (in this case 'http://www.cms.ba.gov.br/vereadores.aspx'), visits each link, and captures some data from the page it points to.

Um script simples para obter uma lista de links em um site, neste caso 'http://www.cms.ba.gov.br/vereadores.aspx', execute cada um e obter alguns dados em suas respectivas páginas
"""

from webcapture import WebCapture

# Listing page that is opened first, and the prefix that each link's href
# is appended to when visiting the individual pages.
LISTING_URL = "http://www.cms.ba.gov.br/vereadores.aspx"
SITE_ROOT = "http://www.cms.ba.gov.br/"

# (field name, CSS selector) pairs captured from every visited page,
# in the order they are captured.
MEMBER_FIELDS = (
	("nome", ".nome_vereador"),
	("cargo", ".cargo_ver"),
	("partido", ".partido_vereador"),
	("telefone", ".telefones_vereador"),
	("email", ".email_vereador"),
)

wb = WebCapture()

wb.accessUrl(LISTING_URL)
wb.creatList("Vereadores")

# The anchors are requested once, while the listing page is still the
# current page; the loop below navigates away from it.
member_links = wb.getTagList("#meio_conteudo .foto_vereador_int", "a")

for link in member_links:
	wb.accessUrl(SITE_ROOT + link.attrib["href"])

	# Open an object for this page, capture its fields, then close it.
	wb.creatObject()
	for field_name, selector in MEMBER_FIELDS:
		wb.capture(field_name, selector)
	wb.closeObject()

# Save the collected data as JSON first, then as XML.
for output_format in ('json', 'xml'):
	wb.save(output_format)
예제 #2
0
	# NOTE(review): these statements are the tail of a loop body whose header
	# (and the setup of `wb`) lies outside this excerpt -- confirm the
	# enclosing loop before moving any of them.
	# NOTE(review): `format` looks like a regular expression applied to the
	# selected text and `filter` like a backreference template that picks
	# groups out of it -- confirm against WebCapture's own documentation.

	# 2nd list item: two digit groups separated by " / ", stored as "\1/\2".
	wb.capture("aniversario", "#content ul li:nth-child(2)",
		format = r".*: (\d*) / (\d*) .*", filter = "\\1/\\2")

	# Same 2nd list item: the text following a second ": " separator.
	wb.capture("profissao", "#content ul li:nth-child(2)",
		format = r".*: .*: (.*)", filter = "\\1")

	# 3rd list item holds three " / "-separated parts after ": ".
	# The same pattern is used three times, each keeping a different group.
	wb.capture("partido", "#content ul li:nth-child(3)",
		format = r".*: (.*?) / (.*?) / (.*)", filter = "\\1")

	wb.capture("UF", "#content ul li:nth-child(3)",
		format = r".*: (.*?) / (.*?) / (.*)", filter = "\\2")

	wb.capture("diplomacao", "#content ul li:nth-child(3)",
		format = r".*: (.*?) / (.*?) / (.*)", filter = "\\3")

	# 4th list item: a parenthesised digit group followed by a number made
	# of digits and dashes; both are kept, separated by a space.
	wb.capture("telefone", "#content ul li:nth-child(4)",
		format = r".*: (\(\d*?\)) ([\d-]*) .*", filter = "\\1 \\2")

	# Same 4th list item: reuses the parenthesised group (\1) and pairs it
	# with the number that follows "- Fax: " (\3); group 2 is skipped.
	wb.capture("fax", "#content ul li:nth-child(4)",
		format = r".*: (\(\d*?\)) ([\d-]*) - Fax: ([\d-]*)", filter = "\\1 \\3")

	# 5th list item, searched with a "dd/dd" pattern through `find` rather
	# than format/filter. NOTE(review): presumably this collects every
	# match -- confirm.
	wb.capture("legislaturas", "#content ul li:nth-child(5)",
		find = r"\d{2}/\d{2}")

	# Take the first <img> found under "#content .clearedBox" and store its
	# src attribute under "fotolink" with wb.put.
	img = wb.getTagList("#content .clearedBox", "img")[0]  # NOTE(review): an unfinished one-call wb.capture(...) variant was sketched here and abandoned
	wb.put("fotolink", img.attrib["src"])

	wb.closeObject()

# After the loop: save the collected data as JSON, then as XML.
wb.save('json')
wb.save('xml')