def num_votes_from_tr(tr):
    """Return the vote count contained in one results-table row.

    The text nodes of the row's direct ``<td>`` children whose ``class``
    contains ``"vot"`` but not ``"pvot"`` are collected.  Texts containing
    a percent sign are skipped, the rest are passed through ``c`` and
    ``digits_only`` (helpers defined elsewhere; presumably text cleanup and
    digit extraction -- confirm), and empty results are dropped.

    Args:
        tr: row element exposing an lxml-style ``xpath`` method.

    Returns:
        The single remaining value converted to ``int``.

    Raises:
        AssertionError: if the row does not yield exactly one non-empty
            candidate.
    """
    # There may be 2 <td> with class "vot", but only one won't be empty.
    cell_texts = tr.xpath('./td[contains(@class, "vot") and '
                          'not(contains(@class, "pvot"))]/text()')

    # The percent test must look at each text node.  Testing the list
    # itself only matches an element that is exactly '%', and when it does
    # match it throws away every candidate instead of just that one.
    cleaned = (digits_only(c(text)) for text in cell_texts
               if '%' not in text)

    # Drop the values that came out empty after cleaning.
    vot_nums = [digits for digits in cleaned if digits]

    assert len(vot_nums) == 1
    return int(vot_nums[0])
def preprocess(self, job):
    """Parse a fetched results page and store the data on the job.

    ``job.data`` is parsed with html5lib (lxml tree builder) and the
    extracted values are written to ``job.meta['preprocess']``, a fresh
    dict with these keys:

    * ``'results'``: list of dicts, one per group of table rows.  Each has
      ``'agrupacion'`` (``id``, ``nombre``, ``votos``) and optionally
      ``'listas'`` (list of ``id``/``nombre``/``votos`` dicts) and
      ``'formula'`` (``id``, ``nombre``).
    * ``'mesas'``: ``{'total': int, 'escrutadas': int}``.
    * ``'electores'``: ``{'total': int, 'votantes': int}``.
    * ``'parsed_timestamp'``: ``time.time()`` at the end of parsing.

    Args:
        job: object providing ``response.status``, ``data`` and a
            dict-like ``meta``.

    Raises:
        AssertionError: if the response status is outside 200-299, or a
            row does not yield exactly one vote count.
        IndexError: if an expected table or header cell is not found
            (the first match of each XPath query is taken unconditionally).
    """
    # NOTE: plain asserts are stripped when Python runs with -O.
    assert (200 <= job.response.status < 300)

    out = job.meta['preprocess'] = {}
    out['results'] = []

    doc = html5lib.parse(job.data, treebuilder='lxml',
                         namespaceHTMLElements=False)
    html = doc.getroot()

    def num_votes_from_tr(tr):
        """Return the single vote count found in the row ``tr``."""
        # There may be 2 <td> with class "vot", but only one won't be
        # empty.
        cell_texts = tr.xpath('./td[contains(@class, "vot") and '
                              'not(contains(@class, "pvot"))]/text()')
        # The percent test must look at each text node; testing the list
        # itself only matches an element that is exactly '%' and would
        # then discard every candidate.
        cleaned = (digits_only(c(text)) for text in cell_texts
                   if '%' not in text)
        vot_nums = [digits for digits in cleaned if digits]
        assert len(vot_nums) == 1
        return int(vot_nums[0])

    def is_r1_row(tr):
        """Tell whether the row's class attribute contains ``'r1'``."""
        return 'r1' in tr.attrib['class']

    def int_from_first(table, expr):
        """Return the text of the first node matching ``expr`` as an int."""
        return int(digits_only(table.xpath(expr)[0].text))

    ### RESULTS PARSING
    tvotos = html.xpath('.//table[@id="TVOTOS"]')[0]

    # We skip the first <tr> since it's the title row.
    # The rest of the <tr> are repeated structures like this one:
    # GROUP 1:       (a) tr
    # GROUP 1:       (a) th.sigla/text()          -- group name
    # GROUP 1: [OPT] (b) tr.agrupa
    # GROUP 1: [OPT] (b) th.[agrupa,sigla]/text() -- group list
    # GROUP 1: [OPT] (c) tr.agrupa
    # GROUP 1: [OPT] (c) th.sigla/text()          -- group formula
    rows = tvotos.xpath('.//tr')[1:]

    # flip_flop is defined elsewhere; presumably it yields runs of
    # consecutive rows sharing the same predicate value -- confirm.
    for trs in flip_flop(rows, is_r1_row):
        bigrow = {}
        for tr in trs:
            th = tr.xpath('./th[1]')[0]
            if not bigrow:
                # (a) group name: always the first row of the run
                bigrow['agrupacion'] = {
                    'id': c(th.attrib['id']),
                    'nombre': c(th.text),
                    'votos': num_votes_from_tr(tr),
                }
            elif 'agrupa' in th.attrib['class']:
                # (b) group list
                aglist = {
                    'id': c(th.attrib['id']),
                    'nombre': c(th.text),
                    'votos': num_votes_from_tr(tr),
                }
                bigrow.setdefault('listas', []).append(aglist)
            else:
                # (c) group formula
                bigrow['formula'] = {
                    'id': c(th.attrib['id']),
                    'nombre': c(th.text),
                }
        out['results'].append(bigrow)

    ### MESAS PARSING
    table = html.xpath('.//div[@class="pt1"]/table[@class="tablin"]')[0]
    mesas_total = int_from_first(
        table, './/th[contains(.,"Totales")]/following-sibling::*')
    mesas_escrutadas = int_from_first(
        table, './/th[contains(.,"Escrutadas")]/following-sibling::*')
    out['mesas'] = {
        'total': mesas_total,
        'escrutadas': mesas_escrutadas,
    }

    ### ELECTORES PARSING
    table = html.xpath('.//div[@class="pt2"]/table[@class="tablin"]')[0]
    electores_total = int_from_first(
        table, './/th[contains(., "Totales")]/following-sibling::*')
    electores_votantes = int_from_first(
        table, './/th[contains(., "Votantes")]/following-sibling::*')
    out['electores'] = {
        'total': electores_total,
        'votantes': electores_votantes,
    }

    out['parsed_timestamp'] = time.time()