def generate_ids(infile, outfile):
    """Copy a CSV file, filling each row's ``id`` column with a slug-based id.

    Every data row gets ``id`` set to ``se/<slug>``, where ``<slug>`` is
    ``make_slug`` applied to the UTF-8 decoded ``name`` value of that row.
    Only the columns named in the input header are written, so the input must
    already have an ``id`` column for the generated value to reach the output.

    Args:
        infile: Path of the CSV file to read; it needs a ``name`` column.
        outfile: Path of the CSV file to write (overwritten if it exists).

    Raises:
        KeyError: If a row has no ``name`` column.
    """
    from csv import DictReader, DictWriter
    from simpleslugger import make_slug

    # The Python 2 csv module expects binary-mode files; text mode mangles
    # line endings on platforms that translate newlines.
    with open(infile, "rb") as f:
        reader = DictReader(f)
        # Keep the header exactly as the reader produced it (byte strings):
        # those are the keys of every row, and the writer can emit them even
        # when a column name contains non-ASCII characters.
        fieldnames = list(reader.fieldnames)
        with open(outfile, "wb") as g:
            # extrasaction="ignore" drops any key that is not a header column.
            writer = DictWriter(g, fieldnames=fieldnames,
                                extrasaction="ignore")
            # write the first line (field names)
            writer.writerow(dict(zip(fieldnames, fieldnames)))
            for row in reader:
                slug = make_slug(row["name"].decode("utf-8"))
                # Encode explicitly: the Python 2 csv writer would otherwise
                # fall back to the ASCII codec for unicode values.
                row["id"] = (u"se/%s" % slug).encode("utf-8")
                writer.writerow(row)
def generate_ids(infile, outfile):
    """Copy a CSV file, filling each row's ``id`` column with a slug-based id.

    Every data row gets ``id`` set to ``se/<slug>``, where ``<slug>`` is
    ``make_slug`` applied to the UTF-8 decoded ``name`` value of that row.
    Only the columns named in the input header are written, so the input must
    already have an ``id`` column for the generated value to reach the output.

    Args:
        infile: Path of the CSV file to read; it needs a ``name`` column.
        outfile: Path of the CSV file to write (overwritten if it exists).

    Raises:
        KeyError: If a row has no ``name`` column.
    """
    from csv import DictReader, DictWriter
    from simpleslugger import make_slug

    # The Python 2 csv module expects binary-mode files; text mode mangles
    # line endings on platforms that translate newlines.
    with open(infile, "rb") as f:
        reader = DictReader(f)
        # Keep the header exactly as the reader produced it (byte strings):
        # those are the keys of every row, and the writer can emit them even
        # when a column name contains non-ASCII characters.
        fieldnames = list(reader.fieldnames)
        with open(outfile, "wb") as g:
            # extrasaction="ignore" drops any key that is not a header column.
            writer = DictWriter(g, fieldnames=fieldnames,
                                extrasaction="ignore")
            # write the first line (field names)
            writer.writerow(dict(zip(fieldnames, fieldnames)))
            for row in reader:
                slug = make_slug(row["name"].decode("utf-8"))
                # Encode explicitly: the Python 2 csv writer would otherwise
                # fall back to the ASCII codec for unicode values.
                row["id"] = (u"se/%s" % slug).encode("utf-8")
                writer.writerow(row)
def readBody(self, elem):
    """Build one public-body record from ``elem`` and register it.

    Values are read through XPath from the ``Dados_Cadastro``, ``Localidade``
    and ``Base_Legal`` children of ``elem``.  The finished dict is appended to
    ``self.public_bodies`` and stored in ``self.body_by_code`` under the
    integer value of ``Dados_Cadastro/Codigo``.  Email, phone and description
    are only filled in when the module-level ``INCLUDE_EMAIL``,
    ``INCLUDE_PHONE`` and ``INCLUDE_DESCRIPTION`` flags are truthy.

    Raises IndexError when ``Dados_Cadastro``, ``Localidade``, ``Base_Legal``,
    ``Base_Legal/Data`` or ``Codigo`` is missing, and KeyError when
    ``column_map`` produced no ``name`` value.
    """
    cadastro = elem.xpath("Dados_Cadastro")[0]

    # Plain column copies: take the first text node of each mapped element.
    body = {}
    for column, tag in column_map.items():
        texts = cadastro.xpath("%s/text()" % tag)
        if texts:
            body[column] = unicode(texts[0])

    body["slug"] = make_slug(body["name"])
    # TODO: set category (body["category"] = type and legal nature)
    body["jurisdiction_code"] = u"BR"
    body["id"] = u"%s/%s" % (body["jurisdiction_code"].lower(), body["slug"])
    body["source_url"] = u"http://repositorio.dados.gov.br/governo-politica/administracao-publica/estrutura-organizacional/"

    # Classification: looked up by the body type code, kept only when known.
    type_code = u"".join(cadastro.xpath("Codigo_Tipo_Orgao/text()"))
    type_label = self.domain.body_types.get(type_code, None)
    if type_label:
        body["classification"] = type_label

    # The schema has no legal entity type field, so it is stored as a tag.
    entity_code = u"".join(cadastro.xpath("Codigo_Natureza_Juridica/text()"))
    entity_label = self.domain.legal_entity_types.get(entity_code, None)
    if entity_label:
        body["tags"] = self.addTag(body, make_slug(entity_label))

    # Address: street, complement, city and state, then the country.
    localidade = elem.xpath("Localidade")[0]
    address_parts = []
    for expression in ("Descricao_Endereco/text()",
                       "Descricao_Complemento/text()",
                       "Nome_Cidade/text()",
                       "Sigla_UF/text()"):
        address_parts.extend(localidade.xpath(expression))
    body["address"] = u", ".join(address_parts) + u", Brasil"
    # NOTE(review): the result of this call is discarded, so the address keeps
    # its commas.  Confirm whether semicolons were intended before assigning.
    body["address"].replace(u",", u";")

    # Website: an empty or missing value is parsed as an empty URL.
    site = cadastro.xpath("Site/text()")
    parsed = urlparse(unicode(site[0]) if site else u"")
    href = parsed.geturl()
    if href == u"http://":
        body["url"] = u""
    elif href.strip() and not parsed.netloc:
        # for bugged relative urls missing http://
        body["url"] = u"http://" + href
    else:
        body["url"] = href

    if INCLUDE_EMAIL:
        body["email"] = u"".join(cadastro.xpath("Email/text()"))

    if INCLUDE_PHONE:
        import phonenumbers
        ddd = u"".join(cadastro.xpath("DDD/text()")).strip()
        # The phonenumbers parser needs a placeholder carrier, which is
        # discarded after parsing.  Dirty data sometimes uses a single "0"
        # for regional area codes, sometimes for international area codes.
        if len(ddd) > 3 and re.match(r"^0\d", ddd):
            # NOTE(review): [:1] keeps only the first character (the "0"
            # matched above) -- confirm that [1:] was not intended.
            dial_prefix = u"0021" + ddd[:1]
        else:
            # national area code, minus the carrier selector placeholder
            dial_prefix = u"021" + ddd.replace(u"0xx", u"")
        phones = u"".join(cadastro.xpath("Telefones/text()"))
        # Only the first number found in the field is used.
        first_number = localnumberpattern.search(phones)
        if first_number:
            parsed_number = phonenumbers.parse(
                dial_prefix + first_number.group(), "BR")
            body["contact"] = phonenumbers.format_number(
                parsed_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL)

    if INCLUDE_DESCRIPTION:
        # Prefer the purpose text; fall back to the competence text.
        description_texts = (elem.xpath("Finalidade/Descricao/text()")
                             or elem.xpath("Competencia/Descricao/text()"))
        without_paragraphs = paragraphpattern.sub(
            u"", u"".join(description_texts))
        body["description"] = bulletpointpattern.sub(
            u" * ", without_paragraphs)

    # Founding date: source format is day-month-year, stored as ISO 8601.
    baselegal = elem.xpath("Base_Legal")[0]
    founded = datetime.strptime(baselegal.xpath("Data/text()")[0], "%d-%m-%Y")
    body["founding_date"] = unicode(founded.date().isoformat())
    # TODO: body["updated_at"] from self.global_metadata["generated_at"]

    self.public_bodies.append(body)
    body_code = int(cadastro.xpath("Codigo/text()")[0])
    self.body_by_code[body_code] = body
def readBody(self, elem):
    """Build one public-body record from ``elem`` and register it.

    Values are read through XPath from the ``Dados_Cadastro``, ``Localidade``
    and ``Base_Legal`` children of ``elem``.  The finished dict is appended to
    ``self.public_bodies`` and stored in ``self.body_by_code`` under the
    integer value of ``Dados_Cadastro/Codigo``.  Email, phone and description
    are only filled in when the module-level ``INCLUDE_EMAIL``,
    ``INCLUDE_PHONE`` and ``INCLUDE_DESCRIPTION`` flags are truthy.

    Raises IndexError when ``Dados_Cadastro``, ``Localidade``, ``Base_Legal``,
    ``Base_Legal/Data`` or ``Codigo`` is missing, and KeyError when
    ``column_map`` produced no ``name`` value.
    """
    cadastro = elem.xpath("Dados_Cadastro")[0]

    # Plain column copies: take the first text node of each mapped element.
    body = {}
    for column, tag in column_map.items():
        texts = cadastro.xpath("%s/text()" % tag)
        if texts:
            body[column] = unicode(texts[0])

    body["slug"] = make_slug(body["name"])
    # TODO: set category (body["category"] = type and legal nature)
    body["jurisdiction_code"] = u"BR"
    body["id"] = u"%s/%s" % (body["jurisdiction_code"].lower(), body["slug"])
    body["source_url"] = u"http://repositorio.dados.gov.br/governo-politica/administracao-publica/estrutura-organizacional/"

    # Classification: looked up by the body type code, kept only when known.
    type_code = u"".join(cadastro.xpath("Codigo_Tipo_Orgao/text()"))
    type_label = self.domain.body_types.get(type_code, None)
    if type_label:
        body["classification"] = type_label

    # The schema has no legal entity type field, so it is stored as a tag.
    entity_code = u"".join(cadastro.xpath("Codigo_Natureza_Juridica/text()"))
    entity_label = self.domain.legal_entity_types.get(entity_code, None)
    if entity_label:
        body["tags"] = self.addTag(body, make_slug(entity_label))

    # Address: street, complement, city and state, then the country.
    localidade = elem.xpath("Localidade")[0]
    address_parts = []
    for expression in ("Descricao_Endereco/text()",
                       "Descricao_Complemento/text()",
                       "Nome_Cidade/text()",
                       "Sigla_UF/text()"):
        address_parts.extend(localidade.xpath(expression))
    body["address"] = u", ".join(address_parts) + u", Brasil"
    # NOTE(review): the result of this call is discarded, so the address keeps
    # its commas.  Confirm whether semicolons were intended before assigning.
    body["address"].replace(u",", u";")

    # Website: an empty or missing value is parsed as an empty URL.
    site = cadastro.xpath("Site/text()")
    parsed = urlparse(unicode(site[0]) if site else u"")
    href = parsed.geturl()
    if href == u"http://":
        body["url"] = u""
    elif href.strip() and not parsed.netloc:
        # for bugged relative urls missing http://
        body["url"] = u"http://" + href
    else:
        body["url"] = href

    if INCLUDE_EMAIL:
        body["email"] = u"".join(cadastro.xpath("Email/text()"))

    if INCLUDE_PHONE:
        import phonenumbers
        ddd = u"".join(cadastro.xpath("DDD/text()")).strip()
        # The phonenumbers parser needs a placeholder carrier, which is
        # discarded after parsing.  Dirty data sometimes uses a single "0"
        # for regional area codes, sometimes for international area codes.
        if len(ddd) > 3 and re.match(r"^0\d", ddd):
            # NOTE(review): [:1] keeps only the first character (the "0"
            # matched above) -- confirm that [1:] was not intended.
            dial_prefix = u"0021" + ddd[:1]
        else:
            # national area code, minus the carrier selector placeholder
            dial_prefix = u"021" + ddd.replace(u"0xx", u"")
        phones = u"".join(cadastro.xpath("Telefones/text()"))
        # Only the first number found in the field is used.
        first_number = localnumberpattern.search(phones)
        if first_number:
            parsed_number = phonenumbers.parse(
                dial_prefix + first_number.group(), "BR")
            body["contact"] = phonenumbers.format_number(
                parsed_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL)

    if INCLUDE_DESCRIPTION:
        # Prefer the purpose text; fall back to the competence text.
        description_texts = (elem.xpath("Finalidade/Descricao/text()")
                             or elem.xpath("Competencia/Descricao/text()"))
        without_paragraphs = paragraphpattern.sub(
            u"", u"".join(description_texts))
        body["description"] = bulletpointpattern.sub(
            u" * ", without_paragraphs)

    # Founding date: source format is day-month-year, stored as ISO 8601.
    baselegal = elem.xpath("Base_Legal")[0]
    founded = datetime.strptime(baselegal.xpath("Data/text()")[0], "%d-%m-%Y")
    body["founding_date"] = unicode(founded.date().isoformat())
    # TODO: body["updated_at"] from self.global_metadata["generated_at"]

    self.public_bodies.append(body)
    body_code = int(cadastro.xpath("Codigo/text()")[0])
    self.body_by_code[body_code] = body