예제 #1
0
def generate_ids(infile, outfile):
    """Copy a CSV file, filling in the ``id`` column of every row.

    Each row's ``id`` is set to ``se/<slug>``, where the slug is produced by
    ``make_slug`` from the row's ``name`` column (decoded as UTF-8). Only the
    columns named in the input header are written, in the same order, so the
    generated id only reaches the output if the header has an ``id`` column.

    This is Python 2 code: the csv module there works on byte strings, so
    header names and cell values are kept as UTF-8 encoded ``str`` from read
    to write, and both files are opened in binary mode.

    Args:
        infile: path of the CSV file to read; it needs a ``name`` column.
        outfile: path of the CSV file to write (truncated if it exists).
    """
    from csv import DictReader, DictWriter
    from simpleslugger import make_slug

    # Binary mode: the Python 2 csv module does its own newline handling and
    # text mode corrupts line endings on platforms where the modes differ.
    with open(infile, "rb") as f:
        reader = DictReader(f)
        # Keep the header names as the byte strings the reader produced, so
        # they match the keys of every row dict even when they are not ASCII.
        fieldnames = list(reader.fieldnames)

        with open(outfile, "wb") as g:
            writer = DictWriter(g, fieldnames=fieldnames)

            # write the first line (field names)
            writer.writerow(dict(zip(fieldnames, fieldnames)))

            for row in reader:
                slug = make_slug(row["name"].decode("utf-8"))
                # Encode back to UTF-8: the Python 2 csv writer cannot
                # serialise unicode objects containing non-ASCII characters.
                row["id"] = (u"se/%s" % slug).encode("utf-8")
                # Restrict to header columns; DictWriter rejects extra keys.
                writer.writerow(dict((name, row[name]) for name in fieldnames))
예제 #2
0
def generate_ids(infile, outfile):
    """Copy a CSV file, filling in the ``id`` column of every row.

    Each row's ``id`` is set to ``se/<slug>``, where the slug is produced by
    ``make_slug`` from the row's ``name`` column (decoded as UTF-8). Only the
    columns named in the input header are written, in the same order, so the
    generated id only reaches the output if the header has an ``id`` column.

    This is Python 2 code: the csv module there works on byte strings, so
    header names and cell values are kept as UTF-8 encoded ``str`` from read
    to write, and both files are opened in binary mode.

    Args:
        infile: path of the CSV file to read; it needs a ``name`` column.
        outfile: path of the CSV file to write (truncated if it exists).
    """
    from csv import DictReader, DictWriter
    from simpleslugger import make_slug

    # Binary mode: the Python 2 csv module does its own newline handling and
    # text mode corrupts line endings on platforms where the modes differ.
    with open(infile, "rb") as f:
        reader = DictReader(f)
        # Keep the header names as the byte strings the reader produced, so
        # they match the keys of every row dict even when they are not ASCII.
        fieldnames = list(reader.fieldnames)

        with open(outfile, "wb") as g:
            writer = DictWriter(g, fieldnames=fieldnames)

            # write the first line (field names)
            writer.writerow(dict(zip(fieldnames, fieldnames)))

            for row in reader:
                slug = make_slug(row["name"].decode("utf-8"))
                # Encode back to UTF-8: the Python 2 csv writer cannot
                # serialise unicode objects containing non-ASCII characters.
                row["id"] = (u"se/%s" % slug).encode("utf-8")
                # Restrict to header columns; DictWriter rejects extra keys.
                writer.writerow(
                    dict((name, row[name]) for name in fieldnames))
예제 #3
0
 def readBody(self, elem):
     """Read one public body from ``elem`` and register it on ``self``.

     ``elem`` is queried through ``.xpath()`` (NOTE(review): presumably an
     lxml element -- confirm against the caller). Its first
     ``Dados_Cadastro``, ``Localidade`` and ``Base_Legal`` children are
     required; if one is absent the ``[0]`` lookup raises IndexError. A
     KeyError is raised when ``column_map`` yields no ``name`` value.

     The resulting dict is appended to ``self.public_bodies`` and stored in
     ``self.body_by_code`` under the integer value of ``Codigo``. The
     e-mail, phone and description fields are only filled in when the
     ``INCLUDE_EMAIL``, ``INCLUDE_PHONE`` and ``INCLUDE_DESCRIPTION`` flags
     are truthy.
     """
     cadastro = elem.xpath("Dados_Cadastro")[0]
     body = {}
     # Copy over the directly mapped columns, skipping elements with no text.
     for col, elem_name in column_map.items():
         value = cadastro.xpath("%s/text()" % elem_name)
         if value:
             body[col] = unicode(value[0])
     body["slug"] = make_slug(body["name"])
     # TODO: set category
     # body["category"] = <body type and legal nature>
     body["jurisdiction_code"] = u"BR"
     body["id"] = u"%s/%s" % (body["jurisdiction_code"].lower(), body["slug"])
     body["source_url"] = u"http://repositorio.dados.gov.br/governo-politica/administracao-publica/estrutura-organizacional/"

     # Classification: look the body type code up in the domain tables.
     body_type_code = u"".join(cadastro.xpath("Codigo_Tipo_Orgao/text()"))
     body_type_description = self.domain.body_types.get(body_type_code)
     if body_type_description:
         body["classification"] = body_type_description
     legal_entity_type_code = u"".join(cadastro.xpath("Codigo_Natureza_Juridica/text()"))
     legal_entity_type_description = self.domain.legal_entity_types.get(legal_entity_type_code)
     if legal_entity_type_description:
         # the schema doesn't have a legal entity type field, so we add to the tags
         body["tags"] = self.addTag(body, make_slug(legal_entity_type_description))

     # Address: concatenate the available parts, then swap commas for
     # semicolons. str.replace returns a new string, so the result has to
     # be stored back (it used to be discarded, making the call a no-op).
     localidade = elem.xpath("Localidade")[0]
     address = u", ".join(
         localidade.xpath("Descricao_Endereco/text()") +
         localidade.xpath("Descricao_Complemento/text()") +
         localidade.xpath("Nome_Cidade/text()") +
         localidade.xpath("Sigla_UF/text()")
         ) + u", Brasil"
     body["address"] = address.replace(u",", u";")

     # Website: an absent Site element is treated as an empty URL.
     urlx = cadastro.xpath("Site/text()")
     url = urlparse(unicode(urlx[0]) if urlx else u"")
     if url.geturl() == u"http://":
         body["url"] = u""
     elif not url.netloc and url.geturl().strip():
         # for bugged relative urls missing http://
         body["url"] = u"http://" + url.geturl()
     else:
         body["url"] = url.geturl()

     if INCLUDE_EMAIL:
         body["email"] = u"".join(cadastro.xpath("Email/text()"))

     if INCLUDE_PHONE:
         import phonenumbers
         ddd = u"".join(cadastro.xpath("DDD/text()")).strip()
         # phonenumbers phone number parsing library needs a placeholder carrier, this is discarded after parsing
         # dirty data sometimes use a single "0" for regional area codes, sometimes for international area codes
         if re.match(r"^0\d", ddd) and len(ddd) > 3:
             # Drop the single leading "0" and keep the rest of the code.
             # The previous slice, [:1], kept only that "0" and threw the
             # actual code away.
             areacode = u"0021" + ddd[1:]
         else: # national area code
             areacode = u"021" + ddd.replace(u"0xx", u"") # remove carrier selector placeholder
         localnumbers = u"".join(cadastro.xpath("Telefones/text()"))
         # Only the first number found in the field is used.
         localnumbermatch = localnumberpattern.search(localnumbers)
         if localnumbermatch:
             phonenumber = phonenumbers.parse(areacode + localnumbermatch.group(), "BR")
             body["contact"] = phonenumbers.format_number(phonenumber, phonenumbers.PhoneNumberFormat.INTERNATIONAL)

     if INCLUDE_DESCRIPTION:
         # Prefer the "Finalidade" text, falling back to "Competencia".
         descricao = elem.xpath("Finalidade/Descricao/text()")
         if not descricao:
             descricao = elem.xpath("Competencia/Descricao/text()")
         text = paragraphpattern.sub(u"", u"".join(descricao))
         body["description"] = bulletpointpattern.sub(u" * ", text)

     # Founding date: source format is DD-MM-YYYY, stored as ISO 8601.
     baselegal = elem.xpath("Base_Legal")[0]
     founded = datetime.strptime(baselegal.xpath("Data/text()")[0], "%d-%m-%Y")
     body["founding_date"] = unicode(founded.date().isoformat())
     #body["updated_at"] = self.global_metadata["generated_at"].isoformat()

     self.public_bodies.append(body)
     self.body_by_code[int(cadastro.xpath("Codigo/text()")[0])] = body
예제 #4
0
 def readBody(self, elem):
     """Read one public body from ``elem`` and register it on ``self``.

     ``elem`` is queried through ``.xpath()`` (NOTE(review): presumably an
     lxml element -- confirm against the caller). Its first
     ``Dados_Cadastro``, ``Localidade`` and ``Base_Legal`` children are
     required; if one is absent the ``[0]`` lookup raises IndexError. A
     KeyError is raised when ``column_map`` yields no ``name`` value.

     The resulting dict is appended to ``self.public_bodies`` and stored in
     ``self.body_by_code`` under the integer value of ``Codigo``. The
     e-mail, phone and description fields are only filled in when the
     ``INCLUDE_EMAIL``, ``INCLUDE_PHONE`` and ``INCLUDE_DESCRIPTION`` flags
     are truthy.
     """
     cadastro = elem.xpath("Dados_Cadastro")[0]
     body = {}
     # Copy over the directly mapped columns, skipping elements with no text.
     for col, elem_name in column_map.items():
         value = cadastro.xpath("%s/text()" % elem_name)
         if value:
             body[col] = unicode(value[0])
     body["slug"] = make_slug(body["name"])
     # TODO: set category
     # body["category"] = <body type and legal nature>
     body["jurisdiction_code"] = u"BR"
     body["id"] = u"%s/%s" % (body["jurisdiction_code"].lower(),
                              body["slug"])
     body["source_url"] = (
         u"http://repositorio.dados.gov.br/governo-politica/administracao-publica/estrutura-organizacional/")

     # Classification: look the body type code up in the domain tables.
     body_type_code = u"".join(cadastro.xpath("Codigo_Tipo_Orgao/text()"))
     body_type_description = self.domain.body_types.get(body_type_code)
     if body_type_description:
         body["classification"] = body_type_description
     legal_entity_type_code = u"".join(
         cadastro.xpath("Codigo_Natureza_Juridica/text()"))
     legal_entity_type_description = self.domain.legal_entity_types.get(
         legal_entity_type_code)
     if legal_entity_type_description:
         # the schema doesn't have a legal entity type field, so we add to the tags
         body["tags"] = self.addTag(
             body, make_slug(legal_entity_type_description))

     # Address: concatenate the available parts, then swap commas for
     # semicolons. str.replace returns a new string, so the result has to
     # be stored back (it used to be discarded, making the call a no-op).
     localidade = elem.xpath("Localidade")[0]
     address = u", ".join(
         localidade.xpath("Descricao_Endereco/text()") +
         localidade.xpath("Descricao_Complemento/text()") +
         localidade.xpath("Nome_Cidade/text()") +
         localidade.xpath("Sigla_UF/text()")) + u", Brasil"
     body["address"] = address.replace(u",", u";")

     # Website: an absent Site element is treated as an empty URL.
     urlx = cadastro.xpath("Site/text()")
     url = urlparse(unicode(urlx[0]) if urlx else u"")
     if url.geturl() == u"http://":
         body["url"] = u""
     elif not url.netloc and url.geturl().strip():
         # for bugged relative urls missing http://
         body["url"] = u"http://" + url.geturl()
     else:
         body["url"] = url.geturl()

     if INCLUDE_EMAIL:
         body["email"] = u"".join(cadastro.xpath("Email/text()"))

     if INCLUDE_PHONE:
         import phonenumbers
         ddd = u"".join(cadastro.xpath("DDD/text()")).strip()
         # phonenumbers phone number parsing library needs a placeholder carrier, this is discarded after parsing
         # dirty data sometimes use a single "0" for regional area codes, sometimes for international area codes
         if re.match(r"^0\d", ddd) and len(ddd) > 3:
             # Drop the single leading "0" and keep the rest of the code.
             # The previous slice, [:1], kept only that "0" and threw the
             # actual code away.
             areacode = u"0021" + ddd[1:]
         else:  # national area code
             # remove carrier selector placeholder
             areacode = u"021" + ddd.replace(u"0xx", u"")
         localnumbers = u"".join(cadastro.xpath("Telefones/text()"))
         # Only the first number found in the field is used.
         localnumbermatch = localnumberpattern.search(localnumbers)
         if localnumbermatch:
             phonenumber = phonenumbers.parse(
                 areacode + localnumbermatch.group(), "BR")
             body["contact"] = phonenumbers.format_number(
                 phonenumber, phonenumbers.PhoneNumberFormat.INTERNATIONAL)

     if INCLUDE_DESCRIPTION:
         # Prefer the "Finalidade" text, falling back to "Competencia".
         descricao = elem.xpath("Finalidade/Descricao/text()")
         if not descricao:
             descricao = elem.xpath("Competencia/Descricao/text()")
         text = paragraphpattern.sub(u"", u"".join(descricao))
         body["description"] = bulletpointpattern.sub(u" * ", text)

     # Founding date: source format is DD-MM-YYYY, stored as ISO 8601.
     baselegal = elem.xpath("Base_Legal")[0]
     founded = datetime.strptime(
         baselegal.xpath("Data/text()")[0], "%d-%m-%Y")
     body["founding_date"] = unicode(founded.date().isoformat())
     #body["updated_at"] = self.global_metadata["generated_at"].isoformat()

     self.public_bodies.append(body)
     self.body_by_code[int(cadastro.xpath("Codigo/text()")[0])] = body