Exemplo n.º 1
0
    def handle(self, args, other):
        """
        Interactively scaffold a new jurisdiction module in ``args.module``.

        Prompts for the jurisdiction name, division id, classification and
        official URL, validates the division id with ``Division.get``, creates
        the ``args.module`` directory, asks which scraper types to create and
        then calls ``write_jurisdiction_template``.

        Exits with status 1 when the division id is rejected, and raises
        ``CommandError`` when the ``args.module`` directory already exists.
        ``other`` is not used.
        """
        name = prompt('jurisdiction name (e.g. City of Seattle): ')
        division = prompt('division id (look this up in the opencivicdata/ocd-division-ids '
                          'repository): ')
        classification = prompt('classification (can be: {}): '
                                .format(', '.join(JURISDICTION_CLASSIFICATIONS)))
        url = prompt('official URL: ')

        try:
            Division.get(division)
        except (ValueError, IndexError):
            # IndexError is handled like ValueError so that a malformed id
            # (presumably e.g. an empty answer -- TODO confirm) produces the
            # same friendly message instead of a traceback.
            print("\nERROR: Division ID is invalid.",
                  "Please find the correct division_id here",
                  "https://github.com/opencivicdata/ocd-division-ids/tree/master/identifiers",
                  "or contact [email protected] to add a new country\n")
            sys.exit(1)

        if os.path.exists(args.module):
            raise CommandError(args.module + ' directory already exists')
        os.makedirs(args.module)

        # The answer defaults to "yes" until one scraper type has been
        # selected; after that it defaults to "no".
        selected_scraper_types = []
        for stype in CLASS_DICT:
            if selected_scraper_types:
                hint, default = '[y/N]', 'N'
            else:
                hint, default = '[Y/n]', 'Y'
            result = prompt('create {} scraper? {}: '.format(stype, hint), default).upper()
            if result == 'Y':
                selected_scraper_types.append(stype)

        write_jurisdiction_template(args.module, args.module, name, division, classification, url,
                                    selected_scraper_types)
Exemplo n.º 2
0
    def get_organizations(self):
        """
        Yield one organization carrying 'candidate' posts.

        A post is added for the jurisdiction's own division, then one for each
        'ed' child of the Ontario division whose name is in the fixed
        allow-list below and whose ``validFrom`` attribute passes the date
        check.
        """
        organization = Organization(self.name, classification=self.classification)

        allowed_names = {
            'Beaches—East York',
            'Davenport',
            'Don Valley East',
            'Don Valley North',
            'Don Valley West',
            'Eglinton—Lawrence',
            'Etobicoke Centre',
            'Etobicoke North',
            'Etobicoke—Lakeshore',
            'Humber River—Black Creek',
            'Parkdale—High Park',
            'Scarborough Centre',
            'Scarborough North',
            'Scarborough Southwest',
            'Scarborough—Agincourt',
            'Scarborough—Guildwood',
            'Scarborough—Rouge Park',
            'Spadina—Fort York',
            'Toronto Centre',
            'Toronto—Danforth',
            'Toronto—St. Paul\'s',
            'University—Rosedale',
            'Willowdale',
            'York Centre',
            'York South—Weston',
        }

        own_division = Division.get(self.division_id)
        organization.add_post(role='candidate', label=own_division.name, division_id=own_division.id)

        province = Division.get('ocd-division/country:ca/province:on')
        for riding in province.children('ed'):
            allow_undated = not self.skip_null_valid_from
            valid_from = riding.attrs.get('validFrom')
            if valid_from:
                # Dated divisions count when already in effect today, or when
                # they start exactly on the configured date.
                in_effect = (valid_from <= datetime.now().strftime('%Y-%m-%d')
                             or valid_from == self.valid_from)
            else:
                # Undated divisions count unless explicitly skipped.
                in_effect = allow_undated

            if in_effect and riding.name in allowed_names:
                organization.add_post(role='candidate', label=riding.name, division_id=riding.id)

        yield organization
Exemplo n.º 3
0
    def get_organizations(self):
        """
        Yield the posts for this jurisdiction, then the organization itself.

        A leader post is yielded unless the division is a province or
        territory. A member post is yielded for each eligible child division.
        When there are no eligible children, numbered seats are attached to
        the organization instead.
        """
        organization = Organization(self.name, classification=self.classification)

        styles = styles_of_address[self.division_id]
        leader_role = styles['Leader']
        member_role = self.member_role or styles['Member']

        parent = Division.get(self.division_id)
        # Don't yield posts for premiers.
        if parent._type not in ('province', 'territory'):
            # Yield posts to allow ca_on_toronto to make changes.
            yield Post(role=leader_role, label=parent.name, division_id=parent.id, organization_id=organization._id)

        children = []
        for candidate in parent.children():
            if candidate._type == 'place' or candidate._type in self.exclude_types:
                continue
            children.append(candidate)

        for child in children:
            allow_undated = not self.skip_null_valid_from
            valid_from = child.attrs.get('validFrom')
            if valid_from:
                # In effect today, or starting exactly on the configured date.
                in_effect = (valid_from <= datetime.now().strftime('%Y-%m-%d')
                             or valid_from == self.valid_from)
            else:
                in_effect = allow_undated
            if not in_effect:
                continue

            if self.use_type_id:
                # Build the label from the last path segment of the id.
                label = child.id.rsplit('/', 1)[1].capitalize().replace(':', ' ')
            else:
                label = child.name
            # Yield posts to allow ca_on_toronto to make changes.
            yield Post(role=member_role, label=label, division_id=child.id, organization_id=organization._id)

        if not children and parent.attrs['posts_count']:
            seat_limit = int(parent.attrs['posts_count'])
            for seat in range(1, seat_limit):  # exclude Mayor
                organization.add_post(role=member_role, label='{} (seat {})'.format(parent.name, seat), division_id=parent.id)

        yield organization
def load_divisions(country):
    """
    Replace the stored divisions of ``country`` with the ones from the CSV.

    ``country`` is the country code used both to build the
    ``ocd-division/country:<code>`` id and to filter ``Division`` rows.
    Nothing is written when the number of divisions from the file equals the
    number already stored; otherwise the stored rows for the country are
    deleted and every loaded division is saved, all in one transaction.
    """
    existing_divisions = Division.objects.filter(country=country).count()

    # Use a separate name for the file division: rebinding ``country`` made
    # the delete below filter on the division object instead of the code.
    country_division = FileDivision.get('ocd-division/country:' + country)
    objects = [to_db(country_division)]
    objects.extend(to_db(child) for child in country_division.children(levels=100))

    print(len(objects), 'divisions loaded from CSV and ', existing_divisions, 'already in DB')

    if len(objects) == existing_divisions:
        print('no work to be done!')
        return

    # Delete old ids and add new ones all at once.
    with transaction.atomic():
        Division.objects.filter(country=country).delete()
        # NOTE: each object is saved individually on purpose. A previous
        # Division.objects.bulk_create(objects, batch_size=10000) was reported
        # to cause pk consistency issues when doing an update.
        for object_ in objects:
            object_.save()
    print(len(objects), 'divisions created')
def load_divisions(country, bulk=False):
    """
    Synchronise the stored divisions of ``country`` with the CSV contents.

    When the set of loaded divisions equals the stored set, nothing is done.
    With ``bulk=True`` the stored rows for the country are deleted and all
    loaded divisions are bulk-created. Otherwise only the difference is
    applied: stored divisions missing from the file are deleted and file
    divisions missing from the database are saved. Each variant runs inside a
    single transaction.
    """
    existing_divisions = Division.objects.filter(country=country)

    root = FileDivision.get("ocd-division/country:{}".format(country))
    objects = [to_db(root)]
    objects.extend(to_db(descendant) for descendant in root.children(levels=100))

    summary = "{} divisions found in the CSV, and {} already in the DB"
    print(summary.format(len(objects), existing_divisions.count()))

    if set(objects) == set(existing_divisions):
        print(
            "The CSV and the DB contents are exactly the same; no work to be done!"
        )
        return

    if bulk:
        # Delete old ids and add new ones all at once.
        with transaction.atomic():
            Division.objects.filter(country=country).delete()
            Division.objects.bulk_create(objects, batch_size=10000)
        print("{} divisions created".format(len(objects)))
        return

    to_create = set(objects) - set(existing_divisions)
    to_delete = set(existing_divisions) - set(objects)
    # Delete removed ids and add new ones all at once.
    with transaction.atomic():
        for stale in to_delete:
            stale.delete()
        for fresh in to_create:
            fresh.save()
    print("{} divisions deleted".format(len(to_delete)))
    print("{} divisions created".format(len(to_create)))
Exemplo n.º 6
0
def province_or_territory_abbreviation(code):
    """
    Return the abbreviation of the province or territory that ``code`` is in.

    The lookup table maps each province's or territory's ``sgc`` attribute to
    the upper-cased type id of its division id. It is filled on the first
    call and reused afterwards. The key is the first two characters of
    ``type_id(code)``.
    """
    memo = province_or_territory_abbreviation_memo
    if not memo:
        for division in Division.all('ca', from_csv=ocd_division_csv):
            if division._type not in ('province', 'territory'):
                continue
            memo[division.attrs['sgc']] = type_id(division.id).upper()

    sgc_prefix = type_id(code)[:2]
    return memo[sgc_prefix]
Exemplo n.º 7
0
    def get_organizations(self):
        """
        Yield the posts for this jurisdiction, then the organization itself.

        A leader post is yielded unless the division is a province or
        territory, followed by a member post per eligible child division.
        When there are no eligible children, numbered seats are attached to
        the organization instead.
        """
        # Both attributes are optional on the instance.
        excluded_types = getattr(self, 'exclude_type_ids', [])
        label_from_id = getattr(self, 'use_type_id', False)

        organization = Organization(self.name, classification=self.classification)

        parent = Division.get(self.division_id)
        if parent._type not in ('province', 'territory'):
            leader_role = styles_of_address[self.division_id]['Leader']
            yield Post(role=leader_role, label=parent.name, division_id=parent.id, organization_id=organization._id)

        children = []
        for candidate in parent.children():
            if candidate._type == 'place' or candidate._type in excluded_types:
                continue
            children.append(candidate)

        for child in children:
            if not child:
                continue
            if label_from_id:
                # Build the label from the last path segment of the id.
                label = child.id.rsplit('/', 1)[1].capitalize().replace(':', ' ')
            else:
                label = child.name
            member_role = styles_of_address[self.division_id]['Member']
            yield Post(role=member_role, label=label, division_id=child.id, organization_id=organization._id)

        if not children and parent.attrs['posts_count']:
            seat_limit = int(parent.attrs['posts_count'])
            for seat in range(1, seat_limit):  # exclude Mayor
                organization.add_post(role=styles_of_address[self.division_id]['Member'], label='{} (seat {})'.format(parent.name, seat), division_id=parent.id)

        yield organization
Exemplo n.º 8
0
def province_and_territory_codes():
    """
    Return the mapping from ``sgc`` attribute to division id.

    Only divisions of type 'province' or 'territory' are included. The
    mapping is filled on the first call and reused afterwards.
    """
    codes = province_and_territory_codes_memo
    if codes:
        return codes

    for division in Division.all('ca'):
        if division._type not in ('province', 'territory'):
            continue
        codes[division.attrs['sgc']] = division.id
    return codes
Exemplo n.º 9
0
def load_divisions(country):
    """
    Replace the stored divisions of ``country`` with the ones from the CSV.

    ``country`` is the country code used both to build the
    ``ocd-division/country:<code>`` id and to filter ``Division`` rows.
    Nothing is written when the number of divisions from the file equals the
    number already stored; otherwise the stored rows for the country are
    deleted and every loaded division is saved, all in one transaction.
    """
    existing_divisions = Division.objects.filter(country=country).count()

    # The file division gets its own name so that ``country`` still holds the
    # country code when the old rows are deleted below. Rebinding it made
    # that filter compare against the division object.
    country_division = FileDivision.get('ocd-division/country:' + country)
    objects = [to_db(country_division)]
    for child in country_division.children(levels=100):
        objects.append(to_db(child))

    print(len(objects), 'divisions loaded from CSV and ', existing_divisions,
          'already in DB')

    if len(objects) == existing_divisions:
        print('no work to be done!')
    else:
        # Delete old ids and add new ones all at once.
        with transaction.atomic():
            Division.objects.filter(country=country).delete()
            # NOTE: saving one by one is deliberate. bulk_create(objects,
            # batch_size=10000) was reported to cause pk consistency issues
            # when doing an update.
            for object_ in objects:
                object_.save()
        print(len(objects), 'divisions created')
Exemplo n.º 10
0
    def get_organizations(self):
        """
        Yield leader and member posts, then the organization.

        Provinces and territories get no leader post. Children of type
        'place', or of a type listed in the optional ``exclude_type_ids``
        attribute, get no member post. If no child qualifies, numbered seats
        are added to the organization.
        """
        skip_types = getattr(self, 'exclude_type_ids', [])
        id_based_labels = getattr(self, 'use_type_id', False)

        organization = Organization(self.name, classification=self.classification)
        parent = Division.get(self.division_id)

        is_province_or_territory = parent._type in ('province', 'territory')
        if not is_province_or_territory:
            yield Post(
                role=styles_of_address[self.division_id]['Leader'],
                label=parent.name,
                division_id=parent.id,
                organization_id=organization._id,
            )

        children = [
            child for child in parent.children()
            if not (child._type == 'place' or child._type in skip_types)
        ]

        for child in children:
            if not child:
                continue
            # With use_type_id the label comes from the last id segment.
            label = (
                child.id.rsplit('/', 1)[1].capitalize().replace(':', ' ')
                if id_based_labels
                else child.name
            )
            yield Post(
                role=styles_of_address[self.division_id]['Member'],
                label=label,
                division_id=child.id,
                organization_id=organization._id,
            )

        if not children and parent.attrs['posts_count']:
            # Seat numbering starts at 1 and stops short of posts_count.
            for seat_number in range(1, int(parent.attrs['posts_count'])):  # exclude Mayor
                organization.add_post(
                    role=styles_of_address[self.division_id]['Member'],
                    label='{} (seat {})'.format(parent.name, seat_number),
                    division_id=parent.id,
                )

        yield organization
def load_divisions(country):
    """
    Replace the stored divisions of ``country`` with the ones from the CSV.

    ``country`` is the country code used both to build the
    ``ocd-division/country:<code>`` id and to filter ``Division`` rows.
    Nothing is written when the set of loaded divisions equals the stored
    set; otherwise the stored rows for the country are deleted and every
    loaded division is saved, all in one transaction.
    """
    existing_divisions = Division.objects.filter(country=country)

    # Do not rebind ``country``: it must still be the country code when the
    # old rows are deleted below.
    country_division = FileDivision.get('ocd-division/country:{}'.format(country))
    objects = [to_db(country_division)]
    objects.extend(to_db(child) for child in country_division.children(levels=100))

    print('{} divisions found in the CSV, and {} already in the DB'.format(
        len(objects), existing_divisions.count()))

    if set(objects) == set(existing_divisions):
        print(
            'The CSV and the DB contents are exactly the same; no work to be done!'
        )
        return

    # Delete old ids and add new ones all at once.
    with transaction.atomic():
        Division.objects.filter(country=country).delete()
        # NOTE: each object is saved individually on purpose. A previous
        # Division.objects.bulk_create(objects, batch_size=10000) was reported
        # to cause pk consistency issues when doing an update.
        for object_ in objects:
            object_.save()
    print('{} divisions created'.format(len(objects)))
Exemplo n.º 12
0
    def get_organizations(self):
        """
        Yield leader and member posts, then the organization.

        Provinces and territories get no leader post. Children of type
        'place' or of an excluded type are ignored, and the rest must pass
        the ``validFrom`` check to get a member post. If no child survives
        the type filter, numbered seats are added to the organization.
        """
        organization = Organization(self.name, classification=self.classification)

        leader_role = styles_of_address[self.division_id]['Leader']
        member_role = self.member_role or styles_of_address[self.division_id]['Member']

        parent = Division.get(self.division_id)
        # Don't yield posts for premiers.
        if parent._type not in ('province', 'territory'):
            # Yield posts to allow ca_on_toronto to make changes.
            yield Post(
                role=leader_role,
                label=parent.name,
                division_id=parent.id,
                organization_id=organization._id,
            )

        children = [
            child for child in parent.children()
            if not (child._type == 'place' or child._type in self.exclude_types)
        ]

        for child in children:
            accept_undated = not self.skip_null_valid_from
            start_date = child.attrs.get('validFrom')
            if start_date:
                # Accept when already in effect, or when the start date is
                # exactly the configured one.
                accepted = (
                    start_date <= datetime.now().strftime('%Y-%m-%d')
                    or start_date == self.valid_from
                )
            else:
                accepted = accept_undated

            if accepted:
                label = (
                    child.id.rsplit('/', 1)[1].capitalize().replace(':', ' ')
                    if self.use_type_id
                    else child.name
                )
                # Yield posts to allow ca_on_toronto to make changes.
                yield Post(
                    role=member_role,
                    label=label,
                    division_id=child.id,
                    organization_id=organization._id,
                )

        if not children and parent.attrs['posts_count']:
            for seat_number in range(1, int(parent.attrs['posts_count'])):  # exclude Mayor
                organization.add_post(
                    role=member_role,
                    label='{} (seat {})'.format(parent.name, seat_number),
                    division_id=parent.id,
                )

        yield organization
Exemplo n.º 13
0
    def get_organizations(self):
        """
        Yield one organization with a 'candidate' post per 'ed' child division
        whose ``validFrom`` attribute is '2019-09-10'.
        """
        organization = Organization(self.name, classification=self.classification)

        districts = Division.get(self.division_id).children('ed')
        for district in districts:
            if district.attrs['validFrom'] != '2019-09-10':
                continue
            organization.add_post(role='candidate', label=district.name)

        yield organization
Exemplo n.º 14
0
    def get_organizations(self):
        """
        Yield one organization with a 'Mayor' post and a 'Councillor' post for
        each 'ward' child of csd:3520005 whose id contains '2018'.
        """
        organization = Organization(self.name, classification=self.classification)
        organization.add_post(role='Mayor', label=self.division_name, division_id=self.division_id)

        wards = Division.get('ocd-division/country:ca/csd:3520005').children('ward')
        for ward in wards:
            if '2018' not in ward.id:
                continue
            organization.add_post(role='Councillor', label=ward.name, division_id=ward.id)

        yield organization
Exemplo n.º 15
0
def province_or_territory_abbreviations():
    """
    Return the mapping from province or territory name to abbreviation.

    Both the division name and its ``name_fr`` attribute are keys, plus the
    extra key 'PEI'. The abbreviation is the upper-cased text after the last
    ':' of the division id. The mapping is filled on the first call and
    reused afterwards.
    """
    memo = province_or_territory_abbreviation_memo
    if memo:
        return memo

    memo['PEI'] = 'PE'
    for division in Division.all('ca'):
        if division._type not in ('province', 'territory'):
            continue
        abbreviation = division.id.rsplit(':', 1)[1].upper()
        memo[division.name] = abbreviation
        memo[division.attrs['name_fr']] = abbreviation
    return memo
Exemplo n.º 16
0
def divisions_with_boroughs():
    """
    Returns the OCD identifiers for divisions with boroughs.

    The result is the parent id of every division of type 'borough'. It is
    computed on the first call and reused afterwards.
    """
    memo = divisions_with_boroughs_memo
    if memo:
        return memo

    for division in Division.all('ca', from_csv=ocd_division_csv):
        if division._type != 'borough':
            continue
        memo.add(division.parent.id)
    return memo
Exemplo n.º 17
0
def province_or_territory_abbreviations():
    """
    Return a name-to-abbreviation lookup for provinces and territories.

    The lookup is built once. It contains 'PEI', every province or territory
    division name, and every ``name_fr`` attribute, each mapped to the
    upper-cased suffix of the division id.
    """
    lookup = province_or_territory_abbreviation_memo
    if not lookup:
        lookup['PEI'] = 'PE'
        regions = (
            division for division in Division.all('ca')
            if division._type in ('province', 'territory')
        )
        for region in regions:
            # The abbreviation is whatever follows the last ':' in the id.
            short = region.id.rsplit(':', 1)[1].upper()
            lookup[region.name] = short
            lookup[region.attrs['name_fr']] = short
    return lookup
Exemplo n.º 18
0
    def get_organizations(self):
        """
        Yield one organization holding the mayor, borough mayor and city
        councillor posts.

        Borough 18 gets no borough mayor post. Districts whose borough number
        is 2, 4, 6 or 9 get no councillor post, and boroughs 2 and 4 get one
        borough-wide councillor post each.
        """
        organization = Organization(self.name, classification=self.classification)

        organization.add_post(role='Maire de la Ville de Montréal', label=self.division_name, division_id=self.division_id)  # 0,00

        # Maire de la Ville de Montréal is Maire d'arrondissement for Ville-Marie
        city_mayor_borough = 'ocd-division/country:ca/csd:2466023/borough:18'
        for borough in Division.get(self.division_id).children('borough'):  # 18
            if borough.id == city_mayor_borough:
                continue
            organization.add_post(role="Maire d'arrondissement", label=borough.name, division_id=borough.id)

        boroughs_without_district_posts = ('2', '4', '6', '9')
        for district in Division.get(self.division_id).children('district'):  # 46
            # The district type id looks like "<borough>.<district>"; keep the
            # borough part.
            type_id_part = district.id.rsplit(':', 1)[1]
            borough_number = type_id_part.split('.', 1)[0]
            if borough_number in boroughs_without_district_posts:
                continue
            organization.add_post(role='Conseiller de la ville', label=district.name, division_id=district.id)

        organization.add_post(role='Conseiller de la ville', label='Anjou', division_id='ocd-division/country:ca/csd:2466023/borough:2')
        organization.add_post(role='Conseiller de la ville', label='Lachine', division_id='ocd-division/country:ca/csd:2466023/borough:4')

        yield organization
Exemplo n.º 19
0
def divisions_with_boroughs():
    """
    Returns the OCD identifiers for divisions with boroughs.

    The set is built on the first call, from the parent id of each
    'borough'-type division, and is reused on later calls.
    """
    if not divisions_with_boroughs_memo:
        boroughs = (
            division for division in Division.all('ca', from_csv=ocd_division_csv)
            if division._type == 'borough'
        )
        for borough in boroughs:
            divisions_with_boroughs_memo.add(borough.parent.id)
    return divisions_with_boroughs_memo
Exemplo n.º 20
0
    def handle(self, args, other):
        """
        Interactively scaffold a new jurisdiction module in ``args.module``.

        Raises ``CommandError`` if the directory already exists or if
        ``Division.get`` rejects the division id. The division id prompt is
        repeated until a non-empty answer is given. ``other`` is not used.
        """
        module = args.module
        if os.path.exists(module):
            raise CommandError('Directory {} already exists'.format(repr(module)))

        # Keep asking until a non-empty division id is entered.
        while True:
            division = prompt(
                'division id (see https://github.com/opencivicdata/'
                'ocd-division-ids/tree/master/identifiers): ')
            if division:
                break
            print("\nERROR: Division ID is required.\n")

        try:
            Division.get(division)
        except (ValueError, IndexError):
            raise CommandError('Division ID {} is invalid'.format(repr(division)))

        name = prompt('jurisdiction name (e.g. City of Seattle): ')
        classification_choices = ', '.join(JURISDICTION_CLASSIFICATIONS)
        classification = prompt('classification (can be: {}): '.format(classification_choices))
        url = prompt('official url (e.g. http://www.seattle.gov/): ')

        os.makedirs(module)

        # Will default to True until they pick one, then defaults to False.
        selected_scraper_types = []
        for stype in CLASS_DICT.keys():
            nothing_selected_yet = not selected_scraper_types
            default, hint = ('Y', '[Y/n]') if nothing_selected_yet else ('N', '[y/N]')
            answer = prompt('create {} scraper? {}: '.format(stype, hint), default)
            if answer.upper() == 'Y':
                selected_scraper_types.append(stype)

        write_jurisdiction_template(module, module, name, division, classification, url,
                                    selected_scraper_types)
Exemplo n.º 21
0
    def get_organizations(self):
        """
        Yield one organization with a 'Maire' post and 'Conseiller' posts.

        Each 'district' child gets one post, except 'Greenfield Park', which
        gets three numbered seats.
        """
        organization = Organization(self.name, classification=self.classification)
        organization.add_post(role='Maire', label=self.division_name, division_id=self.division_id)

        for district in Division.get(self.division_id).children('district'):
            if district.name != 'Greenfield Park':
                organization.add_post(role='Conseiller', label=district.name, division_id=district.id)
                continue
            # Greenfield Park has seats 1 to 3.
            for seat_number in (1, 2, 3):
                seat_label = '{} (siège {})'.format(district.name, seat_number)
                organization.add_post(role='Conseiller', label=seat_label, division_id=district.id)

        yield organization
Exemplo n.º 22
0
 def handle(self, *args, **options):
     """
     Log a warning for every boundary that has no matching division id.

     For each entry of ``settings.IMAGO_BOUNDARY_MAPPINGS`` the boundaries
     are fetched from represent.opennorth.ca. The expected division id is the
     entry's ``prefix`` plus its ``boundary_key``, which is either a callable
     applied to the boundary object or a key into it. ``args`` and
     ``options`` are not used.
     """
     ids = {division.id for division in Division.all('ca')}
     for slug, data in settings.IMAGO_BOUNDARY_MAPPINGS.items():
         url = 'https://represent.opennorth.ca/boundaries/{}/?limit=0'.format(
             slug)
         # These do not change per boundary, so look them up once per set.
         prefix = data['prefix']
         boundary_key = data['boundary_key']
         key_is_callable = callable(boundary_key)
         for obj in requests.get(url).json()['objects']:
             if key_is_callable:
                 expected = prefix + boundary_key(obj)
             else:
                 expected = prefix + obj[boundary_key]
             if expected not in ids:
                 # Logger.warn is a deprecated alias of Logger.warning.
                 log.warning('No match for {} from {}'.format(expected, url))
Exemplo n.º 23
0
    def get_organizations(self):
        """
        Yield one organization carrying 'candidate' posts.

        The jurisdiction's own division always gets a post. An 'ed' child of
        the Ontario division gets one when it passes the ``validFrom`` check
        and its name is in the fixed allow-list below.
        """
        organization = Organization(self.name, classification=self.classification)

        division_names = {
            'Beaches—East York',
            'Davenport',
            'Don Valley East',
            'Don Valley North',
            'Don Valley West',
            'Eglinton—Lawrence',
            'Etobicoke Centre',
            'Etobicoke North',
            'Etobicoke—Lakeshore',
            'Humber River—Black Creek',
            'Parkdale—High Park',
            'Scarborough Centre',
            'Scarborough North',
            'Scarborough Southwest',
            'Scarborough—Agincourt',
            'Scarborough—Guildwood',
            'Scarborough—Rouge Park',
            'Spadina—Fort York',
            'Toronto Centre',
            'Toronto—Danforth',
            'Toronto—St. Paul\'s',
            'University—Rosedale',
            'Willowdale',
            'York Centre',
            'York South—Weston',
        }

        jurisdiction_division = Division.get(self.division_id)
        organization.add_post(
            role='candidate',
            label=jurisdiction_division.name,
            division_id=jurisdiction_division.id,
        )

        ontario = Division.get('ocd-division/country:ca/province:on')
        for district in ontario.children('ed'):
            accept_undated = not self.skip_null_valid_from
            start_date = district.attrs.get('validFrom')
            if not start_date:
                # Undated districts are kept unless skipping is requested.
                if not accept_undated:
                    continue
            elif not (start_date <= datetime.now().strftime('%Y-%m-%d')
                      or start_date == self.valid_from):
                # Dated districts must be in effect or start on valid_from.
                continue

            if district.name not in division_names:
                continue
            organization.add_post(
                role='candidate',
                label=district.name,
                division_id=district.id,
            )

        yield organization
Exemplo n.º 24
0
    def get_organizations(self):
        """
        Yield the parent organization, then the 'Senate' and the
        'House of Commons'.

        The lower house gets an 'MP' post for each 'ed' child division whose
        ``validFrom`` attribute is set and not later than today.
        """
        parliament = Organization(self.name, classification=self.classification)
        yield parliament

        upper = Organization('Senate', classification='upper', parent_id=parliament)
        lower = Organization('House of Commons', classification='lower', parent_id=parliament)

        for riding in Division.get(self.division_id).children('ed'):
            valid_from = riding.attrs.get('validFrom')
            if not valid_from:
                continue
            # ISO dates compare correctly as strings.
            if valid_from <= datetime.now().strftime('%Y-%m-%d'):
                lower.add_post(role='MP', label=riding.name, division_id=riding.id)

        yield upper
        yield lower
Exemplo n.º 25
0
    def get_organizations(self):
        """
        Yield one organization with a 'candidate' post for the jurisdiction's
        own division and one for each of its 'ward' children.
        """
        organization = Organization(self.name, classification=self.classification)
        organization.add_post(role='candidate', label=self.division_name, division_id=self.division_id)

        wards = Division.get(self.division_id).children('ward')
        for ward in wards:
            organization.add_post(role='candidate', label=ward.name, division_id=ward.id)

        yield organization
Exemplo n.º 26
0
Arquivo: init.py Projeto: fgregg/pupa
    def handle(self, args, other):
        """
        Interactively scaffold a new jurisdiction module in ``args.module``.

        Prompts for the jurisdiction name, division id, classification and
        official URL, validates the division id with ``Division.get``, creates
        the ``args.module`` directory, asks which scraper types to create and
        then calls ``write_jurisdiction_template``.

        Exits with status 1 when the division id is rejected, and raises
        ``CommandError`` when the ``args.module`` directory already exists.
        ``other`` is not used.
        """
        name = prompt('jurisdiction name (e.g. City of Seattle): ')
        division = prompt(
            'division id (look this up in the opencivicdata/ocd-division-ids '
            'repository): ')
        classification = prompt('classification (can be: {}): '.format(
            ', '.join(JURISDICTION_CLASSIFICATIONS)))
        url = prompt('official URL: ')

        try:
            Division.get(division)
        except (ValueError, IndexError):
            # A malformed id may presumably surface as IndexError rather than
            # ValueError (TODO confirm); report both the same way instead of
            # showing a traceback.
            print(
                "\nERROR: Division ID is invalid.",
                "Please find the correct division_id here",
                "https://github.com/opencivicdata/ocd-division-ids/tree/master/identifiers",
                "or contact [email protected] to add a new country\n"
            )
            sys.exit(1)

        if os.path.exists(args.module):
            raise CommandError(args.module + ' directory already exists')
        os.makedirs(args.module)

        # The answer defaults to "yes" until one scraper type has been
        # selected; after that it defaults to "no".
        selected_scraper_types = []
        for stype in CLASS_DICT:
            already_selected = bool(selected_scraper_types)
            prompt_str = 'create {} scraper? {}: '.format(
                stype, '[y/N]' if already_selected else '[Y/n]')
            default = 'N' if already_selected else 'Y'
            result = prompt(prompt_str, default).upper()
            if result == 'Y':
                selected_scraper_types.append(stype)

        write_jurisdiction_template(args.module, args.module, name, division,
                                    classification, url,
                                    selected_scraper_types)
Exemplo n.º 27
0
    def handle(self, args, other):
        """
        Create a new jurisdiction module in ``args.module`` from prompted
        answers.

        Raises ``CommandError`` when the directory already exists or when
        ``Division.get`` rejects the division id. An empty division id is
        asked for again. ``other`` is not used.
        """
        if os.path.exists(args.module):
            message = 'Directory {} already exists'.format(repr(args.module))
            raise CommandError(message)

        division = prompt('division id (see https://github.com/opencivicdata/'
                          'ocd-division-ids/tree/master/identifiers): ')
        while not division:
            # An empty answer is not acceptable; complain and ask again.
            print("\nERROR: Division ID is required.\n")
            division = prompt('division id (see https://github.com/opencivicdata/'
                              'ocd-division-ids/tree/master/identifiers): ')

        try:
            Division.get(division)
        except (ValueError, IndexError):
            message = 'Division ID {} is invalid'.format(repr(division))
            raise CommandError(message)

        name = prompt('jurisdiction name (e.g. City of Seattle): ')
        classification = prompt(
            'classification (can be: {}): '.format(', '.join(JURISDICTION_CLASSIFICATIONS)))
        url = prompt('official url (e.g. http://www.seattle.gov/): ')

        os.makedirs(args.module)

        # Will default to True until they pick one, then defaults to False.
        selected_scraper_types = []
        for stype in CLASS_DICT.keys():
            default = 'N' if selected_scraper_types else 'Y'
            hint = '[y/N]' if selected_scraper_types else '[Y/n]'
            result = prompt('create {} scraper? {}: '.format(stype, hint), default).upper()
            if result != 'Y':
                continue
            selected_scraper_types.append(stype)

        write_jurisdiction_template(
            args.module, args.module, name, division, classification, url,
            selected_scraper_types)
Exemplo n.º 28
0
def validate_spreadsheet(url, identifier_header, geographic_name_header):
    """
    Prints a line for each spreadsheet row whose geographic name differs from
    the name of the division its identifier resolves to.

    Identifiers of length 2 are looked up by ``sgc`` attribute, those of
    length 4 become 'cd' ids and those of length 7 become 'csd' ids. Any
    other identifier is passed to ``Division.get`` unchanged.
    """
    sgc_to_id = {
        division.attrs['sgc']: division.id
        for division in Division.all('ca', from_csv=ocd_division_csv)
    }

    for row in csv_dict_reader(url):
        identifier = row[identifier_header]
        length = len(identifier)

        if length == 2:
            identifier = sgc_to_id[identifier]
        elif length == 4:
            identifier = 'ocd-division/country:ca/cd:{}'.format(identifier)
        elif length == 7:
            identifier = 'ocd-division/country:ca/csd:{}'.format(identifier)

        division = Division.get(identifier)
        spreadsheet_name = row[geographic_name_header]
        if spreadsheet_name != division.name:
            print('{}: name: {} not {}'.format(identifier, division.name, spreadsheet_name))
Exemplo n.º 29
0
    def get_organizations(self):
        """
        Yield one organization with a 'Mayor' post plus a 'Councillor' post
        for every 'ward' child of csd:3520005 that has '2018' in its id.
        """
        organization = Organization(self.name, classification=self.classification)
        organization.add_post(role='Mayor', label=self.division_name, division_id=self.division_id)

        municipality = Division.get('ocd-division/country:ca/csd:3520005')
        wards_2018 = (ward for ward in municipality.children('ward') if '2018' in ward.id)
        for ward in wards_2018:
            organization.add_post(role='Councillor', label=ward.name, division_id=ward.id)

        yield organization
Exemplo n.º 30
0
def validate_spreadsheet(url, identifier_header, geographic_name_header):
    """
    Reports spreadsheet rows whose geographic name does not match the
    division that the row's identifier resolves to.

    A 2-character identifier is translated through the ``sgc`` lookup, a
    4-character one is treated as a 'cd' code and a 7-character one as a
    'csd' code. Identifiers of any other length are used as given.
    """
    # Templates for identifiers that only need a prefix, keyed by length.
    templates = {
        4: 'ocd-division/country:ca/cd:{}',
        7: 'ocd-division/country:ca/csd:{}',
    }

    sgc_to_id = {}
    for known in Division.all('ca', from_csv=ocd_division_csv):
        sgc_to_id[known.attrs['sgc']] = known.id

    reader = csv_dict_reader(url)
    for row in reader:
        raw_identifier = row[identifier_header]
        size = len(raw_identifier)

        if size == 2:
            identifier = sgc_to_id[raw_identifier]
        elif size in templates:
            identifier = templates[size].format(raw_identifier)
        else:
            identifier = raw_identifier

        division = Division.get(identifier)
        if row[geographic_name_header] != division.name:
            print('{}: name: {} not {}'.format(
                identifier, division.name, row[geographic_name_header]))
Exemplo n.º 31
0
def load_mapping(boundary_set_id,
                 key,
                 prefix,
                 boundary_key='external_id',
                 ignore=None,
                 quiet=False):
    """
    Bulk-create DivisionGeometry rows pairing boundaries with divisions.

    Each boundary of the given boundary set is matched to a division by
    looking up `prefix` + the boundary's property in a table keyed by the
    divisions' `key` attribute (or by the division id when that attribute is
    empty). `boundary_key` is either a field name or a callable that receives
    the boundary's values dict. Unmatched boundaries whose name matches the
    `ignore` pattern are counted; the others are printed unless `quiet`.
    """
    ignore_pattern = re.compile(ignore) if ignore else ignore
    skipped = 0

    country = Division.get('ocd-division/country:' + settings.IMAGO_COUNTRY)
    lookup = {}
    for division in country.children(levels=100):
        # Fall back to the division's own id when it has no value for `key`.
        lookup[division.attrs[key] or division.id] = division.id

    print('processing', boundary_set_id)

    boundary_set = BoundarySet.objects.get(pk=boundary_set_id)
    key_is_callable = callable(boundary_key)
    extra_fields = [] if key_is_callable else [boundary_key]

    matches = []
    for boundary in boundary_set.boundaries.values('id', 'name', *extra_fields):
        external_id = boundary_key(boundary) if key_is_callable else boundary[boundary_key]
        ocd_id = lookup.get(prefix + external_id)
        if ocd_id:
            matches.append(DivisionGeometry(division_id=ocd_id, boundary_id=boundary['id']))
        elif ignore_pattern and ignore_pattern.match(boundary['name']):
            skipped += 1
        elif not quiet:
            print('unmatched external id', boundary['name'], external_id)

    DivisionGeometry.objects.bulk_create(matches)

    if skipped:
        print('ignored {} unmatched external ids'.format(skipped))
Exemplo n.º 32
0
    def handle(self, *args, **options):
        """
        Print module name, report status and population for each scraper module.

        The first positional argument, when given, is an integer population
        threshold: only modules whose population is below it are printed. Any
        remaining arguments are module names; without them every entry of the
        `scrapers` directory is considered.
        """
        sys.path.append(os.path.abspath('scrapers'))

        remaining = list(args)
        threshold = int(remaining.pop(0)) if remaining else None
        module_names = remaining or os.listdir('scrapers')

        urls = [
            # Provinces and territories
            'https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=101&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-101.CSV',
            # Census subdivisions
            'https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=701&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-701.CSV',
            # Census divisions
            'https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=301&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-301.CSV',
        ]

        # Population by geographic code: first column is the key, fifth the count.
        populations = {}
        for url in urls:
            response = requests.get(url)
            response.encoding = 'ISO-8859-1'
            rows = csv.reader(StringIO(response.text))
            next(rows)  # title
            next(rows)  # headers
            for row in rows:
                # The first empty row ends the data section.
                if not row:
                    break
                populations[row[0]] = int(row[4] or 0)

        non_modules = ('.git', '_cache', '_data', '__pycache__', 'disabled')
        for module_name in module_names:
            if module_name in non_modules or not os.path.isdir(os.path.join('scrapers', module_name)):
                continue

            division_id = module_name_to_metadata(module_name)['division_id']
            try:
                report = Report.objects.get(module=module_name)
            except Report.DoesNotExist:
                status = 'unknown'
            else:
                status = 'error' if report.exception else 'success'

            # Without an `sgc` attribute, use the type id of the last path segment.
            sgc = Division.get(division_id).attrs['sgc'] or division_id.rsplit('/', 1)[-1].split(':', 1)[-1]
            if sgc == 'ca':
                sgc = '01'

            population = populations.get(sgc, 0)
            if threshold and population >= threshold:
                continue
            print('%-32s %-7s %8d' % (module_name, status, population))
Exemplo n.º 33
0
    def get_organizations(self):
        """Yield a committee with a chair post and 22 trustee posts per school district."""
        board = Organization(self.name, classification='committee')
        board.add_source(self.url)

        parent = Division.get(self.division_id)
        for district in parent.children('school_district'):
            board.add_post(role='Chair', label=district.name, division_id=district.id)
            for seat in range(22):  # XXX made-up number
                board.add_post(
                    role='Trustee',
                    label='{} (seat {})'.format(district.name, seat),
                    division_id=district.id,
                )

        yield board
Exemplo n.º 34
0
def load_mapping(boundary_set_id, key, prefix, boundary_key='external_id', ignore=None, quiet=False):
    """
    Link the boundaries of a boundary set to OCD divisions.

    Builds a lookup from each division's `key` attribute (or its id when the
    attribute is empty) to the division id, then creates one DivisionGeometry
    per boundary whose `prefix` + property is found in that lookup.
    `boundary_key` may be a field name or a callable taking the boundary's
    values dict. Boundaries without a match are reported, or merely counted
    when their name matches the `ignore` regular expression.
    """
    if ignore:
        ignore = re.compile(ignore)

    root = Division.get('ocd-division/country:' + settings.IMAGO_COUNTRY)
    geoid_to_ocd_id = {
        (descendant.attrs[key] if descendant.attrs[key] else descendant.id): descendant.id
        for descendant in root.children(levels=100)
    }

    print('processing', boundary_set_id)

    boundary_set = BoundarySet.objects.get(pk=boundary_set_id)
    if callable(boundary_key):
        value_fields = ()
        read_property = boundary_key
    else:
        value_fields = (boundary_key,)

        def read_property(boundary):
            return boundary[boundary_key]

    geometries = []
    ignored_count = 0
    for boundary in boundary_set.boundaries.values('id', 'name', *value_fields):
        boundary_property = read_property(boundary)
        ocd_id = geoid_to_ocd_id.get(prefix + boundary_property)

        if ocd_id:
            geometries.append(DivisionGeometry(division_id=ocd_id, boundary_id=boundary['id']))
            continue

        if ignore and ignore.match(boundary['name']):
            ignored_count += 1
        elif not quiet:
            print('unmatched external id', boundary['name'], boundary_property)

    DivisionGeometry.objects.bulk_create(geometries)

    if ignored_count:
        print('ignored {} unmatched external ids'.format(ignored_count))
Exemplo n.º 35
0
    def handle(self, *args, **options):
        """
        Print module name, report status and 2016 population for each scraper module.

        `options['threshold']`, when truthy, limits the output to modules whose
        population is below it. `options['module']` lists the modules to
        inspect; when empty, every entry of the `scrapers` directory is used.
        """
        sys.path.append(os.path.abspath('scrapers'))

        threshold = options['threshold']
        module_names = options['module'] or os.listdir('scrapers')

        # @see http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/index-eng.cfm
        urls = [
            # Provinces and territories
            'http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=101&OFT=FULLCSV',
            # Census subdivisions
            'http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=701&OFT=FULLCSV',
            # Census divisions
            'http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=301&OFT=FULLCSV',
        ]

        # Population keyed by the 'Geographic code' column.
        populations = {}
        for url in urls:
            response = requests.get(url, verify=settings.SSL_VERIFY)
            response.encoding = 'iso-8859-1'
            for row in csv.DictReader(StringIO(response.text)):
                if not row:
                    break
                populations[row['Geographic code']] = int(row['Population, 2016'] or 0)

        for module_name in module_names:
            # Only directories that are Python packages count as scraper modules.
            if not os.path.isfile(os.path.join('scrapers', module_name, '__init__.py')):
                continue

            division_id = module_name_to_metadata(module_name)['division_id']
            try:
                report = Report.objects.get(module=module_name)
            except Report.DoesNotExist:
                status = 'unknown'
            else:
                status = 'error' if report.exception else 'success'

            # Without an `sgc` attribute, use the text after the last colon of the id.
            sgc = Division.get(division_id).attrs['sgc'] or division_id.rsplit(':', 1)[1]

            population = populations.get(sgc, 0)
            if threshold and population >= threshold:
                continue
            print('{:<32} {:<7} {:8}'.format(module_name, status, population))
def load_divisions(country):
    """
    Replace the stored divisions of `country` with those read from the CSV.

    Nothing is written when the CSV-derived objects and the stored divisions
    compare equal as sets; otherwise the stored rows are deleted and the new
    ones bulk-created inside a single transaction.
    """
    existing_divisions = Division.objects.filter(country=country)

    root = FileDivision.get('ocd-division/country:{}'.format(country))
    objects = [to_db(root)]
    objects.extend(to_db(descendant) for descendant in root.children(levels=100))
    csv_total = len(objects)

    print('{} divisions found in the CSV, and {} already in the DB'.format(
        csv_total, existing_divisions.count()))

    if set(objects) == set(existing_divisions):
        print('The CSV and the DB contents are exactly the same; no work to be done!')
        return

    # delete old ids and add new ones all at once
    with transaction.atomic():
        Division.objects.filter(country=country).delete()
        Division.objects.bulk_create(objects, batch_size=10000)
    print('{} divisions created'.format(csv_total))
Exemplo n.º 37
0
    def get_organizations(self):
        """
        Yield the parliament, then its upper and lower chambers.

        The lower chamber gets one MP post for every 'ed' child division whose
        `validFrom` attribute is set and not later than today's date.
        """
        parliament = Organization(self.name, classification=self.classification)
        yield parliament

        upper = Organization('Senate', classification='upper', parent_id=parliament)
        lower = Organization('House of Commons', classification='lower', parent_id=parliament)

        for riding in Division.get(self.division_id).children('ed'):
            valid_from = riding.attrs.get('validFrom')
            if not valid_from:
                continue
            # String comparison against today's date formatted as YYYY-MM-DD.
            if valid_from <= datetime.now().strftime('%Y-%m-%d'):
                lower.add_post(role='MP', label=riding.name, division_id=riding.id)

        yield upper
        yield lower
Exemplo n.º 38
0
def load_divisions(country):
    """
    Synchronize the database divisions of `country` with the CSV contents.

    Prints how many divisions each side holds. When the two sets differ, the
    stored divisions are deleted and the CSV-derived ones are bulk-created in
    one transaction; when they are equal nothing is changed.
    """
    in_db = Division.objects.filter(country=country)

    top = FileDivision.get('ocd-division/country:{}'.format(country))
    from_csv = [to_db(top)] + [to_db(division) for division in top.children(levels=100)]

    summary = '{} divisions found in the CSV, and {} already in the DB'
    print(summary.format(len(from_csv), in_db.count()))

    unchanged = set(from_csv) == set(in_db)
    if not unchanged:
        # delete old ids and add new ones all at once
        with transaction.atomic():
            Division.objects.filter(country=country).delete()
            Division.objects.bulk_create(from_csv, batch_size=10000)
        print('{} divisions created'.format(len(from_csv)))
    else:
        print('The CSV and the DB contents are exactly the same; no work to be done!')
Exemplo n.º 39
0
def get_definition(division_id, aggregation=False):
    """
    Returns the expected configuration for a given division.

    :param division_id: OCD division identifier, resolved with
        `Division.get(..., from_csv=ocd_division_csv)`.
    :param aggregation: when true, province/territory divisions get the
        `ca_{}_municipalities` module name and a '... Municipalities' name,
        and 'Municipalities' is appended to the class name.
    :returns: a dict with the keys `module_name`, `name`, `class_name`,
        `url` and `division_name`.
    :raises Exception: if the division's type is not one of country,
        province, territory, cd, csd or arrondissement.
    """
    # The type-name map is filled only once, the first time it is found empty.
    if not ocdid_to_type_name_map:
        # Map census division type codes to names.
        census_division_type_names = {}
        document = lxml.html.fromstring(requests.get('https://www12.statcan.gc.ca/census-recensement/2016/ref/dict/tab/t1_4-eng.cfm').content)
        for text in document.xpath('//table//th[@headers]/text()'):
            # Header text is "<code> – <name>"; keep the part of the name before ' / '.
            code, name = text.split(' – ', 1)
            census_division_type_names[code] = name.split(' / ', 1)[0]

        # Map census subdivision type codes to names.
        census_subdivision_type_names = {}
        document = lxml.html.fromstring(requests.get('https://www12.statcan.gc.ca/census-recensement/2016/ref/dict/tab/t1_5-eng.cfm').content)
        for text in document.xpath('//table//th[@headers]/text()'):
            code, name = text.split(' – ', 1)  # non-breaking space
            census_subdivision_type_names[code] = name.split(' / ', 1)[0]

        # Map OCD identifiers to census types.
        for division in Division.all('ca', from_csv=ocd_division_csv):
            if division._type == 'cd':
                ocdid_to_type_name_map[division.id] = census_division_type_names[division.attrs['classification']]
            elif division._type == 'csd':
                ocdid_to_type_name_map[division.id] = census_subdivision_type_names[division.attrs['classification']]

    division = Division.get(division_id, from_csv=ocd_division_csv)
    ocd_type_id = type_id(division.id)

    expected = {}
    # Initial letters that select the elided French form "d'" below.
    vowels = ('A', 'À', 'E', 'É', 'I', 'Î', 'O', 'Ô', 'U')

    # Determine the module name, name and classification.
    if division._type == 'country':
        expected['module_name'] = 'ca'
        expected['name'] = 'Parliament of Canada'

    elif division._type in ('province', 'territory'):
        pattern = 'ca_{}_municipalities' if aggregation else 'ca_{}'
        expected['module_name'] = pattern.format(ocd_type_id)
        if aggregation:
            expected['name'] = '{} Municipalities'.format(division.name)
        elif ocd_type_id in ('nl', 'ns'):
            expected['name'] = '{} House of Assembly'.format(division.name)
        elif ocd_type_id == 'qc':
            expected['name'] = 'Assemblée nationale du Québec'
        else:
            expected['name'] = 'Legislative Assembly of {}'.format(division.name)

    elif division._type == 'cd':
        expected['module_name'] = 'ca_{}_{}'.format(province_or_territory_abbreviation(division.id), slug(division.name))
        name_infix = ocdid_to_type_name_map[division.id]
        # Shorten the census type name so the result reads "... Regional Council".
        if name_infix == 'Regional municipality':
            name_infix = 'Regional'
        expected['name'] = '{} {} Council'.format(division.name, name_infix)

    elif division._type == 'csd':
        expected['module_name'] = 'ca_{}_{}'.format(province_or_territory_abbreviation(division.id), slug(division.name))
        # Type ids starting with '24' get a French council name.
        if ocd_type_id[:2] == '24':
            if division.name[0] in vowels:
                expected['name'] = "Conseil municipal d'{}".format(division.name)
            else:
                expected['name'] = "Conseil municipal de {}".format(division.name)
        else:
            name_infix = ocdid_to_type_name_map[division.id]
            # Turn "... municipality" type names into adjectives for the council name.
            if name_infix in ('Municipality', 'Specialized municipality'):
                name_infix = 'Municipal'
            elif name_infix == 'District municipality':
                name_infix = 'District'
            elif name_infix == 'Regional municipality':
                name_infix = 'Regional'
            expected['name'] = '{} {} Council'.format(division.name, name_infix)

    elif division._type == 'arrondissement':
        # The module name combines the parent's abbreviation and slug with this division's slug.
        expected['module_name'] = 'ca_{}_{}_{}'.format(province_or_territory_abbreviation(division.parent.id), slug(division.parent.name), slug(division.name))
        if division.name[0] in vowels:
            expected['name'] = "Conseil d'arrondissement d'{}".format(division.name)
        elif division.name[:3] == 'Le ':
            # "de Le X" contracts to "du X".
            expected['name'] = "Conseil d'arrondissement du {}".format(division.name[3:])
        else:
            expected['name'] = "Conseil d'arrondissement de {}".format(division.name)

    else:
        raise Exception('{}: Unrecognized OCD type {}'.format(division.id, division._type))

    # Determine the class name.
    # Strip apostrophes and periods, treat dashes like hyphens, split on spaces
    # and hyphens, then capitalize every word that does not already start with
    # an ASCII capital before joining and transliterating with unidecode.
    class_name_parts = re.split('[ -]', re.sub("[—–]", '-', re.sub("['.]", '', division.name)))
    expected['class_name'] = unidecode(str(''.join(word if re.match('[A-Z]', word) else word.capitalize() for word in class_name_parts)))
    if aggregation:
        expected['class_name'] += 'Municipalities'

    # Determine the url.
    expected['url'] = division.attrs['url']

    # Determine the division name.
    expected['division_name'] = division.name

    return expected
Exemplo n.º 40
0
def test_children():
    """The `country:ua` division has 25 distinct 'region' children."""
    ukraine = Division.get("ocd-division/country:ua")
    regions = list(ukraine.children("region", duplicates=False))
    assert len(regions) == 25
Exemplo n.º 41
0
def get_definition(division_id, path=None):
    """
    Returns the expected contents of a definition file.

    :param division_id: OCD division identifier, resolved with
        `Division.get(..., from_csv=ocd_division_csv)`.
    :param path: path of the definition file; only inspected for substrings
        such as 'boundaries/ca_ns_districts/' to choose the authority of
        census divisions and subdivisions. May be omitted.
    :returns: a `(slug, config)` tuple. `slug` is a string, a compiled regular
        expression (divisions with boroughs whose type id starts with '24'),
        or None (unnamed borough). `config` holds the expected `domain` and
        `authority`.
    :raises Exception: if the division's type is not one of country,
        province, territory, cd, csd or borough.
    """
    config = {}

    # `path` defaults to None, but the substring checks below need a string;
    # without this, omitting `path` raised TypeError for codes '12', '13', '24'.
    if path is None:
        path = ''

    division = Division.get(division_id, from_csv=ocd_division_csv)

    # Determine slug, domain and authority.
    name = division.name
    if not name:
        print('%-60s unknown name: check slug and domain manually' % division.id)

    if division._type == 'country':
        slug = 'Federal electoral districts'
        config['domain'] = name
        config['authority'] = ['Her Majesty the Queen in Right of Canada']

    elif division._type in ('province', 'territory'):
        slug = '%s electoral districts' % name
        config['domain'] = name
        config['authority'] = ['Her Majesty the Queen in Right of %s' % name]

    elif division._type in ('cd', 'csd'):
        # The first two characters of the type id select the rules below.
        province_or_territory_sgc_code = type_id(division.id)[:2]

        if province_or_territory_sgc_code == '24' and division.id in divisions_with_boroughs():
            # Either wording is acceptable, so the expectation is a pattern.
            slug = re.compile(r'\A%s (boroughs|districts)\Z' % name)
        elif province_or_territory_sgc_code == '12' and division.attrs['classification'] != 'T':
            slug = '%s districts' % name
        elif province_or_territory_sgc_code == '47' and division.attrs['classification'] != 'CY':
            slug = '%s divisions' % name
        elif province_or_territory_sgc_code == '48' and division.attrs['classification'] == 'MD':
            slug = '%s divisions' % name
        elif province_or_territory_sgc_code == '24':
            if division.id in quartiers:
                slug = '%s quartiers' % name
            else:
                slug = '%s districts' % name
        else:
            slug = '%s wards' % name

        config['domain'] = '%s, %s' % (name, province_or_territory_abbreviation(division.id))

        if province_or_territory_sgc_code == '12' and 'boundaries/ca_ns_districts/' in path:
            config['authority'] = ['Her Majesty the Queen in Right of Nova Scotia']
        elif province_or_territory_sgc_code == '13' and 'boundaries/ca_nb_wards/' in path:
            config['authority'] = ['Her Majesty the Queen in Right of New Brunswick']
        elif province_or_territory_sgc_code == '24' and 'boundaries/ca_qc_' in path:
            config['authority'] = ['Directeur général des élections du Québec']
        elif province_or_territory_sgc_code == '47' and division.attrs['classification'] != 'CY':
            config['authority'] = ['MuniSoft']
        elif division._type == 'csd':
            config['authority'] = authorities + [division.attrs['organization_name']]
        else:
            config['authority'] = ['']  # We have no expectation for the authority of a Census division

    elif division._type == 'borough':
        province_or_territory_sgc_code = type_id(division.parent.id)[:2]

        if name:
            slug = '%s districts' % name
            config['domain'] = '%s, %s, %s' % (name, division.parent.name, province_or_territory_abbreviation(division.parent.id))
        else:
            # An unnamed borough yields no slug or domain expectation.
            slug = None
            config['domain'] = None

        if province_or_territory_sgc_code == '24':
            config['authority'] = ['Directeur général des élections du Québec']
        else:
            config['authority'] = [division.parent.attrs['organization_name']]

    else:
        raise Exception('%s: Unrecognized OCD type %s' % (division.id, division._type))

    return (slug, config)
Exemplo n.º 42
0
def province_and_territory_codes():
    """Return the memoized map from `sgc` attribute to OCD id for provinces and territories."""
    if not province_and_territory_codes_memo:
        # First call (or still-empty memo): fill it from the division list.
        province_and_territory_codes_memo.update(
            (division.attrs['sgc'], division.id)
            for division in Division.all('ca')
            if division._type in ('province', 'territory')
        )
    return province_and_territory_codes_memo
Exemplo n.º 43
0
def province_or_territory_abbreviation(code):
    """
    Return the upper-cased province or territory type id for an OCD identifier.

    The lookup key is the first two characters of `type_id(code)`; the memo is
    built from the provinces' and territories' `sgc` attributes on first use.
    """
    memo = province_or_territory_abbreviation_memo
    if not memo:
        for division in Division.all('ca', from_csv=ocd_division_csv):
            if division._type not in ('province', 'territory'):
                continue
            memo[division.attrs['sgc']] = type_id(division.id).upper()

    sgc_prefix = type_id(code)[:2]
    return memo[sgc_prefix]
Exemplo n.º 44
0
def get_definition(division_id, aggregation=False):
    """
    Returns the expected configuration for a given division.

    :param division_id: OCD division identifier; its last path segment
        (`type:type_id`) selects the naming rules.
    :param aggregation: when true, province/territory divisions get the
        `ca_{}_municipalities` module name and a '... Municipalities' name,
        and 'Municipalities' is appended to the class name.
    :returns: a dict with the keys `module_name`, `name`, `class_name`,
        `url` and `division_name`.
    :raises Exception: if the division's type is not one of country,
        province, territory, cd, csd or arrondissement.
    """
    # The type-name map is filled only once, the first time it is found empty.
    if not ocdid_to_type_name_map:
        # Map census division type codes to names.
        census_division_type_names = {}
        document = lxml.html.fromstring(requests.get('https://www12.statcan.gc.ca/census-recensement/2011/ref/dict/table-tableau/table-tableau-4-eng.cfm').content)
        for abbr in document.xpath('//table/tbody/tr/th[1]/abbr'):
            # Drop everything from ' /' onward in the title. Raw string: `\Z`
            # is a regex anchor, not a Python string escape.
            census_division_type_names[abbr.text_content()] = re.sub(r' /.+\Z', '', abbr.attrib['title'])

        # Map census subdivision type codes to names.
        census_subdivision_type_names = {}
        document = lxml.html.fromstring(requests.get('https://www12.statcan.gc.ca/census-recensement/2011/ref/dict/table-tableau/table-tableau-5-eng.cfm').content)
        for abbr in document.xpath('//table/tbody/tr/th[1]/abbr'):
            census_subdivision_type_names[abbr.text_content()] = re.sub(r' /.+\Z', '', abbr.attrib['title'])

        # Map OCD identifiers to census types.
        for division in Division.all('ca'):
            if division._type == 'cd':
                ocdid_to_type_name_map[division.id] = census_division_type_names[division.attrs['classification']]
            elif division._type == 'csd':
                ocdid_to_type_name_map[division.id] = census_subdivision_type_names[division.attrs['classification']]

    codes = province_and_territory_codes()
    division = Division.get(division_id)

    expected = {}
    # Initial letters that select the elided French form "d'" below.
    vowels = ('A', 'À', 'E', 'É', 'I', 'Î', 'O', 'Ô', 'U')

    sections = division_id.split('/')
    ocd_type, ocd_type_id = sections[-1].split(':')

    # Determine the module name, name and classification.
    if ocd_type == 'country':
        expected['module_name'] = 'ca'
        expected['name'] = 'Parliament of Canada'
    elif ocd_type in ('province', 'territory'):
        pattern = 'ca_{}_municipalities' if aggregation else 'ca_{}'
        expected['module_name'] = pattern.format(ocd_type_id)
        if aggregation:
            expected['name'] = '{} Municipalities'.format(division.name)
        elif ocd_type_id in ('nl', 'ns'):
            expected['name'] = '{} House of Assembly'.format(division.name)
        elif ocd_type_id == 'qc':
            expected['name'] = 'Assemblée nationale du Québec'
        else:
            expected['name'] = 'Legislative Assembly of {}'.format(division.name)
    elif ocd_type == 'cd':
        # The first two characters of the type id key the province/territory map.
        province_or_territory_type_id = codes[ocd_type_id[:2]].split(':')[-1]
        expected['module_name'] = 'ca_{}_{}'.format(province_or_territory_type_id, slug(division.name))
        name_infix = ocdid_to_type_name_map[division_id]
        if name_infix == 'Regional municipality':
            name_infix = 'Regional'
        expected['name'] = '{} {} Council'.format(division.name, name_infix)
    elif ocd_type == 'csd':
        province_or_territory_type_id = codes[ocd_type_id[:2]].split(':')[-1]
        expected['module_name'] = 'ca_{}_{}'.format(province_or_territory_type_id, slug(division.name))
        # Type ids starting with '24' get a French council name.
        if ocd_type_id[:2] == '24':
            if division.name[0] in vowels:
                expected['name'] = "Conseil municipal d'{}".format(division.name)
            else:
                expected['name'] = "Conseil municipal de {}".format(division.name)
        else:
            name_infix = ocdid_to_type_name_map[division_id]
            # Turn "... municipality" type names into adjectives for the council name.
            if name_infix in ('Municipality', 'Specialized municipality'):
                name_infix = 'Municipal'
            elif name_infix == 'District municipality':
                name_infix = 'District'
            elif name_infix == 'Regional municipality':
                name_infix = 'Regional'
            expected['name'] = '{} {} Council'.format(division.name, name_infix)
    elif ocd_type == 'arrondissement':
        # The parent is the second-to-last path segment of the identifier.
        census_subdivision_type_id = sections[-2].split(':')[-1]
        province_or_territory_type_id = codes[census_subdivision_type_id[:2]].split(':')[-1]
        expected['module_name'] = 'ca_{}_{}_{}'.format(province_or_territory_type_id, slug(Division.get('/'.join(sections[:-1])).name), slug(division.name))
        if division.name[0] in vowels:
            expected['name'] = "Conseil d'arrondissement d'{}".format(division.name)
        elif division.name[:3] == 'Le ':
            # "de Le X" contracts to "du X".
            expected['name'] = "Conseil d'arrondissement du {}".format(division.name[3:])
        else:
            expected['name'] = "Conseil d'arrondissement de {}".format(division.name)
    else:
        raise Exception('{}: Unrecognized OCD type {}'.format(division_id, ocd_type))

    # Determine the class name.
    # Strip apostrophes and periods, treat dashes like hyphens, split on spaces
    # and hyphens, then capitalize every word that does not already start with
    # an ASCII capital before joining and transliterating with unidecode.
    class_name_parts = re.split('[ -]', re.sub("[—–]", '-', re.sub("['.]", '', division.name)))
    expected['class_name'] = unidecode(text_type(''.join(word if re.match('[A-Z]', word) else word.capitalize() for word in class_name_parts)))
    if aggregation:
        expected['class_name'] += 'Municipalities'

    # Determine the url.
    expected['url'] = division.attrs['url']

    # Determine the division name.
    expected['division_name'] = division.name

    return expected
Exemplo n.º 45
0
    def scrape(self):
        """
        Yield people and organizations for the municipalities listed on LIST_PAGE.

        Every `<option>` of the `lgid` select is treated as a municipality.
        Its name is matched to a census subdivision whose type id starts with
        '59'; one organization with Mayor and Councillor posts is created per
        matched division, and each "Mayor" / "Councillor" list item found on
        the municipality's record pages is passed to `self.person_data`.
        People are yielded as they are scraped; organizations are yielded last.

        :raises KeyError: if a listed municipality name has no entry in the
            name-to-id map, or a division's classification has no infix.
        :raises Exception: if a division id would be processed twice.
        """
        exclude_districts = {
            # Existing scrapers
            'Abbotsford',
            'Burnaby',
            'Coquitlam',
            'Kelowna',
            'Langley',
            'New Westminster',
            'Richmond',
            'Saanich',
            'Surrey',
            'Vancouver',
            'Victoria',
        }
        excluded_district_types = {
            'Regional District',
            'Indian Government District',
            'Islands Trust',
            'Mountain Resort Municipality',
        }
        # Listing names that differ from the division names.
        division_corrections = {
            '100 Mile House': 'One Hundred Mile House',
        }

        processed_ids = set()
        exclude_divisions = {}
        processed_divisions = set()
        # Division classification code -> word used in the organization name.
        infixes = {
            'CY': 'City',
            'DM': 'District',
            'IGD': 'District',
            'IM': 'Municipal',
            'RGM': 'Regional',
            'T': 'Town',
            'VL': 'Village',
            'RDA': 'District',
        }
        organizations = {}
        # Create list mapping names to IDs.
        names_to_ids = {}
        for division in Division.get('ocd-division/country:ca').children(
                'csd'):
            type_id = division.id.rsplit(':', 1)[1]
            if type_id.startswith('59'):
                if division.attrs['classification'] == 'IRI':
                    continue
                # A name seen more than once is ambiguous: mark it with None
                # so it is skipped below.
                if division.name in names_to_ids:
                    names_to_ids[division.name] = None
                else:
                    names_to_ids[division.name] = division.id

        # Scrape list of municipalities.
        list_page = self.lxmlize(LIST_PAGE)
        municipalities = list_page.xpath('//select[@name="lgid"]/option')
        assert len(municipalities), 'No municipalities found'

        # Iterate through each municipality.
        for municipality in municipalities:
            municipality_text = municipality.text
            # The type is the text between the first '(' and the first ')'.
            municipal_type = municipality_text[municipality_text.find('(') +
                                               1:municipality_text.find(')')]
            if municipal_type in excluded_district_types:
                continue

            municipal_id = municipality.get('value')
            division_name = municipality_text.split(' (')[0]
            division_name = division_corrections.get(division_name,
                                                     division_name)

            # If we have a municipal ID, process that municipality.
            if municipal_id and municipal_id.strip():
                # Built only once the id is known to be a non-empty string:
                # an option without a `value` attribute yields None, which
                # cannot be concatenated.
                record_url = LIST_PAGE + '?stext=&type=ss&lgid=' + municipal_id + '&agencyid=+'

                # Get division ID from municipal name and filter out duplicates or unknowns.
                if division_name in exclude_districts or division_name in processed_divisions:
                    continue
                division_id = names_to_ids[division_name]
                # Ambiguous names were mapped to None above.
                if not isinstance(division_id, str):
                    continue
                if division_id in exclude_divisions:
                    continue
                if division_id in processed_ids:
                    raise Exception(
                        'unhandled collision: {}'.format(division_id))
                division = Division.get(division_id)
                processed_divisions.add(division_name)

                # Get division name and create org.
                division_name = division.name
                organization_name = '{} {} Council'.format(
                    division_name, infixes[division.attrs['classification']])
                if division_id not in processed_ids:
                    processed_ids.add(division_id)
                    organizations[division_id] = Organization(
                        name=organization_name, classification='government')
                    organizations[division_id].add_source(record_url)
                organization = organizations[division_id]
                organization.add_post(role='Mayor',
                                      label=division_name,
                                      division_id=division_id)
                organization.add_post(role='Councillor',
                                      label=division_name,
                                      division_id=division_id)

                # Load records for municipality.
                municipal_page = self.lxmlize(record_url)
                number_of_records_text = municipal_page.xpath(
                    '//main/h4/text()')
                number_of_records = int(
                    re.search(r'\d+', number_of_records_text[0]).group())

                # Collate mayor and councillor representatives on first page of records.
                # NOTE(review): unlike the later pages, the first page does not
                # exclude entries containing "Assistant" - confirm whether
                # that is intended.
                leader_reps = municipal_page.xpath(
                    '//main/ol/li[contains(., "Mayor")][not(contains(., "Chief"))]'
                )
                councillor_reps = municipal_page.xpath(
                    '//main/ol/li[contains(., "Councillor")]')

                # Iterate through additional pages of records if they exist, adding reps.
                if number_of_records > 10:
                    # Ten records per page; a partial last page counts as one.
                    quotient, remainder = divmod(number_of_records, 10)
                    number_of_pages = quotient + int(bool(remainder))
                    for i in range(2, number_of_pages + 1):
                        municipal_page = self.lxmlize(record_url + '&pn=' +
                                                      str(i))
                        additional_leader_reps = municipal_page.xpath(
                            '//main/ol/li[contains(., "Mayor")][not(contains(., "Chief"))][not(contains(., "Assistant"))]'
                        )
                        leader_reps.extend(additional_leader_reps)
                        additional_councillor_reps = municipal_page.xpath(
                            '//main/ol/li[contains(., "Councillor")]')
                        councillor_reps.extend(additional_councillor_reps)

                # Create person records for all mayor and councillor representatives.
                for leader_rep in leader_reps:
                    yield self.person_data(leader_rep, division_id,
                                           division_name, 'Mayor',
                                           organization_name)
                for councillor_rep in councillor_reps:
                    yield self.person_data(councillor_rep, division_id,
                                           division_name, 'Councillor',
                                           organization_name)

        # Iterate through each organization.
        for organization in organizations.values():
            yield organization
Exemplo n.º 46
0
def spreadsheet(base='.', private_base='../represent-canada-private-data'):
    """
    Validate the spreadsheet for tracking progress on data collection.

    Expected rows are built from the OCD division CSV, the census subdivision
    table and the entries that `registry()` returns for `base` and
    `private_base`; they are then compared with the rows of the published
    spreadsheet. Every discrepancy is written to standard error.

    :param base: directory whose registered data may be distributed
        (sets 'Permission to distribute' to 'Y')
    :param private_base: directory whose registered data may not be
        distributed (sets 'Permission to distribute' to 'N')
    """
    expecteds = OrderedDict()

    # Append to `municipal_subdivisions` from `constants.py`.
    for division in Division.all('ca', from_csv=ocd_division_csv):
        if division.attrs['has_children']:
            municipal_subdivisions[type_id(division.id)] = division.attrs['has_children']

    expecteds['ocd-division/country:ca'] = default_expectation.copy()
    expecteds['ocd-division/country:ca'].update({
        'OCD': 'ocd-division/country:ca',
        'Geographic name': 'Canada',
    })

    # Create expectations for provinces and territories.
    for division in Division.all('ca', from_csv=ocd_division_csv):
        if division._type in ('province', 'territory'):
            expected = default_expectation.copy()
            expected.update({
                'OCD': division.id,
                'Geographic name': division.name,
                'Province or territory': type_id(division.id).upper(),
            })

            expecteds[division.id] = expected

    # Create expectations for census subdivisions.
    reader = csv_dict_reader('http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=301&OFT=FULLCSV', 'ISO-8859-1')
    for row in reader:
        code = row['Geographic code']

        # The data rows end where the file's trailing notes begin.
        if code == 'Note:':
            break

        division = Division.get('ocd-division/country:ca/csd:%s' % code, from_csv=ocd_division_csv)

        expected = default_expectation.copy()
        expected.update({
            'OCD': division.id,
            'Geographic name': division.name,
            'Province or territory': province_or_territory_abbreviation(division.id),
            'Geographic type': division.attrs['classification'],
            'Population': row['Population, 2016'],
        })

        if code in municipal_subdivisions:
            if municipal_subdivisions[code] == 'N':
                # No subdivisions: none of the boundary-related columns apply.
                expected['Shapefile?'] = 'N/A'
                expected['Contact'] = 'N/A'
                expected['Received via'] = 'N/A'
                expected['Last boundary'] = 'N/A'
                expected['Permission to distribute'] = 'N/A'
            elif municipal_subdivisions[code] == 'Y':
                expected['Shapefile?'] = 'Request'

        expecteds[division.id] = expected

    # Merge information from received data.
    for directory, permission_to_distribute in [(base, 'Y'), (private_base, 'N')]:
        # Reset the registry before loading the definitions of each directory.
        boundaries.registry = {}
        for slug, config in registry(directory).items():
            if 'extra' in config:
                division_id = config['extra']['division_id']
                if division_id in expecteds:
                    license_path = os.path.join(dirname(config['file']), 'LICENSE.txt')
                    license_text = ''
                    if os.path.exists(license_path):
                        with open(license_path) as f:
                            license_text = f.read().rstrip('\n')

                    expected = expecteds[division_id]
                    expected['Shapefile?'] = 'Y'
                    expected['Permission to distribute'] = permission_to_distribute

                    if 'data_url' in config:
                        expected['Last boundary'] = 'N/A'
                    elif 'last_updated' in config:
                        expected['Last boundary'] = config['last_updated'].strftime('%-m/%-d/%Y')

                    if 'source_url' in config:
                        expected['Contact'] = 'N/A'
                        expected['Received via'] = 'online'
                    elif expected['Province or territory'] == 'SK' and expected['Geographic type'] == 'RM':
                        # NOTE(review): the end of this literal looks like an e-mail
                        # address mangled by obfuscation - confirm the intended value.
                        expected['Contact'] = 'Mikael Nagel\nMapping Technician\nTel: 1-306-569-2988 x205\nToll free: 1-800-663-6864\[email protected]'
                        expected['Received via'] = 'purchase'
                    else:
                        match = all_rights_reserved_terms_re.search(license_text)
                        if match:
                            expected['Contact'] = match.group(1)
                        expected['Received via'] = 'email'
                # The spreadsheet doesn't track borough boundaries.
                elif '/borough:' not in division_id:
                    sys.stderr.write('%-25s no record\n' % division_id)
            else:
                sys.stderr.write('%-25s no extra\n' % slug)

    reader = csv_dict_reader('https://docs.google.com/spreadsheets/d/1ihCIDc9EtvxF7kzPg3Yk6e928DN7JzaycH92IBYr0QU/pub?gid=25&single=true&output=csv', 'utf-8')

    # Columns that are maintained by hand and never compared.
    ignored_keys = ('Population', 'URL', 'Request notes', 'Next boundary', 'Response notes')

    actuals = set()
    for actual in reader:
        actual_id = actual['OCD']
        actuals.add(actual_id)

        # A row without an expectation can't be compared. Skip it here so that
        # it is reported by the "Remove" loop at the end, instead of raising a
        # KeyError that would make that report unreachable.
        expected = expecteds.get(actual_id)
        if expected is None:
            continue

        for key in actual.keys() - ignored_keys:
            e = expected[key]
            a = actual[key]

            # Note: Some of the following conditions are repetitive for readability.
            if e != a:
                # Change expectations for in-progress requests.
                if expected['Shapefile?'] == 'Request' and actual['Shapefile?'] == 'Requested' and (
                   # Request sent.
                   (key == 'Shapefile?' and e == 'Request' and a == 'Requested') or
                   # Contact found.
                   (key == 'Contact' and not e and a)):
                    continue

                # Some provinces and territories have missing expectations, in which case we defer to the spreadsheet.
                elif actual['Province or territory'] in ('ON', 'MB') and not expected['Shapefile?'] and (
                   # We determined that we needed to request boundaries.
                   (key == 'Shapefile?' and not e and a in ('Request', 'Requested')) or
                   # We determined that we needed to request boundaries, and did so.
                   (actual['Shapefile?'] == 'Requested' and key == 'Contact' and not e and a)):
                    continue

                # Contacts for private data are only stored in the spreadsheet.
                elif ((expected['Permission to distribute'] == 'N' and key == 'Contact' and not e and a) or
                      # MFIPPA receptions are only stored in the spreadsheet.
                      (key == 'Received via' and e == 'email' and a == 'MFIPPA')):
                    continue

                sys.stderr.write('%-25s %s: expected "%s" got "%s"\n' % (key, actual_id, e, a))

    # Expected rows that are missing from the spreadsheet.
    for division_id in expecteds.keys() - actuals:
        record = expecteds[division_id]
        if record['Shapefile?'] != 'N/A':
            sys.stderr.write('%s\t%s\t%s\t%s\t%s\n' % (division_id, record['Geographic name'], record['Province or territory'], record['Geographic type'], record['Population']))

    # Spreadsheet rows for which there is no expectation.
    for division_id in actuals - expecteds.keys():
        sys.stderr.write('Remove %s\n' % division_id)
Exemplo n.º 47
0
    def scrape(self):
        """
        Yield one Person per usable row of the COUNCIL_PAGE CSV, then every
        Organization that was created for those rows.

        Rows that are empty, vacant or belong to an excluded district are
        skipped. An exception is raised for an ambiguous district name or an
        unexpected role.
        """
        exclude_divisions = {}
        exclude_districts = {
            'Capital',
            'Capital F',
            'Capital G',
            'Capital H',
            'Central Coast B',
            'Central Okanagan East',
            'Central Okanagan West',
            'Comox Valley B',
            'Comox Valley C',
            'Islands Trust',
            'Kitimat-Stikine C',
            'Kootenay Boundary B',
            'Kootenay Boundary C',
            'Kootenay Boundary D',
            'Kootenay Boundary E',
            'Metro Vancouver A',
            'North Coast A',
            'North Coast C',
            'North Coast D',
            'North Coast E',
            'Okanagan-Similkameen I',
            'Okanagan-Similkameen Olalla Local Community Commission',
            'Qathet A',
            'Qathet B',
            'Qathet C',
            'Qathet D',
            'Qathet E',
        }
        expected_roles = {'candidate'}
        infixes = {
            'CY': 'City',
            'DM': 'District',
            'IGD': 'District',
            'IM': 'Municipal',
            'RGM': 'Regional',
            'T': 'Town',
            'VL': 'Village',
            'RDA': 'District',
        }
        duplicate_names = {'Rick Smith', 'Sung Y Wong', 'Elizabeth Taylor'}

        # Map census subdivision names to identifiers; a name shared by several
        # subdivisions maps to None so that it can be detected below.
        names_to_ids = {}
        for csd in Division.get('ocd-division/country:ca').children('csd'):
            csd_code = csd.id.rsplit(':', 1)[1]
            if not csd_code.startswith('59') or csd.attrs['classification'] == 'IRI':
                continue
            names_to_ids[csd.name] = None if csd.name in names_to_ids else csd.id

        reader = self.csv_reader(COUNCIL_PAGE, header=True)
        reader.fieldnames = [fieldname.lower() for fieldname in reader.fieldnames]

        # Organizations keyed by division identifier, in creation order.
        organizations = {}

        # Placeholder birth year used to tell apart people with the same name.
        birth_date = 1900

        rows = list(reader)
        assert rows, 'No councillors found'
        for row in rows:
            name = row['full name']
            district_name = row['district name']

            if not any(row.values()):
                continue
            if name.lower() in ('', 'vacant'):
                continue
            if district_name in exclude_districts:
                continue

            district_id = row['district id']
            if district_id:
                division_id = 'ocd-division/country:ca/csd:{}'.format(district_id)
            else:
                division_id = names_to_ids[district_name]

            if division_id in exclude_divisions:
                continue
            if not division_id:
                raise Exception('unhandled collision: {}'.format(district_name))

            division = Division.get(division_id)
            division_name = division.name
            infix = infixes[division.attrs['classification']]
            organization_name = '{} {} Council'.format(division_name, infix)

            # Create the organization the first time its division is met.
            if division_id not in organizations:
                created = Organization(name=organization_name, classification='government')
                organizations[division_id] = created
                created.add_source(COUNCIL_PAGE)
            organization = organizations[division_id]

            role = row['primary role']
            if role not in expected_roles:
                raise Exception('unexpected role: {}'.format(role))

            # Rows with an explicit identifier are labelled by the division
            # identifier, the others by the division name.
            district = format(division_id) if district_id else division_name

            organization.add_post(role=role, label=district, division_id=division_id)

            person = Person(primary_org='government', primary_org_name=organization_name, name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            source_url = row['source url']
            if source_url:
                person.add_source(source_url)

            if name in duplicate_names:
                person.birth_date = str(birth_date)
                birth_date += 1

            email = row['email']
            if email:
                person.add_contact('email', email)

            phone = row['phone']
            if phone:
                person.add_contact('voice', phone, 'legislature')

            twitter = row['twitter']
            if twitter:
                person.add_link(twitter)

            csd_code = division_id.rsplit(':', 1)[1]
            person._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(csd_code)

            yield person

        yield from organizations.values()
Exemplo n.º 48
0
def test_get():
    """Check that Division.get loads a division whose string form contains its name."""
    division = Division.get("ocd-division/country:de/state:by/cd:248")
    name = division.name
    assert name == "Bad Kissingen"
    assert division.name in str(division)
Exemplo n.º 49
0
    def scrape(self):
        """
        Yield a Person for every mayor and councillor listed for each
        municipality in LIST_PAGE's `lgid` drop-down, then the Organizations
        created along the way.

        A municipality is skipped when its type is in
        `excluded_district_types`, its option has no usable `value`, its name
        is in `exclude_districts` or was already processed, or its name is
        shared by several census subdivisions.

        :raises Exception: if two municipalities resolve to the same division
        """
        exclude_districts = {
            # Existing scrapers
            'Abbotsford',
            'Burnaby',
            'Coquitlam',
            'Kelowna',
            'Langley',
            'New Westminster',
            'Richmond',
            'Saanich',
            'Surrey',
            'Vancouver',
            'Victoria',
        }
        excluded_district_types = {
            'Regional District',
            'Indian Government District',
            'Islands Trust',
            'Mountain Resort Municipality',
        }
        division_corrections = {
            '100 Mile House': 'One Hundred Mile House',
        }

        processed_ids = set()
        exclude_divisions = {}
        processed_divisions = set()
        infixes = {
            'CY': 'City',
            'DM': 'District',
            'IGD': 'District',
            'IM': 'Municipal',
            'RGM': 'Regional',
            'T': 'Town',
            'VL': 'Village',
            'RDA': 'District',
        }
        organizations = {}

        # Map names to IDs. A name shared by several divisions maps to None.
        names_to_ids = {}
        for division in Division.get('ocd-division/country:ca').children('csd'):
            type_id = division.id.rsplit(':', 1)[1]
            if type_id.startswith('59'):
                if division.attrs['classification'] == 'IRI':
                    continue
                if division.name in names_to_ids:
                    names_to_ids[division.name] = None
                else:
                    names_to_ids[division.name] = division.id

        # NOTE(review): only the pages after the first exclude "Assistant"
        # entries from the mayors; confirm whether the first page should too.
        first_page_leader_xpath = '//main/ol/li[contains(., "Mayor")][not(contains(., "Chief"))]'
        next_page_leader_xpath = '//main/ol/li[contains(., "Mayor")][not(contains(., "Chief"))][not(contains(., "Assistant"))]'
        councillor_xpath = '//main/ol/li[contains(., "Councillor")]'

        # Scrape list of municipalities.
        list_page = self.lxmlize(LIST_PAGE)
        municipalities = list_page.xpath('//select[@name="lgid"]/option')
        assert len(municipalities), 'No municipalities found'

        # Iterate through each municipality.
        for municipality in municipalities:
            # An option without text has no type or name to extract.
            municipality_text = municipality.text or ''
            municipal_type = municipality_text[municipality_text.find('(') + 1:municipality_text.find(')')]
            if municipal_type in excluded_district_types:
                continue

            # Only options with a municipal ID can be queried. This check must
            # come before the ID is concatenated into the record URL, because a
            # missing `value` attribute yields None.
            municipal_id = municipality.get('value')
            if not municipal_id or not municipal_id.strip():
                continue

            division_name = municipality_text.split(' (')[0]
            division_name = division_corrections.get(division_name, division_name)

            # Get division ID from municipal name and filter out duplicates.
            if division_name in exclude_districts or division_name in processed_divisions:
                continue
            division_id = names_to_ids[division_name]
            if not isinstance(division_id, str):
                continue
            if division_id in exclude_divisions:
                continue
            if division_id in processed_ids:
                raise Exception('unhandled collision: {}'.format(division_id))
            division = Division.get(division_id)
            processed_divisions.add(division_name)
            processed_ids.add(division_id)

            record_url = LIST_PAGE + '?stext=&type=ss&lgid=' + municipal_id + '&agencyid=+'

            # Get division name and create org. The collision check above
            # guarantees that this division has no organization yet.
            division_name = division.name
            organization_name = '{} {} Council'.format(division_name, infixes[division.attrs['classification']])
            organization = Organization(name=organization_name, classification='government')
            organization.add_source(record_url)
            organizations[division_id] = organization
            organization.add_post(role='Mayor', label=division_name, division_id=division_id)
            organization.add_post(role='Councillor', label=division_name, division_id=division_id)

            # Load records for municipality.
            municipal_page = self.lxmlize(record_url)
            number_of_records_text = municipal_page.xpath('//main/h4/text()')
            number_of_records = int(re.search(r'\d+', number_of_records_text[0]).group())

            # Collate mayor and councillor representatives on first page of records.
            leader_reps = municipal_page.xpath(first_page_leader_xpath)
            councillor_reps = municipal_page.xpath(councillor_xpath)

            # Iterate through additional pages of records if they exist, adding
            # reps. Pages hold 10 records each; the range is empty when there
            # are 10 records or fewer.
            quotient, remainder = divmod(number_of_records, 10)
            number_of_pages = quotient + int(bool(remainder))
            for page_number in range(2, number_of_pages + 1):
                municipal_page = self.lxmlize(record_url + '&pn=' + str(page_number))
                leader_reps.extend(municipal_page.xpath(next_page_leader_xpath))
                councillor_reps.extend(municipal_page.xpath(councillor_xpath))

            # Create person records for all mayor and councillor representatives.
            for leader_rep in leader_reps:
                yield self.person_data(leader_rep, division_id, division_name, 'Mayor', organization_name)
            for councillor_rep in councillor_reps:
                yield self.person_data(councillor_rep, division_id, division_name, 'Councillor', organization_name)

        # Iterate through each organization.
        for organization in organizations.values():
            yield organization
Exemplo n.º 50
0
def spreadsheet(base='.', private_base='../represent-canada-private-data'):
    """
    Validate the spreadsheet for tracking progress on data collection.

    Expected rows are built from the OCD division CSV, the census subdivision
    table and the entries that `registry()` returns for `base` and
    `private_base`; they are then compared with the rows of the published
    spreadsheet. Every discrepancy is written to standard error.

    :param base: directory whose registered data may be distributed
        (sets 'Permission to distribute' to 'Y')
    :param private_base: directory whose registered data may not be
        distributed (sets 'Permission to distribute' to 'N')
    """
    expecteds = OrderedDict()

    # Append to `municipal_subdivisions` from `constants.py`.
    for division in Division.all('ca', from_csv=ocd_division_csv):
        if division.attrs['has_children']:
            municipal_subdivisions[type_id(division.id)] = division.attrs['has_children']

    expecteds['ocd-division/country:ca'] = default_expectation.copy()
    expecteds['ocd-division/country:ca'].update({
        'OCD': 'ocd-division/country:ca',
        'Geographic name': 'Canada',
    })

    # Create expectations for provinces and territories.
    for division in Division.all('ca', from_csv=ocd_division_csv):
        if division._type in ('province', 'territory'):
            expected = default_expectation.copy()
            expected.update({
                'OCD': division.id,
                'Geographic name': division.name,
                'Province or territory': type_id(division.id).upper(),
            })

            expecteds[division.id] = expected

    # Create expectations for census subdivisions.
    reader = csv_dict_reader(
        'http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=301&OFT=FULLCSV',
        'ISO-8859-1')
    for row in reader:
        code = row['Geographic code']

        # The data rows end where the file's trailing notes begin.
        if code == 'Note:':
            break

        division = Division.get('ocd-division/country:ca/csd:%s' % code,
                                from_csv=ocd_division_csv)

        expected = default_expectation.copy()
        expected.update({
            'OCD': division.id,
            'Geographic name': division.name,
            'Province or territory': province_or_territory_abbreviation(division.id),
            'Geographic type': division.attrs['classification'],
            'Population': row['Population, 2016'],
        })

        if code in municipal_subdivisions:
            if municipal_subdivisions[code] == 'N':
                # No subdivisions: none of the boundary-related columns apply.
                expected['Shapefile?'] = 'N/A'
                expected['Contact'] = 'N/A'
                expected['Received via'] = 'N/A'
                expected['Last boundary'] = 'N/A'
                expected['Permission to distribute'] = 'N/A'
            elif municipal_subdivisions[code] == 'Y':
                expected['Shapefile?'] = 'Request'

        expecteds[division.id] = expected

    # Merge information from received data.
    for directory, permission_to_distribute in [(base, 'Y'),
                                                (private_base, 'N')]:
        # Reset the registry before loading the definitions of each directory.
        boundaries.registry = {}
        for slug, config in registry(directory).items():
            if 'extra' in config:
                division_id = config['extra']['division_id']
                if division_id in expecteds:
                    license_path = os.path.join(dirname(config['file']),
                                                'LICENSE.txt')
                    license_text = ''
                    if os.path.exists(license_path):
                        with open(license_path) as f:
                            license_text = f.read().rstrip('\n')

                    expected = expecteds[division_id]
                    expected['Shapefile?'] = 'Y'
                    expected['Permission to distribute'] = permission_to_distribute

                    if 'data_url' in config:
                        expected['Last boundary'] = 'N/A'
                    elif 'last_updated' in config:
                        expected['Last boundary'] = config['last_updated'].strftime('%-m/%-d/%Y')

                    if 'source_url' in config:
                        expected['Contact'] = 'N/A'
                        expected['Received via'] = 'online'
                    elif expected['Province or territory'] == 'SK' and expected['Geographic type'] == 'RM':
                        # NOTE(review): the end of this literal looks like an e-mail
                        # address mangled by obfuscation - confirm the intended value.
                        expected['Contact'] = 'Mikael Nagel\nMapping Technician\nTel: 1-306-569-2988 x205\nToll free: 1-800-663-6864\[email protected]'
                        expected['Received via'] = 'purchase'
                    else:
                        match = all_rights_reserved_terms_re.search(license_text)
                        if match:
                            expected['Contact'] = match.group(1)
                        expected['Received via'] = 'email'
                # The spreadsheet doesn't track borough boundaries.
                elif '/borough:' not in division_id:
                    sys.stderr.write('%-25s no record\n' % division_id)
            else:
                sys.stderr.write('%-25s no extra\n' % slug)

    reader = csv_dict_reader(
        'https://docs.google.com/spreadsheets/d/1ihCIDc9EtvxF7kzPg3Yk6e928DN7JzaycH92IBYr0QU/pub?gid=25&single=true&output=csv',
        'utf-8')

    # Columns that are maintained by hand and never compared.
    ignored_keys = ('Population', 'URL', 'Request notes', 'Next boundary',
                    'Response notes')

    actuals = set()
    for actual in reader:
        actual_id = actual['OCD']
        actuals.add(actual_id)

        # A row without an expectation can't be compared. Skip it here so that
        # it is reported by the "Remove" loop at the end, instead of raising a
        # KeyError that would make that report unreachable.
        expected = expecteds.get(actual_id)
        if expected is None:
            continue

        for key in actual.keys() - ignored_keys:
            e = expected[key]
            a = actual[key]

            # Note: Some of the following conditions are repetitive for readability.
            if e != a:
                # Change expectations for in-progress requests.
                if expected['Shapefile?'] == 'Request' and actual['Shapefile?'] == 'Requested' and (
                   # Request sent.
                   (key == 'Shapefile?' and e == 'Request' and a == 'Requested') or
                   # Contact found.
                   (key == 'Contact' and not e and a)):
                    continue

                # Some provinces and territories have missing expectations, in which case we defer to the spreadsheet.
                elif actual['Province or territory'] in ('ON', 'MB') and not expected['Shapefile?'] and (
                   # We determined that we needed to request boundaries.
                   (key == 'Shapefile?' and not e and a in ('Request', 'Requested')) or
                   # We determined that we needed to request boundaries, and did so.
                   (actual['Shapefile?'] == 'Requested' and key == 'Contact' and not e and a)):
                    continue

                # Contacts for private data are only stored in the spreadsheet.
                elif ((expected['Permission to distribute'] == 'N' and key == 'Contact' and not e and a) or
                      # MFIPPA receptions are only stored in the spreadsheet.
                      (key == 'Received via' and e == 'email' and a == 'MFIPPA')):
                    continue

                sys.stderr.write('%-25s %s: expected "%s" got "%s"\n' %
                                 (key, actual_id, e, a))

    # Expected rows that are missing from the spreadsheet.
    for division_id in expecteds.keys() - actuals:
        record = expecteds[division_id]
        if record['Shapefile?'] != 'N/A':
            sys.stderr.write('%s\t%s\t%s\t%s\t%s\n' %
                             (division_id, record['Geographic name'],
                              record['Province or territory'],
                              record['Geographic type'], record['Population']))

    # Spreadsheet rows for which there is no expectation.
    for division_id in actuals - expecteds.keys():
        sys.stderr.write('Remove %s\n' % division_id)
Exemplo n.º 51
0
    def scrape(self):
        """
        Yield a Person for each mayor and councillor found on every
        municipality detail page reachable from COUNCIL_PAGE, and one
        Organization per municipality.

        Municipalities in `unknown_names` or `exclude_divisions` are skipped,
        as are seats whose name contains "vacant".

        :raises Exception: if two census subdivisions share a name, if a
            municipality resolves to a division that was already seen, or if a
            group on a detail page has an unexpected role
        """
        exclude_divisions = {
            'ocd-division/country:ca/csd:1301006',  # Saint John
            'ocd-division/country:ca/csd:1307022',  # Moncton
            'ocd-division/country:ca/csd:1310032',  # Fredericton
        }
        expected_roles = {
            'Mayor',
            'Councillor',
        }
        unique_roles = {
            'Mayor',
        }
        classifications = {
            'Cities': 'City',
            'Towns': 'Town',
            'Villages': 'Village',
            'Rural Communities': 'Community',
            'Regional Municipality': 'Regional',
        }
        corrections = {
            'Beaubassin-est/East': 'Beaubassin East',
            'Lac-Baker': 'Lac Baker',
            'Saint-François-de-Madawaska': 'Saint-François de Madawaska',
            'Saint-Hilaire': 'Saint Hilaire',
        }
        unknown_names = {
            'Haut-Madawaska',  # incorporated after Census 2016
        }
        duplicate_names = {
            'Denis Savoie',
            'Josée Levesque',
            'Luc Levesque',
        }

        # Map census subdivision names to identifiers.
        names_to_ids = {}
        for division in Division.get('ocd-division/country:ca').children('csd'):
            type_id = division.id.rsplit(':', 1)[1]
            if type_id.startswith('13'):
                if division.attrs['classification'] == 'P':
                    continue
                if division.name in names_to_ids:
                    raise Exception('unhandled collision: {}'.format(division.name))
                else:
                    names_to_ids[division.name] = division.id

        index_page = self.lxmlize(COUNCIL_PAGE)
        list_links = index_page.xpath('//div[@id="sidebar"]//div[contains(@class, "list")][1]//a')

        # Placeholder birth year used to tell apart people with the same name.
        birth_date = 1900
        seen = set()

        assert len(list_links), 'No list items found'
        for list_link in list_links:
            list_page = self.lxmlize(list_link.attrib['href'])
            detail_urls = list_page.xpath('//td[1]//@href')

            assert len(detail_urls), 'No municipalities found'
            for detail_url in detail_urls:
                page = self.lxmlize(detail_url, encoding='utf-8')
                # The heading is split on ' - ' and its second part is used as
                # the name, with a leading "St" or "St." expanded to "Saint".
                division_name = re.sub(r'\ASt\b\.?', 'Saint', page.xpath('//h1/text()')[0].split(' - ', 1)[1])
                division_name = corrections.get(division_name, division_name)

                if division_name in unknown_names:
                    continue
                division_id = names_to_ids[division_name]
                if division_id in exclude_divisions:
                    continue
                if division_id in seen:
                    raise Exception('unhandled collision: {}'.format(division_id))

                seen.add(division_id)
                division_name = Division.get(division_id).name
                organization_name = '{} {} Council'.format(division_name, classifications[list_link.text])
                organization = Organization(name=organization_name, classification='government')
                organization.add_source(detail_url)

                address = ', '.join(page.xpath('//div[@class="left_contents"]/p[1]/text()'))

                contacts = page.xpath('//div[@class="left_contents"]/p[contains(., "Contact")]/text()')
                phone = contacts[0].split(':')[1]
                # Reset the fax for every municipality: without a second contact
                # line there is no fax, and the previous municipality's value
                # must not be reused.
                fax = contacts[1].split(':')[1] if len(contacts) > 1 else None
                email = self.get_email(page, '//div[@class="left_contents"]', error=False)

                url = page.xpath('//div[@class="left_contents"]//@href[not(contains(., "mailto:"))]')
                if url:
                    url = url[0]

                groups = page.xpath('//div[contains(@class, "right_contents")]/p')
                assert len(groups), 'No groups found'
                for group in groups:
                    # The group heading is plural (e.g. "Councillors").
                    role = group.xpath('./b/text()')[0].rstrip('s')
                    if role not in expected_roles:
                        raise Exception('unexpected role: {}'.format(role))

                    councillors = group.xpath('./text()')
                    assert len(councillors), 'No councillors found'
                    # Vacant seats are skipped but still consume a seat number.
                    for seat_number, name in enumerate(councillors, 1):
                        if 'vacant' in name.lower():
                            continue

                        if role in unique_roles:
                            district = division_name
                        else:
                            district = '{} (seat {})'.format(division_name, seat_number)

                        organization.add_post(role=role, label=district, division_id=division_id)

                        person = Person(primary_org='government', primary_org_name=organization_name, name=name, district=district, role=role)
                        person.add_source(COUNCIL_PAGE)
                        person.add_source(list_link.attrib['href'])
                        person.add_source(detail_url)

                        if name in duplicate_names:
                            person.birth_date = str(birth_date)
                            birth_date += 1

                        person.add_contact('address', address, 'legislature')
                        # @see https://en.wikipedia.org/wiki/Area_code_506
                        if phone:
                            person.add_contact('voice', phone, 'legislature', area_code=506)
                        if fax:
                            person.add_contact('fax', fax, 'legislature', area_code=506)
                        if email:
                            person.add_contact('email', email)
                        if url:
                            person.add_link(url)

                        person._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(division_id.rsplit(':', 1)[1])

                        yield person

                yield organization
Exemplo n.º 52
0
def _census_type_names(url):
    """Scrape a type-code table page and return a code -> name mapping.

    Every ``<abbr>`` found in the first header cell of a table body row
    contributes one entry: its text content is the key and its ``title``
    attribute, with any trailing `` / ...`` portion removed, is the value.
    """
    document = lxml.html.fromstring(requests.get(url).content)
    return {
        abbr.text_content(): re.sub(r' ?/.+\Z', '', abbr.attrib['title'])
        for abbr in document.xpath('//table/tbody/tr/th[1]/abbr')
    }


def get_definition(division_id, aggregation=False):
    """Return the expected jurisdiction definition for an OCD division.

    On the first call (while the module-level ``ocdid_to_type_name_map`` is
    empty) the census division and census subdivision type tables are
    downloaded and used to fill that map for every ``cd`` and ``csd``
    division returned by ``Division.all('ca')``.

    :param division_id: an OCD division identifier; its last ``/``-separated
        section must have the form ``type:id``.
    :param aggregation: if true, a province or territory is described as an
        aggregation of its municipalities, and ``Municipalities`` is appended
        to the class name.
    :returns: a dict with the keys ``module_name``, ``name``, ``class_name``,
        ``url`` and ``division_name``. For a ``country`` division the
        ``module_name`` and ``name`` are fixed strings.
    :raises Exception: if the division's type is not one of ``country``,
        ``province``, ``territory``, ``cd``, ``csd`` or ``arrondissement``.
    """
    if not ocdid_to_type_name_map:
        # Map census division and census subdivision type codes to names.
        type_names = {
            'cd': _census_type_names('https://www12.statcan.gc.ca/census-recensement/2011/ref/dict/table-tableau/table-tableau-4-eng.cfm'),
            'csd': _census_type_names('https://www12.statcan.gc.ca/census-recensement/2011/ref/dict/table-tableau/table-tableau-5-eng.cfm'),
        }

        # Map OCD identifiers to census types.
        for candidate in Division.all('ca'):
            if candidate._type in type_names:
                classification = candidate.attrs['classification']
                ocdid_to_type_name_map[candidate.id] = type_names[candidate._type][classification]

    codes = province_and_territory_codes()
    division = Division.get(division_id)

    expected = {}
    vowels = ('A', 'À', 'E', 'É', 'I', 'Î', 'O', 'Ô', 'U')

    sections = division_id.split('/')
    ocd_type, ocd_type_id = sections[-1].split(':')

    # Determine the module name, name and classification.
    if ocd_type == 'country':
        expected['module_name'] = 'ca'
        expected['name'] = 'Parliament of Canada'
    elif ocd_type in ('province', 'territory'):
        pattern = 'ca_{}_municipalities' if aggregation else 'ca_{}'
        expected['module_name'] = pattern.format(ocd_type_id)
        if aggregation:
            expected['name'] = '{} Municipalities'.format(division.name)
        elif ocd_type_id in ('nl', 'ns'):
            expected['name'] = '{} House of Assembly'.format(division.name)
        elif ocd_type_id == 'qc':
            expected['name'] = 'Assemblée nationale du Québec'
        else:
            expected['name'] = 'Legislative Assembly of {}'.format(division.name)
    elif ocd_type == 'cd':
        # The first two characters of the type ID are the key into `codes`.
        province_or_territory_type_id = codes[ocd_type_id[:2]].split(':')[-1]
        expected['module_name'] = 'ca_{}_{}'.format(province_or_territory_type_id, slug(division.name))
        infix_overrides = {'Regional municipality': 'Regional'}
        name_infix = ocdid_to_type_name_map[division_id]
        name_infix = infix_overrides.get(name_infix, name_infix)
        expected['name'] = '{} {} Council'.format(division.name, name_infix)
    elif ocd_type == 'csd':
        province_or_territory_type_id = codes[ocd_type_id[:2]].split(':')[-1]
        expected['module_name'] = 'ca_{}_{}'.format(province_or_territory_type_id, slug(division.name))
        if ocd_type_id[:2] == '24':
            # IDs starting with 24 get a French name, eliding "de" before a vowel.
            if division.name[0] in vowels:
                expected['name'] = "Conseil municipal d'{}".format(division.name)
            else:
                expected['name'] = "Conseil municipal de {}".format(division.name)
        else:
            infix_overrides = {
                'Municipality': 'Municipal',
                'Specialized municipality': 'Municipal',
                'District municipality': 'District',
                'Regional municipality': 'Regional',
            }
            name_infix = ocdid_to_type_name_map[division_id]
            name_infix = infix_overrides.get(name_infix, name_infix)
            expected['name'] = '{} {} Council'.format(division.name, name_infix)
    elif ocd_type == 'arrondissement':
        # The parent section's ID supplies the two-character key into `codes`.
        census_subdivision_type_id = sections[-2].split(':')[-1]
        province_or_territory_type_id = codes[census_subdivision_type_id[:2]].split(':')[-1]
        parent_name = Division.get('/'.join(sections[:-1])).name
        expected['module_name'] = 'ca_{}_{}_{}'.format(province_or_territory_type_id, slug(parent_name), slug(division.name))
        if division.name[0] in vowels:
            expected['name'] = "Conseil d'arrondissement d'{}".format(division.name)
        elif division.name[:3] == 'Le ':
            # "de" + "Le" contracts to "du"; the article is dropped from the name.
            expected['name'] = "Conseil d'arrondissement du {}".format(division.name[3:])
        else:
            expected['name'] = "Conseil d'arrondissement de {}".format(division.name)
    else:
        raise Exception('{}: Unrecognized OCD type {}'.format(division_id, ocd_type))

    # Determine the class name: drop apostrophes and periods, treat dashes as
    # hyphens, split on spaces and hyphens, then capitalize any word that does
    # not already start with an ASCII capital before joining.
    without_punctuation = re.sub("['.]", '', division.name)
    class_name_parts = re.split('[ -]', re.sub("[—–]", '-', without_punctuation))
    class_name = ''.join(
        word if re.match('[A-Z]', word) else word.capitalize()
        for word in class_name_parts
    )
    expected['class_name'] = unidecode(text_type(class_name))
    if aggregation:
        expected['class_name'] += 'Municipalities'

    # Determine the url.
    expected['url'] = division.attrs['url']

    # Determine the division name.
    expected['division_name'] = division.name

    return expected
Exemplo n.º 53
0
    def handle(self, *args, **options):
        """Regenerate ``mappings.py`` from the Represent boundary sets.

        Downloads the list of boundary sets from represent.opennorth.ca and,
        for each one except the census division and census subdivision sets,
        works out the OCD identifier prefix of its boundaries and which
        boundary field (or generated helper function) yields the rest of the
        identifier. The result is written as an ``IMAGO_BOUNDARY_MAPPINGS``
        dict to ``mappings.py`` in ``settings.BASE_DIR``.

        Boundary sets whose division cannot be determined are logged and
        left out of the generated file.
        """
        mappings = {}

        divisions = list(Division.all('ca'))  # cache all divisions
        for obj in requests.get('https://represent.opennorth.ca/boundary-sets/?limit=0').json()['objects']:
            boundary_set_slug = obj['url'].split('/')[2]
            if obj['url'] in ('/boundary-sets/census-divisions/', '/boundary-sets/census-subdivisions/'):
                continue
            if obj['url'] in ('/boundary-sets/federal-electoral-districts/', '/boundary-sets/federal-electoral-districts-next-election/'):
                prefix = 'ocd-division/country:ca/ed:'
                boundary_key = 'external_id'
            else:
                url = 'https://represent.opennorth.ca{}'.format(obj['url'])
                extra = requests.get(url).json()['extra']

                if extra.get('ocd_division'):
                    division_id = extra['ocd_division']
                elif extra.get('geographic_code'):
                    geographic_code = extra['geographic_code']
                    geographic_code_length = len(geographic_code)
                    if geographic_code_length == 7:
                        division_id = 'ocd-division/country:ca/csd:{}'.format(geographic_code)
                    elif geographic_code_length == 4:
                        division_id = 'ocd-division/country:ca/cd:{}'.format(geographic_code)
                    elif geographic_code_length == 2:
                        matching_division = next((division for division in divisions if division.attrs['sgc'] == geographic_code), None)
                        if matching_division is None:
                            # Avoid an AttributeError on ``None.id``.
                            log.error('No division with geographic_code {}'.format(geographic_code))
                            continue
                        division_id = matching_division.id
                    else:
                        log.error('Unrecognized geographic_code {}'.format(geographic_code))
                        continue
                else:
                    # Without this branch, division_id would be unbound on the
                    # first iteration or left over from the previous one.
                    log.error('No ocd_division or geographic_code for {}'.format(url))
                    continue

                try:
                    division = Division.get(division_id)
                    if division._type == 'borough':
                        division = division.parent
                        division_id = division.id
                        exclude = ('place', 'borough')
                    elif 'boroughs' in obj['name']:
                        exclude = ('place', 'district')
                    elif 'districts' in obj['name']:
                        exclude = ('place', 'borough')
                    else:
                        exclude = ('place',)

                    subtypes = set(child._type for child in division.children() if child._type not in exclude)
                    if len(subtypes) == 0:
                        log.warning('No subtypes for {}'.format(division_id))
                        continue
                    elif len(subtypes) > 1:
                        log.warning('>1 subtypes for {}: {}'.format(division_id, list(subtypes)))
                        continue
                    else:
                        prefix = '{}/{}:'.format(division_id, subtypes.pop())

                    # Default to the boundary's external_id; switch to a
                    # generated helper as soon as one child identifier does
                    # not match numeric_re (defined elsewhere in this module).
                    boundary_key = 'external_id'
                    for child in division.children():
                        if child._type not in exclude:
                            type_id = child.id.rsplit(':', 1)[1]
                            if not numeric_re.search(type_id):
                                if len(type_id) in (1, 3):  # Lunenburg 1-letter identifiers, BC uses 3-letter identifiers
                                    boundary_key = 'lower'
                                else:
                                    boundary_key = 'matcher'
                                break
                except KeyError:
                    log.warning('No division for {}'.format(url))
                    # Skip the set: prefix and boundary_key are either unbound
                    # or belong to a previously processed boundary set.
                    continue

            mappings[boundary_set_slug] = {
                'key': 'id',
                'prefix': prefix,
                'boundary_key': boundary_key,
            }

        with open(os.path.join(settings.BASE_DIR, 'mappings.py'), 'w') as f:
            f.write("# DO NOT EDIT THIS AUTO-GENERATED FILE\n")
            f.write("import re\n\n")

            f.write("from django.template.defaultfilters import slugify\n\n")

            f.write("leading_zero_re = re.compile(r'^0+')\n")
            # The backslash is doubled here so the generated file contains \d.
            f.write("invalid_re = re.compile(r'[^a-z\\d._~-]')\n")
            f.write("leading_district_re = re.compile(r'^District ')\n")
            f.write("lower = lambda boundary: boundary['external_id'].lower()\n\n")
            f.write("matcher = lambda boundary: leading_district_re.sub('', leading_zero_re.sub('', invalid_re.sub('~', boundary['name'].lower().replace(' ', '_'))))\n\n")

            f.write('IMAGO_BOUNDARY_MAPPINGS = {\n')
            for mapping_slug, data in sorted(mappings.items()):
                f.write("    '{}': {{\n".format(mapping_slug))
                for key, value in sorted(data.items()):
                    if key == 'boundary_key' and value in ('lower', 'matcher'):
                        # Written unquoted so it refers to the generated function.
                        f.write("        '{}': {},\n".format(key, value))
                    else:
                        f.write("        '{}': '{}',\n".format(key, value))
                f.write("    },\n")
            f.write('}\n')