예제 #1
0
  def handle(self, *args, **options):
    """Export representative data as CSV files and upload them to S3.

    For each report with an empty ``exception`` (skipping modules whose names
    end in ``_candidates`` or ``_municipalities``), import the scraper
    module, build one row per non-party, non-candidate membership of each
    jurisdiction the module declares, and upload ``csv/<slug>.csv``. Every
    row is also collected, prefixed with the organization name, and uploaded
    as ``csv/complete.csv``. A report is deleted when an ``ImportError`` is
    raised while it is being processed.
    """
    def save(key, body):
      """Store ``body`` in the bucket under ``key`` with a public-read ACL."""
      # `bucket` is looked up when save() runs; it is assigned further down
      # in handle(), before the first call.
      k = Key(bucket)
      k.key = key
      k.set_contents_from_string(body)
      k.set_acl('public-read')

    # Make the scraper modules importable by their report module name.
    sys.path.append(os.path.abspath('scrapers'))

    # Database location comes from MONGOHQ_URL, with a local default. The
    # third argument is the URL path without its leading slash (presumably
    # the database name -- confirm against _configure_db).
    url = os.getenv('MONGOHQ_URL', 'mongodb://localhost:27017/pupa')
    parsed = urlsplit(url)
    _configure_db(url, parsed.port, parsed.path[1:])

    bucket = S3Connection().get_bucket('represent.opennorth.ca')

    # File-name slugs for organizations that should not use slugify(name).
    names = {
      'Parliament of Canada': 'house-of-commons',
      'Legislative Assembly of Alberta': 'alberta-legislature',
      'Legislative Assembly of British Columbia': 'bc-legislature',
      'Legislative Assembly of Manitoba': 'manitoba-legislature',
      'Legislative Assembly of New Brunswick': 'new-brunswick-legislature',
      'Newfoundland and Labrador House of Assembly': 'newfoundland-labrador-legislature',
      'Nova Scotia House of Assembly': 'nova-scotia-legislature',
      'Legislative Assembly of Ontario': 'ontario-legislature',
      'Legislative Assembly of Prince Edward Island': 'pei-legislature',
      'Assemblée nationale du Québec': 'quebec-assemblee-nationale',
      'Legislative Assembly of Saskatchewan': 'saskatchewan-legislature',
    }

    # Fixed columns; their order must match the `row` list built below.
    default_headers = [
      'District name',
      'Elected office',
      'Name',
      'First name',
      'Last name',
      'Gender',
      'Party name',
      'Email',
      'URL',
      'Photo URL',
      'Personal URL',
    ]
    # Columns repeated once per office, matching the keys read from each
    # office below ('type', 'postal', 'tel', 'fax').
    office_headers = [
      'Office type',
      'Address',
      'Phone',
      'Fax',
    ]

    # Rows for the combined file, and the widest office count seen overall.
    all_rows = []
    max_offices_count = 0

    reports = Report.objects.filter(exception='').exclude(module__endswith='_candidates').exclude(module__endswith='_municipalities').order_by('module')
    for report in reports:
      try:
        module = importlib.import_module(report.module)
        # Scan the module namespace for objects exposing a truthy
        # `jurisdiction_id`. There is no `break`, so every such object
        # produces its own CSV.
        for obj in module.__dict__.values():
          jurisdiction_id = getattr(obj, 'jurisdiction_id', None)
          if jurisdiction_id:  # This object declares a jurisdiction.
            name = getattr(obj, 'name', None)

            rows = []
            offices_count = 0

            # Skip party memberships ('member') and candidacies ('candidate').
            # NOTE(review): `db` is not defined in this method; presumably a
            # module-level handle made usable by _configure_db -- confirm.
            for membership in db.memberships.find({'jurisdiction_id': jurisdiction_id, 'role': {'$nin': ['member', 'candidate']}}):
              # NOTE(review): `organization` is fetched but not used below.
              organization = db.organizations.find_one({'_id': membership['organization_id']})
              person = db.people.find_one({'_id': membership['person_id']})

              # The party is the organization of the person's 'member'
              # membership in the same jurisdiction, if there is one.
              party_membership = db.memberships.find_one({'jurisdiction_id': jurisdiction_id, 'role': 'member', 'person_id': membership['person_id']})
              if party_membership:
                party_name = db.organizations.find_one({'_id': party_membership['organization_id']})['name']
              else:
                party_name = None

              # Reduce gender to a one-letter code; anything else is blank.
              if person['gender'] == 'male':
                gender = 'M'
              elif person['gender'] == 'female':
                gender = 'F'
              else:
                gender = None

              # Split on the last space: the final word is the last name and
              # everything before it is the first name.
              if ' ' in person['name']:
                first_name, last_name = person['name'].rsplit(' ', 1)
              else:
                first_name, last_name = None, person['name']

              # @see http://represent.opennorth.ca/api/#fields
              row = [
                person['post_id'], # District name
                membership['role'], # Elected office
                person['name'], # Name
                first_name, # First name
                last_name, # Last name
                gender, # Gender
                party_name, # Party name
                next((contact_detail['value'] for contact_detail in membership['contact_details'] if contact_detail['type'] == 'email'), None), # Email (first match)
                person['sources'][-1]['url'] if len(person['sources']) > 1 else None, # URL (last source, only when there is more than one)
                person['image'], # Photo URL
                get_personal_url(person), # Personal URL
              ]

              # Track the widest row so the header can repeat the office
              # columns often enough.
              offices = get_offices(membership)
              if len(offices) > offices_count:
                offices_count = len(offices)

              for office in offices:
                for key in ('type', 'postal', 'tel', 'fax'):
                  row.append(office.get(key))

              # If the person is associated to multiple boundaries, emit one
              # row per ward number found in the post ID.
              # NOTE(review): the first number is matched with `\d`, not
              # `\d+`, so an ID whose first ward has two digits will not
              # match and stays a single row -- confirm this is intended.
              if re.search(r'\AWards \d(?:(?:,| & | and )\d+)+\Z', person['post_id']):
                for district_id in re.findall(r'\d+', person['post_id']):
                  # Copy so each ward gets an independent list.
                  row = row[:]
                  row[0] = 'Ward %s' % district_id
                  rows.append(row)
              else:
                rows.append(row)

            rows.sort()

            headers = default_headers[:]
            for _ in range(offices_count):
              headers += office_headers

            # Use the fixed slug when one is listed, else derive it.
            if name in names:
              slug = names[name]
            else:
              slug = slugify(name)

            io = StringIO()
            body = UnicodeWriter(io, encoding='windows-1252')
            body.writerow(headers)
            body.writerows(rows)
            save('csv/%s.csv' % slug, io.getvalue())

            if offices_count > max_offices_count:
              max_offices_count = offices_count

            # Done only after the per-organization file is uploaded: the
            # same row lists are mutated to gain a leading organization
            # column for the combined file.
            for row in rows:
              row.insert(0, name)
              all_rows.append(row)
      except ImportError:
        # The handler covers the whole body above, not just the import.
        report.delete()  # delete reports for old modules

    headers = ['Organization'] + default_headers
    for _ in range(max_offices_count):
      headers += office_headers

    io = StringIO()
    body = UnicodeWriter(io, encoding='windows-1252')
    body.writerow(headers)
    body.writerows(all_rows)
    save('csv/complete.csv', io.getvalue())
예제 #2
0
    def handle(self, *args, **options):
        """Upload one CSV per jurisdiction, plus a combined CSV, to S3.

        Each report with an empty ``exception`` (other than ``_candidates``
        and ``_municipalities`` modules) yields ``csv/<slug>.csv`` holding
        one row per membership whose role is neither ``member`` nor
        ``candidate``. The same rows, prefixed with the organization name,
        are gathered into ``csv/complete.csv``. A report is deleted when an
        ``ImportError`` is raised while it is being processed.
        """
        sys.path.append(os.path.abspath('scrapers'))

        bucket = S3Connection().get_bucket('represent.opennorth.ca')

        def save(key, body):
            """Store ``body`` under ``key`` with a public-read ACL."""
            s3_key = Key(bucket)
            s3_key.key = key
            s3_key.set_contents_from_string(body)
            s3_key.set_acl('public-read')

        # Organizations whose file name is not simply slugify(name).
        slug_overrides = {
            'Parliament of Canada': 'house-of-commons',
            'Legislative Assembly of Alberta': 'alberta-legislature',
            'Legislative Assembly of British Columbia': 'bc-legislature',
            'Legislative Assembly of Manitoba': 'manitoba-legislature',
            'Legislative Assembly of New Brunswick':
            'new-brunswick-legislature',
            'Newfoundland and Labrador House of Assembly':
            'newfoundland-labrador-legislature',
            'Nova Scotia House of Assembly': 'nova-scotia-legislature',
            'Legislative Assembly of Ontario': 'ontario-legislature',
            'Legislative Assembly of Prince Edward Island': 'pei-legislature',
            'Assemblée nationale du Québec': 'quebec-assemblee-nationale',
            'Legislative Assembly of Saskatchewan': 'saskatchewan-legislature',
        }

        # Fixed columns, in the order the row values are assembled below.
        default_headers = [
            'District name',
            'Primary role',
            'Name',  # not in CSV schema
            'First name',
            'Last name',
            'Gender',
            'Party name',
            'Email',
            'Photo URL',
            'Source URL',
            'Website',
            'Facebook',
            'Instagram',
            'Twitter',
            'LinkedIn',
            'YouTube',
        ]
        # Columns repeated once per office.
        office_headers = [
            'Office type',  # not in CSV schema
            'Address',  # not in CSV schema
            'Phone',
            'Fax',
        ]
        office_fields = ('type', 'postal', 'tel', 'fax')

        # Last two host labels of a link -> social network it belongs to.
        network_by_domain = {
            'facebook.com': 'facebook',
            'fb.com': 'facebook',
            'instagram.com': 'instagram',
            'linkedin.com': 'linkedin',
            'twitter.com': 'twitter',
            'youtube.com': 'youtube',
        }
        gender_codes = {'male': 'M', 'female': 'F'}

        all_rows = []
        max_offices_count = 0

        reports = (
            Report.objects.filter(exception='')
            .exclude(module__endswith='_candidates')
            .exclude(module__endswith='_municipalities')
            .order_by('module')
        )
        for report in reports:
            # The handler deliberately spans the whole body, as before.
            try:
                metadata = module_name_to_metadata(report.module)

                rows = []
                offices_count = 0

                # Exclude party memberships and candidacies.
                memberships = (
                    Membership.objects.filter(
                        organization__jurisdiction_id=metadata[
                            'jurisdiction_id'])
                    .exclude(role__in=('member', 'candidate'))
                    .prefetch_related('contact_details', 'person',
                                      'person__links', 'person__sources')
                )
                for membership in memberships:
                    person = membership.person

                    try:
                        party_membership = Membership.objects.get(
                            organization__classification='party',
                            role='member',
                            person=person)
                    except Membership.DoesNotExist:
                        party_name = None
                    else:
                        party_name = party_membership.organization.name

                    # A later link for the same network replaces an
                    # earlier one.
                    social = dict.fromkeys(
                        ('facebook', 'instagram', 'linkedin', 'twitter',
                         'youtube'))
                    for link in person.links.all():
                        host_labels = urlsplit(link.url).netloc.split('.')
                        network = network_by_domain.get(
                            '.'.join(host_labels[-2:]))
                        if network is not None:
                            social[network] = link.url

                    gender = gender_codes.get(person.gender)

                    # The last word is the last name; without a space
                    # there is no first name.
                    given, separator, last_name = person.name.rpartition(' ')
                    first_name = given if separator else None

                    post_label = membership.post.label
                    district_name = remove_suffix_re.sub('', post_label)
                    email = next(
                        (detail.value
                         for detail in membership.contact_details.all()
                         if detail.type == 'email'),
                        None)
                    # The last source is reported only when there is more
                    # than one.
                    sources = list(person.sources.all())
                    source_url = sources[-1].url if len(sources) > 1 else None

                    # @see https://represent.opennorth.ca/api/#fields
                    row = [
                        district_name,
                        membership.role,
                        person.name,
                        first_name,
                        last_name,
                        gender,
                        party_name,
                        email,
                        person.image,
                        source_url,
                        get_personal_url(person),
                        social['facebook'],
                        social['instagram'],
                        social['twitter'],
                        social['linkedin'],
                        social['youtube'],
                    ]

                    offices = get_offices(membership)
                    offices_count = max(offices_count, len(offices))
                    for office in offices:
                        row.extend(office.get(field)
                                   for field in office_fields)

                    # A "Wards ..." post becomes one independent row per
                    # ward number.
                    if re.search(r'\AWards\b', post_label):
                        rows.extend(
                            ['Ward %s' % district_id] + row[1:]
                            for district_id in re.findall(r'\d+', post_label))
                    else:
                        rows.append(row)

                rows.sort()

                headers = default_headers[:]
                headers += office_headers * offices_count

                name = metadata['name']
                slug = (slug_overrides[name] if name in slug_overrides
                        else slugify(name))

                buffer = StringIO()
                writer = csv.writer(buffer)
                writer.writerow(headers)
                writer.writerows(rows)
                save('csv/%s.csv' % slug,
                     codecs.encode(buffer.getvalue(), 'windows-1252'))

                max_offices_count = max(max_offices_count, offices_count)

                # After the upload, the same row lists gain a leading
                # organization column for the combined file.
                for row in rows:
                    row.insert(0, name)
                all_rows.extend(rows)
            except ImportError:
                report.delete()  # delete reports for old modules

        headers = ['Organization'] + default_headers
        headers += office_headers * max_offices_count

        buffer = StringIO()
        writer = csv.writer(buffer)
        writer.writerow(headers)
        writer.writerows(all_rows)
        save('csv/complete.csv',
             codecs.encode(buffer.getvalue(), 'windows-1252'))
예제 #3
0
def represent(request, module_name):
    """Return the people scraped by a module as a JSON array.

    Args:
        request: The incoming HTTP request; it is not inspected.
        module_name: Name of the scraper module. A name ending in
            ``_candidates`` selects memberships with the ``candidate`` role;
            any other name selects memberships that are neither ``member``
            (party) nor ``candidate``.

    Returns:
        An ``HttpResponse`` with content type ``application/json`` whose body
        is a list of objects, one per membership (or one per ward number for
        posts labelled ``Wards ...``).
    """
    sys.path.append(os.path.abspath('scrapers'))

    metadata = module_name_to_metadata(module_name)

    representatives = []

    # Start from every membership in the module's jurisdiction.
    queryset = Membership.objects.filter(organization__jurisdiction_id=metadata['jurisdiction_id'])

    # QuerySet methods return a new QuerySet rather than modifying the
    # receiver, so the result has to be assigned for the role filter to
    # take effect.
    if module_name.endswith('_candidates'):
        # Include only candidates.
        queryset = queryset.filter(role='candidate')
    else:
        # Exclude candidates and party memberships.
        queryset = queryset.exclude(role__in=('member', 'candidate'))

    for membership in queryset.prefetch_related('contact_details', 'person', 'person__links', 'person__sources', 'post'):
        person = membership.person

        # Not sure why this is necessary: extras sometimes arrive as a JSON
        # string instead of a dict.
        if not isinstance(membership.extras, dict):
            membership.extras = json.loads(membership.extras)
        if not isinstance(person.extras, dict):
            person.extras = json.loads(person.extras)

        try:
            party_name = Membership.objects.select_related('organization').get(organization__classification='party', role='member', person=person).organization.name
        except Membership.DoesNotExist:
            party_name = None

        if person.gender == 'male':
            gender = 'M'
        elif person.gender == 'female':
            gender = 'F'
        else:
            gender = None

        # Candidates only. Popped before get_extra(person) runs below, so
        # 'incumbent' is no longer present in person.extras at that point.
        incumbent = person.extras.pop('incumbent', None)

        # @see https://represent.opennorth.ca/api/#fields
        representative = {
            'name': person.name,
            'elected_office': membership.role,
            'party_name': party_name,
            'email': next((contact_detail.value for contact_detail in membership.contact_details.all() if contact_detail.type == 'email'), None),
            'photo_url': person.image or None,
            'personal_url': get_personal_url(person),
            'gender': gender,
            'offices': json.dumps(get_offices(membership)),
            'extra': json.dumps(get_extra(person)),
        }

        sources = list(person.sources.all())

        # The first URL ought to be the most generic source.
        representative['source_url'] = sources[0].url

        if len(sources) > 1:
            # The last URL ought to be the most specific source.
            representative['url'] = sources[-1].url

        if incumbent:
            representative['incumbent'] = True

        match = re.search(r'^(\S+) (Ward \d+)$', membership.post.label)

        # If the person is part of Peel Regional Council.
        if match:
            parent = Division.objects.get(subtype1='csd', subtype2='', name=match.group(1))
            division = Division.objects.get(subtype1='csd', subid1=parent.subid1, name=match.group(2))
            boundary_set_slug = '{}-wards'.format(parent.name.lower())
            representative['district_name'] = membership.post.label
            representative['boundary_url'] = '/boundaries/{}/ward-{}/'.format(boundary_set_slug, division.subid2)
            representatives.append(representative)

        # If the person is associated to multiple boundaries.
        elif re.search(r'^Wards\b', membership.post.label):
            for district_id in re.findall(r'\d+', membership.post.label):
                # Copy so that each ward gets an independent dict.
                representative = representative.copy()
                representative['district_id'] = district_id
                representative['district_name'] = 'Ward {}'.format(district_id)
                representatives.append(representative)

        else:
            division_id = metadata['division_id']

            # Census subdivisions carry a 7-digit code, census divisions a
            # 4-digit code; other divisions have no geographic code.
            if re.search(r'^ocd-division/country:ca/csd:(\d{7})\Z', division_id):
                geographic_code = division_id[-7:]
            elif re.search(r'^ocd-division/country:ca/cd:(\d{4})\Z', division_id):
                geographic_code = division_id[-4:]
            else:
                geographic_code = None

            post_label = remove_suffix_re.sub('', membership.post.label)

            # If the post label is numeric.
            if re.search(r'^\d+\Z', post_label):
                representative['district_id'] = post_label

            # If the person has a boundary URL.
            elif 'boundary_url' in membership.extras:
                representative['district_name'] = post_label
                representative['boundary_url'] = membership.extras['boundary_url']

            # If the post label is a Census geographic name.
            elif post_label == metadata['division_name'] and geographic_code:
                representative['district_name'] = post_label
                if len(geographic_code) == 7:
                    representative['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(geographic_code)
                elif len(geographic_code) == 4:
                    representative['boundary_url'] = '/boundaries/census-divisions/{}/'.format(geographic_code)

            else:
                representative['district_name'] = post_label
                district_id = re.search(r'^(?:District|Division|Ward) (\d+)\Z', post_label)
                if district_id:
                    representative['district_id'] = district_id.group(1)

            representatives.append(representative)

    return HttpResponse(json.dumps(representatives), content_type='application/json')
예제 #4
0
        def process(report, *, candidates=False):
            """Build and upload the CSV for one report.

            ``self`` and ``save`` are taken from the enclosing scope.

            Args:
                report: Report whose ``module`` names the scraper to export.
                candidates: When true, export memberships with the
                    ``candidate`` role; otherwise export memberships that
                    are neither ``member`` nor ``candidate``.

            Returns:
                ``[rows, offices_count]``. After a successful upload each
                row starts with the organization name. If ``ImportError``
                is raised part-way, the report is deleted and whatever had
                been accumulated so far is returned.
            """
            rows = []
            offices_count = 0

            # Last two host labels of a link -> social network it belongs to.
            network_by_domain = {
                'facebook.com': 'facebook',
                'fb.com': 'facebook',
                'instagram.com': 'instagram',
                'linkedin.com': 'linkedin',
                'twitter.com': 'twitter',
                'youtube.com': 'youtube',
            }
            gender_codes = {'male': 'M', 'female': 'F'}
            office_fields = ('type', 'postal', 'tel', 'fax')

            # The handler deliberately spans the whole body, as before.
            try:
                metadata = module_name_to_metadata(report.module)

                memberships = Membership.objects.filter(
                    organization__jurisdiction_id=metadata['jurisdiction_id'])
                if candidates:
                    memberships = memberships.filter(role='candidate')
                else:
                    # Exclude party memberships and candidacies.
                    memberships = memberships.exclude(
                        role__in=('member', 'candidate'))
                memberships = memberships.prefetch_related(
                    'contact_details', 'person', 'person__links',
                    'person__sources')

                for membership in memberships:
                    person = membership.person

                    try:
                        party_membership = Membership.objects.get(
                            organization__classification='party',
                            role='member',
                            person=person)
                    except Membership.DoesNotExist:
                        party_name = None
                    else:
                        party_name = party_membership.organization.name

                    # A later link for the same network replaces an
                    # earlier one.
                    social = dict.fromkeys(
                        ('facebook', 'instagram', 'linkedin', 'twitter',
                         'youtube'))
                    for link in person.links.all():
                        host_labels = urlsplit(link.url).netloc.split('.')
                        network = network_by_domain.get(
                            '.'.join(host_labels[-2:]))
                        if network is not None:
                            social[network] = link.url

                    gender = gender_codes.get(person.gender)

                    # The last word is the last name; without a space
                    # there is no first name.
                    given, separator, last_name = person.name.rpartition(' ')
                    first_name = given if separator else None

                    post_label = membership.post.label
                    district_name = remove_suffix_re.sub('', post_label)
                    email = next(
                        (detail.value
                         for detail in membership.contact_details.all()
                         if detail.type == 'email'),
                        None)
                    # The last source is reported only when there is more
                    # than one.
                    sources = list(person.sources.all())
                    source_url = sources[-1].url if len(sources) > 1 else None

                    # @see https://represent.opennorth.ca/api/#fields
                    row = [
                        district_name,
                        membership.role,
                        person.name,
                        first_name,
                        last_name,
                        gender,
                        party_name,
                        email,
                        person.image,
                        source_url,
                        get_personal_url(person),
                        social['facebook'],
                        social['instagram'],
                        social['twitter'],
                        social['linkedin'],
                        social['youtube'],
                    ]

                    offices = get_offices(membership)
                    offices_count = max(offices_count, len(offices))
                    for office in offices:
                        row.extend(office.get(field)
                                   for field in office_fields)

                    # A "Wards ..." post becomes one independent row per
                    # ward number.
                    if re.search(r'\AWards\b', post_label):
                        rows.extend(
                            ['Ward {}'.format(district_id)] + row[1:]
                            for district_id in re.findall(r'\d+', post_label))
                    else:
                        rows.append(row)

                rows.sort()

                headers = self.default_headers[:]
                headers += self.office_headers * offices_count

                name = metadata['name']
                slug = (self.names[name] if name in self.names
                        else slugify(name))

                buffer = StringIO()
                writer = csv.writer(buffer)
                writer.writerow(headers)
                writer.writerows(rows)
                subdirectory = 'candidates' if candidates else 'representatives'
                save('csv/{}/{}.csv'.format(subdirectory, slug), buffer)

                # After the upload, prefix every row with the organization
                # name for the caller.
                for row in rows:
                    row.insert(0, name)
            except ImportError:
                report.delete()  # delete reports for old modules

            return [rows, offices_count]
예제 #5
0
def represent(request, module_name):
    """Return the people scraped by a module as a JSON array.

    Args:
        request: The incoming HTTP request; it is not inspected.
        module_name: Name of the scraper module. A name ending in
            ``_candidates`` selects memberships with the ``candidate`` role;
            any other name selects memberships that are neither ``member``
            (party) nor ``candidate``.

    Returns:
        An ``HttpResponse`` with content type ``application/json`` whose body
        is a list of objects, one per membership (or one per ward number for
        posts labelled ``Wards ...``).
    """
    sys.path.append(os.path.abspath('scrapers'))

    metadata = module_name_to_metadata(module_name)

    representatives = []

    # Start from every membership in the module's jurisdiction.
    queryset = Membership.objects.filter(organization__jurisdiction_id=metadata['jurisdiction_id'])

    # QuerySet methods return a new QuerySet rather than modifying the
    # receiver, so the result has to be assigned for the role filter to
    # take effect.
    if module_name.endswith('_candidates'):
        # Include only candidates.
        queryset = queryset.filter(role='candidate')
    else:
        # Exclude candidates and party memberships.
        queryset = queryset.exclude(role__in=('member', 'candidate'))

    for membership in queryset.prefetch_related('contact_details', 'person', 'person__links', 'person__sources'):
        person = membership.person

        try:
            party_name = Membership.objects.get(organization__classification='party', role='member', person=person).organization.name
        except Membership.DoesNotExist:
            party_name = None

        if person.gender == 'male':
            gender = 'M'
        elif person.gender == 'female':
            gender = 'F'
        else:
            gender = None

        # Popped before get_extra(person) runs below, so 'incumbent' is no
        # longer present in person.extras at that point.
        incumbent = person.extras.pop('incumbent', None)

        # @see https://represent.opennorth.ca/api/#fields
        representative = {
            'name': person.name,
            'elected_office': membership.role,
            'party_name': party_name,
            'email': next((contact_detail.value for contact_detail in membership.contact_details.all() if contact_detail.type == 'email'), None),
            'photo_url': person.image or None,
            'personal_url': get_personal_url(person),
            'gender': gender,
            'offices': json.dumps(get_offices(membership)),
            'extra': json.dumps(get_extra(person)),
        }

        # The first source URL is omitted when it is longer than 200
        # characters.
        # @see https://github.com/opennorth/represent-canada/issues/81
        sources = list(person.sources.all())
        if len(sources[0].url) <= 200:
            representative['source_url'] = sources[0].url

        # The last source is reported only when there is more than one.
        if len(sources) > 1:
            representative['url'] = sources[-1].url

        if incumbent:
            representative['incumbent'] = True

        match = re.search(r'^(\S+) (Ward \d+)$', membership.post.label)

        # If the person is part of Peel Regional Council.
        if match:
            parent = Division.objects.get(subtype1='csd', subtype2='', name=match.group(1))
            division = Division.objects.get(subtype1='csd', subid1=parent.subid1, name=match.group(2))
            # First boundary mapping whose prefix starts with the parent
            # division's ID; None when no mapping matches.
            boundary_set_slug = next((k for k, v in settings.IMAGO_BOUNDARY_MAPPINGS.items() if v['prefix'].startswith(parent.id)), None)
            representative['district_name'] = membership.post.label
            representative['boundary_url'] = '/boundaries/{}/ward-{}/'.format(boundary_set_slug, division.subid2)
            representatives.append(representative)
        # If the person is associated to multiple boundaries.
        elif re.search(r'^Wards\b', membership.post.label):
            for district_id in re.findall(r'\d+', membership.post.label):
                # Copy so that each ward gets an independent dict.
                representative = representative.copy()
                representative['district_id'] = district_id
                representative['district_name'] = 'Ward %s' % district_id
                representatives.append(representative)
        else:
            division_id = metadata['division_id']
            # Census subdivisions carry a 7-digit code, census divisions a
            # 4-digit code; other divisions have no geographic code.
            if re.search(r'^ocd-division/country:ca/csd:(\d{7})\Z', division_id):
                geographic_code = division_id[-7:]
            elif re.search(r'^ocd-division/country:ca/cd:(\d{4})\Z', division_id):
                geographic_code = division_id[-4:]
            else:
                geographic_code = None
            post_label = remove_suffix_re.sub('', membership.post.label)
            # If the post label is numeric.
            if re.search(r'^\d+\Z', post_label):
                representative['district_id'] = post_label
            # If the person has a boundary URL.
            elif membership.extras.get('boundary_url'):
                representative['district_name'] = post_label
                representative['boundary_url'] = membership.extras['boundary_url']
            # If the post label is a census subdivision.
            elif post_label == metadata['division_name'] and geographic_code:
                representative['district_name'] = post_label
                if len(geographic_code) == 7:
                    representative['boundary_url'] = '/boundaries/census-subdivisions/%s/' % geographic_code
                elif len(geographic_code) == 4:
                    representative['boundary_url'] = '/boundaries/census-divisions/%s/' % geographic_code
            else:
                representative['district_name'] = post_label
                district_id = re.search(r'^(?:District|Division|Ward) (\d+)\Z', post_label)
                if district_id:
                    representative['district_id'] = district_id.group(1)

            representatives.append(representative)

    return HttpResponse(json.dumps(representatives), content_type='application/json')
예제 #6
0
def represent(request, module_name):
    """Serve a scraper module's memberships as a JSON list of representatives.

    Args:
        request: The incoming HTTP request (not read by this view).
        module_name: Name of the scraper module. When it ends with
            ``_candidates`` only memberships with the ``candidate`` role are
            returned; otherwise ``member`` and ``candidate`` memberships are
            left out.

    Returns:
        An ``HttpResponse`` whose body is the JSON-encoded list of
        representative dicts, served as ``application/json``.
    """
    # Register the scrapers directory only once; appending on every request
    # would keep adding duplicate entries to sys.path.
    scrapers_path = os.path.abspath('scrapers')
    if scrapers_path not in sys.path:
        sys.path.append(scrapers_path)

    metadata = module_name_to_metadata(module_name)

    representatives = []

    queryset = Membership.objects.filter(
        organization__jurisdiction_id=metadata['jurisdiction_id'])

    # QuerySet refinements return a *new* QuerySet, so the result has to be
    # re-bound; calling filter()/exclude() for effect alone changes nothing.
    if module_name.endswith('_candidates'):
        # Include only candidates.
        queryset = queryset.filter(role='candidate')
    else:
        # Exclude candidates and party memberships.
        queryset = queryset.exclude(role__in=('member', 'candidate'))

    for membership in queryset.prefetch_related('contact_details', 'person',
                                                'person__links',
                                                'person__sources'):
        person = membership.person

        # Only the "no party membership" case is handled here; several
        # matching party memberships would raise MultipleObjectsReturned.
        try:
            party_name = Membership.objects.get(
                organization__classification='party',
                role='member',
                person=person).organization.name
        except Membership.DoesNotExist:
            party_name = None

        if person.gender == 'male':
            gender = 'M'
        elif person.gender == 'female':
            gender = 'F'
        else:
            gender = None

        # Popped before get_extra(person) is called below, so the flag is
        # removed from person.extras by the time the extras are serialized.
        incumbent = person.extras.pop('incumbent', None)

        # @see https://represent.opennorth.ca/api/#fields
        representative = {
            'name': person.name,
            'elected_office': membership.role,
            'party_name': party_name,
            'email': next(
                (contact_detail.value
                 for contact_detail in membership.contact_details.all()
                 if contact_detail.type == 'email'), None),
            'photo_url': person.image or None,
            'personal_url': get_personal_url(person),
            'gender': gender,
            'offices': json.dumps(get_offices(membership)),
            'extra': json.dumps(get_extra(person)),
        }

        # @see https://github.com/opennorth/represent-canada/issues/81
        # NOTE(review): sources[0] raises IndexError when a person has no
        # sources; this assumes every person has at least one - confirm.
        sources = list(person.sources.all())
        if len(sources[0].url) <= 200:
            representative['source_url'] = sources[0].url

        if len(sources) > 1:
            representative['url'] = sources[-1].url

        if incumbent:
            representative['incumbent'] = True

        # Labels of the form "<word> Ward <number>".
        match = re.search(r'^(\S+) (Ward \d+)$', membership.post.label)

        # If the person is part of Peel Regional Council.
        if match:
            parent = Division.objects.get(subtype1='csd',
                                          subtype2='',
                                          name=match.group(1))
            division = Division.objects.get(subtype1='csd',
                                            subid1=parent.subid1,
                                            name=match.group(2))
            # First boundary set whose prefix starts with the parent
            # division's ID, or None when no mapping matches.
            boundary_set_slug = next(
                (k for k, v in settings.IMAGO_BOUNDARY_MAPPINGS.items()
                 if v['prefix'].startswith(parent.id)), None)
            representative['district_name'] = membership.post.label
            representative['boundary_url'] = '/boundaries/{}/ward-{}/'.format(
                boundary_set_slug, division.subid2)
            representatives.append(representative)
        # If the person is associated to multiple boundaries.
        elif re.search(r'^Wards\b', membership.post.label):
            # Emit one copy of the representative per ward number.
            for district_id in re.findall(r'\d+', membership.post.label):
                representative = representative.copy()
                representative['district_id'] = district_id
                representative['district_name'] = 'Ward %s' % district_id
                representatives.append(representative)
        else:
            division_id = metadata['division_id']
            # Raw strings: \d and \Z are regex escapes, not string escapes.
            if re.search(r'^ocd-division/country:ca/csd:(\d{7})\Z',
                         division_id):
                geographic_code = division_id[-7:]
            elif re.search(r'^ocd-division/country:ca/cd:(\d{4})\Z',
                           division_id):
                geographic_code = division_id[-4:]
            else:
                geographic_code = None
            post_label = remove_suffix_re.sub('', membership.post.label)
            # If the post label is numeric.
            if re.search(r'^\d+\Z', post_label):
                representative['district_id'] = post_label
            # If the person has a boundary URL.
            elif membership.extras.get('boundary_url'):
                representative['district_name'] = post_label
                representative['boundary_url'] = membership.extras[
                    'boundary_url']
            # If the post label is a census subdivision.
            elif post_label == metadata['division_name'] and geographic_code:
                representative['district_name'] = post_label
                if len(geographic_code) == 7:
                    representative['boundary_url'] = (
                        '/boundaries/census-subdivisions/%s/' % geographic_code)
                elif len(geographic_code) == 4:
                    representative['boundary_url'] = (
                        '/boundaries/census-divisions/%s/' % geographic_code)
            else:
                representative['district_name'] = post_label
                district_id = re.search(r'^(?:District|Division|Ward) (\d+)\Z',
                                        post_label)
                if district_id:
                    representative['district_id'] = district_id.group(1)

            representatives.append(representative)

    return HttpResponse(json.dumps(representatives),
                        content_type='application/json')
예제 #7
0
    def handle(self, *args, **options):
        """Publish membership data as CSV files to the S3 bucket.

        For every report with an empty exception (skipping modules ending in
        ``_candidates`` or ``_municipalities``) one ``csv/<slug>.csv`` file is
        uploaded, followed by a combined ``csv/complete.csv``. Files are
        encoded as Windows-1252 and given a ``public-read`` ACL. A report
        whose processing raises ``ImportError`` is deleted.
        """
        def upload(path, payload):
            # Store the payload under the given key and make it public.
            s3_key = Key(bucket)
            s3_key.key = path
            s3_key.set_contents_from_string(payload)
            s3_key.set_acl('public-read')

        def publish_csv(path, header_row, data_rows):
            # Build the CSV in memory, then upload it as Windows-1252 bytes.
            buffer = StringIO()
            writer = csv.writer(buffer)
            writer.writerow(header_row)
            writer.writerows(data_rows)
            upload(path, codecs.encode(buffer.getvalue(), 'windows-1252'))

        sys.path.append(os.path.abspath('scrapers'))

        bucket = S3Connection().get_bucket('represent.opennorth.ca')

        # Organization names whose file slug is fixed rather than slugified.
        names = {
            'Parliament of Canada': 'house-of-commons',
            'Legislative Assembly of Alberta': 'alberta-legislature',
            'Legislative Assembly of British Columbia': 'bc-legislature',
            'Legislative Assembly of Manitoba': 'manitoba-legislature',
            'Legislative Assembly of New Brunswick': 'new-brunswick-legislature',
            'Newfoundland and Labrador House of Assembly': 'newfoundland-labrador-legislature',
            'Nova Scotia House of Assembly': 'nova-scotia-legislature',
            'Legislative Assembly of Ontario': 'ontario-legislature',
            'Legislative Assembly of Prince Edward Island': 'pei-legislature',
            'Assemblée nationale du Québec': 'quebec-assemblee-nationale',
            'Legislative Assembly of Saskatchewan': 'saskatchewan-legislature',
        }

        default_headers = [
            'District name',
            'Primary role',
            'Name',  # not in CSV schema
            'First name',
            'Last name',
            'Gender',
            'Party name',
            'Email',
            'Photo URL',
            'Source URL',
            'Website',
            'Facebook',
            'Instagram',
            'Twitter',
            'LinkedIn',
            'YouTube',
        ]
        office_headers = [
            'Office type',  # not in CSV schema
            'Address',  # not in CSV schema
            'Phone',
            'Fax',
        ]
        office_keys = ('type', 'postal', 'tel', 'fax')

        # Registered domain (last two host labels) -> social network column.
        social_domains = {
            'facebook.com': 'facebook',
            'fb.com': 'facebook',
            'instagram.com': 'instagram',
            'linkedin.com': 'linkedin',
            'twitter.com': 'twitter',
            'youtube.com': 'youtube',
        }
        gender_codes = {'male': 'M', 'female': 'F'}

        all_rows = []
        max_offices_count = 0

        reports = (
            Report.objects.filter(exception='')
            .exclude(module__endswith='_candidates')
            .exclude(module__endswith='_municipalities')
            .order_by('module')
        )
        for report in reports:
            try:
                metadata = module_name_to_metadata(report.module)

                rows = []
                offices_count = 0

                # Exclude party memberships.
                memberships = Membership.objects.filter(
                    organization__jurisdiction_id=metadata['jurisdiction_id'],
                ).exclude(
                    role__in=('member', 'candidate'),
                ).prefetch_related('contact_details', 'person', 'person__links', 'person__sources')

                for membership in memberships:
                    person = membership.person

                    try:
                        party_membership = Membership.objects.get(
                            organization__classification='party',
                            role='member',
                            person=person,
                        )
                        party_name = party_membership.organization.name
                    except Membership.DoesNotExist:
                        party_name = None

                    # A later link for the same network replaces an earlier one.
                    social = dict.fromkeys(('facebook', 'instagram', 'linkedin', 'twitter', 'youtube'))
                    for link in person.links.all():
                        host_labels = urlsplit(link.url).netloc.split('.')
                        network = social_domains.get('.'.join(host_labels[-2:]))
                        if network:
                            social[network] = link.url

                    gender = gender_codes.get(person.gender)

                    # Split on the last space; a single-word name is a last name.
                    first_name, separator, last_name = person.name.rpartition(' ')
                    if not separator:
                        first_name = None

                    email = next(
                        (detail.value for detail in membership.contact_details.all() if detail.type == 'email'),
                        None,
                    )

                    # @see https://represent.opennorth.ca/api/#fields
                    sources = list(person.sources.all())
                    row = [
                        remove_suffix_re.sub('', membership.post.label),  # District name
                        membership.role,  # Primary role
                        person.name,  # Name
                        first_name,  # First name
                        last_name,  # Last name
                        gender,  # Gender
                        party_name,  # Party name
                        email,  # Email
                        person.image,  # Photo URL
                        sources[-1].url if len(sources) > 1 else None,  # Source URL
                        get_personal_url(person),  # Website
                        social['facebook'],  # Facebook
                        social['instagram'],  # Instagram
                        social['twitter'],  # Twitter
                        social['linkedin'],  # LinkedIn
                        social['youtube'],  # YouTube
                    ]

                    offices = get_offices(membership)
                    offices_count = max(offices_count, len(offices))

                    # One group of office columns per office, in header order.
                    row.extend(office.get(key) for office in offices for key in office_keys)

                    # If the person is associated to multiple boundaries.
                    if re.search(r'\AWards\b', membership.post.label):
                        rows.extend(
                            ['Ward %s' % district_id] + row[1:]
                            for district_id in re.findall(r'\d+', membership.post.label)
                        )
                    else:
                        rows.append(row)

                rows.sort()

                headers = default_headers + office_headers * offices_count

                name = metadata['name']
                slug = names[name] if name in names else slugify(name)

                publish_csv('csv/%s.csv' % slug, headers, rows)

                max_offices_count = max(max_offices_count, offices_count)

                # The combined file carries the organization as first column.
                all_rows.extend([name] + row for row in rows)
            except ImportError:
                report.delete()  # delete reports for old modules

        headers = ['Organization'] + default_headers + office_headers * max_offices_count

        publish_csv('csv/complete.csv', headers, all_rows)
예제 #8
0
def represent(request, module_name):
    """Serve a scraper module's memberships as a JSON list of representatives.

    Args:
        request: The incoming HTTP request (not read by this view).
        module_name: Name of the scraper module. When it ends with
            ``_candidates`` only memberships with the ``candidate`` role are
            returned; otherwise ``member`` and ``candidate`` memberships are
            left out.

    Returns:
        An ``HttpResponse`` whose body is the JSON-encoded list of
        representative dicts, served as ``application/json``.
    """
    # Register the scrapers directory only once; appending on every request
    # would keep adding duplicate entries to sys.path.
    scrapers_path = os.path.abspath('scrapers')
    if scrapers_path not in sys.path:
        sys.path.append(scrapers_path)

    metadata = module_name_to_metadata(module_name)

    representatives = []

    queryset = Membership.objects.filter(
        organization__jurisdiction_id=metadata['jurisdiction_id'])

    # QuerySet refinements return a *new* QuerySet, so the result has to be
    # re-bound; calling filter()/exclude() for effect alone changes nothing.
    if module_name.endswith('_candidates'):
        # Include only candidates.
        queryset = queryset.filter(role='candidate')
    else:
        # Exclude candidates and party memberships.
        queryset = queryset.exclude(role__in=('member', 'candidate'))

    for membership in queryset.prefetch_related('contact_details', 'person',
                                                'person__links',
                                                'person__sources', 'post'):
        person = membership.person

        # Not sure why this is necessary.
        if not isinstance(membership.extras, dict):
            membership.extras = json.loads(membership.extras)
        if not isinstance(person.extras, dict):
            person.extras = json.loads(person.extras)

        # Only the "no party membership" case is handled here; several
        # matching party memberships would raise MultipleObjectsReturned.
        try:
            party_name = Membership.objects.select_related('organization').get(
                organization__classification='party',
                role='member',
                person=person).organization.name
        except Membership.DoesNotExist:
            party_name = None

        if person.gender == 'male':
            gender = 'M'
        elif person.gender == 'female':
            gender = 'F'
        else:
            gender = None

        # Candidates only. Popped before get_extra(person) is called below,
        # so the flag is removed from person.extras before serialization.
        incumbent = person.extras.pop('incumbent', None)

        # @see https://represent.opennorth.ca/api/#fields
        representative = {
            'name': person.name,
            'elected_office': membership.role,
            'party_name': party_name,
            'email': next(
                (contact_detail.value
                 for contact_detail in membership.contact_details.all()
                 if contact_detail.type == 'email'), None),
            'photo_url': person.image or None,
            'personal_url': get_personal_url(person),
            'gender': gender,
            'offices': json.dumps(get_offices(membership)),
            'extra': json.dumps(get_extra(person)),
        }

        sources = list(person.sources.all())

        # The first URL ought to be the most generic source.
        # NOTE(review): sources[0] raises IndexError when a person has no
        # sources; this assumes every person has at least one - confirm.
        representative['source_url'] = sources[0].url

        if len(sources) > 1:
            # The last URL ought to be the most specific source.
            representative['url'] = sources[-1].url

        if incumbent:
            representative['incumbent'] = True

        # Labels of the form "<word> Ward <number>".
        match = re.search(r'^(\S+) (Ward \d+)$', membership.post.label)

        # If the person is part of Peel Regional Council.
        if match:
            parent = Division.objects.get(subtype1='csd',
                                          subtype2='',
                                          name=match.group(1))
            division = Division.objects.get(subtype1='csd',
                                            subid1=parent.subid1,
                                            name=match.group(2))
            boundary_set_slug = '{}-wards'.format(parent.name.lower())
            representative['district_name'] = membership.post.label
            representative['boundary_url'] = '/boundaries/{}/ward-{}/'.format(
                boundary_set_slug, division.subid2)
            representatives.append(representative)

        # If the person is associated to multiple boundaries.
        elif re.search(r'^Wards\b', membership.post.label):
            # Emit one copy of the representative per ward number.
            for district_id in re.findall(r'\d+', membership.post.label):
                representative = representative.copy()
                representative['district_id'] = district_id
                representative['district_name'] = 'Ward {}'.format(district_id)
                representatives.append(representative)

        else:
            division_id = metadata['division_id']

            # Raw strings: \d and \Z are regex escapes, not string escapes.
            if re.search(r'^ocd-division/country:ca/csd:(\d{7})\Z',
                         division_id):
                geographic_code = division_id[-7:]
            elif re.search(r'^ocd-division/country:ca/cd:(\d{4})\Z',
                           division_id):
                geographic_code = division_id[-4:]
            else:
                geographic_code = None

            post_label = remove_suffix_re.sub('', membership.post.label)

            # If the post label is numeric.
            if re.search(r'^\d+\Z', post_label):
                representative['district_id'] = post_label

            # If the person has a boundary URL.
            elif 'boundary_url' in membership.extras:
                representative['district_name'] = post_label
                representative['boundary_url'] = membership.extras[
                    'boundary_url']

            # If the post label is a Census geographic name.
            elif post_label == metadata['division_name'] and geographic_code:
                representative['district_name'] = post_label
                if len(geographic_code) == 7:
                    representative['boundary_url'] = (
                        '/boundaries/census-subdivisions/{}/'.format(
                            geographic_code))
                elif len(geographic_code) == 4:
                    representative['boundary_url'] = (
                        '/boundaries/census-divisions/{}/'.format(
                            geographic_code))

            else:
                representative['district_name'] = post_label
                district_id = re.search(r'^(?:District|Division|Ward) (\d+)\Z',
                                        post_label)
                if district_id:
                    representative['district_id'] = district_id.group(1)

            representatives.append(representative)

    return HttpResponse(json.dumps(representatives),
                        content_type='application/json')