def test_any_constituency_csv(self):
     """Check the CSV export of ``self.ballot``.

     Creates memberships for the ballot, requests the ballot URL with a
     ``.csv`` suffix, expects nine rows, and compares the first row with
     values taken from the ballot's first membership when ordered by
     person primary key.
     """
     self.create_memberships(self.ballot, self.parties)
     ballot_path = self.ballot.get_absolute_url().rstrip("/")
     response = self.app.get("{}.csv".format(ballot_path))
     self.assertEqual(response.status_code, 200)

     row_dicts = list(BufferDictReader(response.content))
     self.assertEqual(9, len(row_dicts))

     membership = self.ballot.membership_set.order_by("person__pk").first()
     person = membership.person
     party = membership.party
     election = self.ballot.election
     post = self.ballot.post

     # Show the complete dict diff if the comparison below fails.
     self.maxDiff = None
     first_row = dict(row_dicts[0])
     expected_row = {
         "blog_url": "",
         "birth_date": "",
         "cancelled_poll": "False",
         "elected": "",
         "election": election.slug,
         "election_current": "True",
         "election_date": election.election_date.isoformat(),
         "email": "",
         "facebook_page_url": "",
         "facebook_personal_url": "",
         "favourite_biscuits": "",
         "gender": "",
         "gss_code": "",
         "homepage_url": "",
         "honorific_prefix": "",
         "honorific_suffix": "",
         "id": str(person.pk),
         "image_copyright": "",
         "image_uploading_user": "",
         "image_uploading_user_notes": "",
         "image_url": "",
         "instagram_url": "",
         "linkedin_url": "",
         "mapit_url": "",
         "name": person.name,
         "old_person_ids": "",
         "parlparse_id": "",
         "party_ec_id": party.ec_id,
         "party_id": party.legacy_slug,
         "party_lists_in_use": "False",
         "party_list_position": "",
         "party_name": party.name,
         "party_ppc_page_url": "",
         "post_id": post.slug,
         "post_label": post.label,
         "proxy_image_url_template": "",
         "theyworkforyou_url": "",
         "twitter_username": "",
         "twitter_user_id": "",
         "wikipedia_url": "",
         "wikidata_id": "",
         "youtube_profile": "",
     }
     self.assertDictEqual(first_row, expected_row)
Пример #2
0
 def test_any_constituency_csv(self):
     """Check the CSV export for ``self.dulwich_post_pee``.

     Requests the object's URL with a ``.csv`` suffix, expects two rows,
     and compares the second row with the hard-coded values below.
     """
     base_path = self.dulwich_post_pee.get_absolute_url().rstrip("/")
     response = self.app.get("{}.csv".format(base_path))
     self.assertEqual(response.status_code, 200)

     row_dicts = list(BufferDictReader(response.content))
     self.assertEqual(2, len(row_dicts))

     second_row = dict(row_dicts[1])
     expected_row = {
         "birth_date": "",
         "cancelled_poll": "False",
         "elected": "",
         "election": "parl.2015-05-07",
         "election_current": "True",
         "election_date": text_type(date_in_near_future),
         "email": "",
         "facebook_page_url": "",
         "facebook_personal_url": "",
         "favourite_biscuits": "",
         "gender": "",
         "gss_code": "",
         "homepage_url": "",
         "honorific_prefix": "",
         "honorific_suffix": "",
         "id": "2009",
         "image_copyright": "",
         "image_uploading_user": "",
         "image_uploading_user_notes": "",
         "image_url": "",
         "linkedin_url": "",
         "mapit_url": "",
         "name": "Tessa Jowell",
         "old_person_ids": "",
         "parlparse_id": "",
         "party_ec_id": "PP53",
         "party_id": "party:53",
         "party_lists_in_use": "False",
         "party_list_position": "",
         "party_name": "Labour Party",
         "party_ppc_page_url": "",
         "post_id": "65808",
         "post_label": "Dulwich and West Norwood",
         "proxy_image_url_template": "",
         "theyworkforyou_url": "",
         "twitter_username": "",
         "twitter_user_id": "",
         "wikipedia_url": "",
         "wikidata_url": "",
     }
     self.assertDictEqual(second_row, expected_row)
 def handle(self, *args, **options):
     """Download a CSV and hand each usable row to ``process_line``.

     The single positional argument is the CSV URL. Every value in a row
     is passed through ``strip``; rows whose 'Election ID' or 'GSS Code'
     is empty after that are skipped.
     """
     (csv_url,) = args
     response = requests.get(csv_url)
     # Force UTF-8 before reading ``.text`` rather than relying on the
     # encoding requests would otherwise pick.
     response.encoding = 'utf-8'
     for raw_row in BufferDictReader(response.text):
         cleaned_line = {
             column: strip(value) for column, value in raw_row.items()
         }
         if not cleaned_line['Election ID'] or not cleaned_line['GSS Code']:
             continue
         self.process_line(cleaned_line)
Пример #4
0
 def test_any_constituency_csv(self):
     """Check the CSV export for the Dulwich and West Norwood post.

     Requests the fixed 2015 CSV URL, expects exactly one row, and
     compares that row with the hard-coded values below.
     """
     csv_path = '/election/2015/post/65808/dulwich-and-west-norwood.csv'
     response = self.app.get(csv_path)

     row_dicts = list(BufferDictReader(response.content))
     self.assertEqual(1, len(row_dicts))

     expected_row = {
         'birth_date': '',
         'elected': '',
         'election': '2015',
         'election_current': 'True',
         'election_date': text_type(date_in_near_future),
         'email': '',
         'facebook_page_url': '',
         'facebook_personal_url': '',
         'favourite_biscuits': '',
         'gender': '',
         'gss_code': '',
         'homepage_url': '',
         'honorific_prefix': '',
         'honorific_suffix': '',
         'id': '2009',
         'image_copyright': '',
         'image_uploading_user': '',
         'image_uploading_user_notes': '',
         'image_url': '',
         'linkedin_url': '',
         'mapit_url': '',
         'name': 'Tessa Jowell',
         'old_person_ids': '',
         'parlparse_id': '',
         'party_ec_id': 'PP53',
         'party_id': 'party:53',
         'party_lists_in_use': 'False',
         'party_list_position': '',
         'party_name': 'Labour Party',
         'party_ppc_page_url': '',
         'post_id': '65808',
         'post_label': 'Dulwich and West Norwood',
         'proxy_image_url_template': '',
         'theyworkforyou_url': '',
         'twitter_username': '',
         'twitter_user_id': '',
         'wikipedia_url': '',
     }
     self.assertEqual(row_dicts[0], expected_row)
    def handle(self, *args, **options):
        """Import nomination-paper documents listed in a remote CSV.

        The single positional argument is the CSV URL. For each row the
        post is looked up by label (falling back to an area of the same
        name), the linked document is downloaded, and an
        ``OfficialDocument`` of type ``NOMINATION_PAPER`` is created for it.

        Options read:
            election: optional election slug; when given it is used for
                every row instead of the row's 'Election' column.
            delete_existing: when truthy, existing nomination papers for
                the post are deleted instead of the row being skipped.

        Raises:
            CommandError: if the election slug is unknown, a row has no
                'Election' column and no slug was supplied, or the post is
                not part of the election.
        """
        (csv_url,) = args

        forced_slug = options['election']
        forced_election = None
        if forced_slug:
            try:
                forced_election = Election.objects.get(slug=forced_slug)
            except Election.DoesNotExist:
                raise CommandError(
                    'No election with slug {0} found'.format(forced_slug))

        # Elections already looked up by name, so each is fetched only once.
        elections_by_name = {}

        mime_detector = magic.Magic(mime=True)
        storage = FileSystemStorage()

        response = requests.get(csv_url)
        response.encoding = 'utf-8'
        for row in BufferDictReader(response.text):
            name_column = get_column_header(
                POST_OR_AREA_COLUMN_HEADERS_TO_TRY, row)

            name = row[name_column]
            if not name:
                continue
            name = name.strip()

            # Without an explicit slug, the election comes from the
            # 'Election' column, which holds the election name.
            if forced_slug:
                election = forced_election
            elif 'Election' not in row:
                raise CommandError(
                    "There is no election name in the 'Election' column, so you must supply an election slug with --election"
                )
            else:
                election_name = row['Election']
                election = elections_by_name.get(election_name)
                if election is None:
                    election = Election.objects.get(name=election_name)
                    elections_by_name[election_name] = election

            try:
                post = Post.objects.get(label=name, extra__elections=election)
            except Post.DoesNotExist:
                print(
                    "Failed to find the post {0}, guessing it might be the area name instead".format(
                        name))
                # No post with that label: treat the name as an area name
                # and look for the post attached to that area.
                try:
                    area = Area.objects.get(name=name)
                except Area.DoesNotExist:
                    print("Failed to find area for {0}".format(name))
                    continue

                try:
                    post = Post.objects.get(area=area)
                except Post.DoesNotExist:
                    print("Failed to find post with for {0}".format(name))
                    continue

            # A post found through its area may not belong to this election.
            if election not in post.extra.elections.all():
                raise CommandError(
                    "The post {post} wasn't in the election {election}".format(
                        post=post.label, election=election.name))

            url_column = get_column_header(PDF_COLUMN_HEADERS_TO_TRY, row)
            document_url = row[url_column]
            if not document_url:
                print("No URL for {0}".format(name))
                continue

            existing_documents = OfficialDocument.objects.filter(
                document_type=OfficialDocument.NOMINATION_PAPER,
                post_id=post,
            )
            if existing_documents.count() > 0:
                if not options['delete_existing']:
                    print("Skipping {0} since it already had documents".format(
                        name))
                    continue
                print("Removing existing documents")
                existing_documents.delete()

            try:
                downloaded_filename = download_file_cached(document_url)
            except requests.exceptions.ConnectionError:
                print("Connection failed for {0}".format(name))
                print("The URL was:", document_url)
                continue
            except requests.exceptions.MissingSchema:
                # Probably notes typed into the URL column rather than a
                # real URL, so skip the row.
                print("Probably not a document URL for {0}: {1}".format(
                    name, document_url))
                continue

            mime_type = mime_detector.from_file(downloaded_filename)
            extension = mimetypes.guess_extension(mime_type)
            if mime_type not in allowed_mime_types:
                print("Ignoring unknown MIME type {0} for {1}".format(
                    mime_type, name))
                continue

            filename = "official_documents/{post_id}/statement-of-persons-nominated{extension}".format(
                post_id=post.extra.slug,
                extension=extension,
            )
            with open(downloaded_filename, 'rb') as downloaded:
                storage_filename = storage.save(filename, downloaded)

            OfficialDocument.objects.create(
                document_type=OfficialDocument.NOMINATION_PAPER,
                uploaded_file=storage_filename,
                election=election,
                post=post,
                source_url=document_url)
            print(
                "Successfully added the Statement of Persons Nominated for {0}".format(
                    name))
    def handle(self, *args, **options):
        """Import nomination-paper documents listed in a remote CSV.

        Each row is matched to a ``PostExtraElection`` through its
        ``ballot_paper_id`` column, and the document in the "Link to PDF"
        column is downloaded. If the download is not an allowed MIME type,
        the page is searched for a single ``.pdf`` link to use instead. The
        file is then stored, an ``OfficialDocument`` of type
        ``NOMINATION_PAPER`` is created, and table extraction is queued
        for the ballot.

        Options read:
            url: URL of the CSV to import.
            delete_existing: when truthy, existing nomination papers for
                the ballot are deleted; otherwise the row is skipped.

        Raises:
            ValueError: if no file extension can be guessed for an
                accepted document.
        """

        csv_url = options["url"]

        mime_type_magic = magic.Magic(mime=True)
        storage = DefaultStorage()

        def detect_mime_type(path):
            # The original decoded the detector's result on the first
            # download but not in the recovery path. Normalise to text in
            # one place so both are compared with allowed_mime_types the
            # same way, whether the detector returns bytes or text.
            detected = mime_type_magic.from_file(path)
            if isinstance(detected, bytes):
                detected = detected.decode("utf8")
            return detected

        r = requests.get(csv_url)
        r.encoding = "utf-8"
        reader = BufferDictReader(r.text)
        for row in reader:
            pee = PostExtraElection.objects.get(
                ballot_paper_id=row["ballot_paper_id"])
            document_url = row["Link to PDF"]

            if not document_url:
                # Rows without a link are skipped silently.
                continue
            existing_documents = OfficialDocument.objects.filter(
                document_type=OfficialDocument.NOMINATION_PAPER,
                post_election=pee,
            )
            if existing_documents.exists():
                if options["delete_existing"]:
                    print("Removing existing documents")
                    existing_documents.delete()
                else:
                    # Already imported; skipped silently.
                    continue
            try:
                downloaded_filename = download_file_cached(document_url)
            except requests.exceptions.ConnectionError:
                print("Connection failed for {}".format(
                    row["ballot_paper_id"]))
                print("The URL was:", document_url)
                continue
            except requests.exceptions.MissingSchema:
                # This is probably someone putting notes in the URL
                # column, so ignore:
                print("Probably not a document URL for {}: {}".format(
                    row["ballot_paper_id"], document_url))
                continue

            mime_type = detect_mime_type(downloaded_filename)
            extension = mimetypes.guess_extension(mime_type)

            if mime_type not in allowed_mime_types:
                recovered = False
                # Attempt to get a PDF link from the URL
                ignore_urls = ["drive.google.com"]
                if not any(x in document_url for x in ignore_urls):
                    try:
                        # NOTE(review): verify=False turns off TLS
                        # certificate verification for this request.
                        req = requests.get(document_url,
                                           headers=headers,
                                           verify=False)
                        if req.status_code == 200:
                            re_sre = r'(http[^"\']+\.pdf)'
                            # Search the decoded text: a str pattern
                            # cannot be applied to the bytes in
                            # ``req.content`` on Python 3.
                            matches = re.findall(re_sre, req.text)
                            if len(matches) == 1:
                                document_url = matches[0]

                            downloaded_filename = download_file_cached(
                                document_url)
                            mime_type = detect_mime_type(downloaded_filename)

                            extension = mimetypes.guess_extension(mime_type)
                            if mime_type not in allowed_mime_types:
                                raise ValueError(
                                    "Recovery failed to get a PDF for {}".
                                    format(pee.ballot_paper_id))
                            recovered = True
                    except Exception as e:
                        # Recovery is best effort: report the failure and
                        # fall through to skip this row.
                        print(e)

                else:
                    print("Ignoring unknown MIME type {} for {}".format(
                        mime_type, pee.ballot_paper_id))
                if not recovered:
                    continue

            # Refuse to store a file whose name would end in "None".
            if not extension:
                raise ValueError("unknown extension")

            filename = "official_documents/{ballot_paper_id}/statement-of-persons-nominated{extension}".format(
                ballot_paper_id=pee.ballot_paper_id, extension=extension)

            with open(downloaded_filename, "rb") as f:
                storage_filename = storage.save(filename, f)

            OfficialDocument.objects.create(
                document_type=OfficialDocument.NOMINATION_PAPER,
                uploaded_file=storage_filename,
                post_election=pee,
                source_url=document_url,
            )
            message = (
                "Successfully added the Statement of Persons Nominated for {0}"
            )
            print(message.format(pee.ballot_paper_id))
            extract_and_parse_tables_for_ballot.delay(pee.ballot_paper_id)