示例#1
0
    def download_and_analyze_file(self, file_id: int,
                                  address_pipeline: AddressPipeline,
                                  fallback_city: str) -> bool:
        """
        Fetch one file, upload it to minio and analyse its text.

        The file row is looked up by ``file_id`` and downloaded from its oparl url
        through ``self.loader``. The payload is stored in the minio file bucket under
        the file id. Text is only extracted when ``self.download_files`` is set and
        the row does not already carry text. Any text is then searched for locations
        and persons before the row is saved.

        Returns False if the download raised an HTTPError and True otherwise
        (also when no text could be obtained).
        """
        file = File.objects.get(id=file_id)
        url = file.get_oparl_url()

        with NamedTemporaryFile() as buffer:
            try:
                payload, served_mime = self.loader.load_file(url)
                if served_mime:
                    known_mime = file.mime_type
                    if known_mime and served_mime != known_mime:
                        logger.warning(
                            f"Diverging mime types: Expected {known_mime}, got {served_mime}"
                        )
                    # What the server actually sent wins over the announced type
                    file.mime_type = served_mime
                buffer.write(payload)
                # Rewind so the upload and the text extraction read from the start
                buffer.file.seek(0)
                file.filesize = len(payload)
            except HTTPError:
                logger.exception(f"File {file.id}: Failed to download {url}")
                return False

            logger.debug(
                f"File {file.id}: Downloaded {url} "
                f"({file.mime_type}, {filesizeformat(file.filesize)})"
            )

            storage = minio_client()
            storage.put_object(
                minio_file_bucket,
                str(file.id),
                buffer.file,
                file.filesize,
                content_type=file.mime_type,
            )

            # Text delivered by the api takes precedence over our own extraction
            if self.download_files and not file.parsed_text:
                extracted = extract_from_file(
                    buffer.file, buffer.name, file.mime_type, file.id)
                file.parsed_text, file.page_count = extracted

        if not file.parsed_text:
            logger.warning(f"File {file.id}: Couldn't get any text")
        else:
            locations = extract_locations(file.parsed_text,
                                          pipeline=address_pipeline,
                                          fallback_city=fallback_city)
            file.locations.set(locations)
            searchable = file.name + "\n" + (file.parsed_text or "") + "\n"
            persons = extract_persons(searchable)
            file.mentioned_persons.set(persons)
            logger.debug(
                f"File {file.id}: Found {len(locations)} locations and {len(persons)} persons"
            )

        db.connections.close_all()
        file.save()

        return True
    def remove_pgp_key(self):
        """
        Delete the stored PGP key from minio and clear the fingerprint.

        Does nothing when no fingerprint is set, so a repeated "remove" click
        on an already removed key is silently ignored.
        """
        fingerprint = self.pgp_key_fingerprint
        if fingerprint:
            minio_client().remove_object(minio_pgp_keys_bucket, fingerprint)

            self.pgp_key_fingerprint = None
            self.save()
    def add_pgp_key(self, pgp_key_fingerprint: str, pgp_key: str):
        """
        Upload ``pgp_key`` to the minio PGP-keys bucket and remember its fingerprint.

        The key is stored under the object name ``pgp_key_fingerprint``; afterwards
        the fingerprint is written to this object and the object is saved.

        This should eventually be abstracted away into a file manager class.
        """
        encoded = pgp_key.encode()
        storage = minio_client()
        # put_object needs a stream plus its exact length in bytes
        storage.put_object(
            minio_pgp_keys_bucket,
            pgp_key_fingerprint,
            BytesIO(encoded),
            len(encoded),
        )

        self.pgp_key_fingerprint = pgp_key_fingerprint
        self.save()
def test_manual_deletion(pytestconfig):
    """A manually deleted file must be gone from minio and must not come back on re-import."""
    file_id = 1
    url = "https://example.org/file/1"

    def load_files():
        # A fresh single-threaded importer per run, as a real import would use
        importer = Importer(BaseLoader({}), force_singlethread=True)
        return importer.load_files(sample_city.name)

    def assert_gone_from_minio():
        with pytest.raises(MinioException):
            minio_client().get_object(minio_file_bucket, str(file_id))

    sample_file = File(
        name="Bad File",
        original_id=file_id,
        url=url,
        claimed_size=None,
        paper_original_id=sample_paper.original_id,
    )
    data = RisData(
        sample_city, None, [], [], [sample_paper], [sample_file], [], [], [], 2
    )
    body = Body(
        name=data.meta.name, short_name=data.meta.name, ags=data.meta.ags
    )
    body.save()
    import_data(body, data)

    fixture = Path(pytestconfig.rootdir).joinpath("testdata/media/file.txt")
    with responses.RequestsMock() as requests_mock:
        requests_mock.add(
            responses.GET,
            url,
            body=fixture.read_bytes(),
            status=200,
            content_type="text/plain",
        )
        successful, failed = load_files()
        assert successful == 1 and failed == 0

    # Precondition: the file exists both in minio and in the database
    assert minio_client().get_object(minio_file_bucket, str(file_id))
    assert models.File.objects.filter(pk=file_id).first()

    # The operation under test
    models.File.objects.get(pk=file_id).manually_delete()

    assert_gone_from_minio()

    # Importing the same data again must respect the manual deletion
    import_data(body, data)

    assert not models.File.objects.filter(pk=file_id).first()
    # No response is registered here, so any download attempt would fail
    with responses.RequestsMock():
        successful, failed = load_files()
        assert successful == 0 and failed == 0

    assert_gone_from_minio()
def test_manual_deletion(pytestconfig, caplog):
    """A manually deleted file must be gone from minio and must not come back on re-import."""
    file_id = 1
    url = "https://example.org/file/1"

    def load_files():
        # A fresh single-threaded importer per run, as a real import would use
        importer = Importer(BaseLoader({}), force_singlethread=True)
        return importer.load_files(sample_city.name)

    def assert_gone_from_minio():
        with pytest.raises(MinioException):
            minio_client().get_object(minio_file_bucket, str(file_id))

    body, data = make_sample_file(file_id, url)

    fixture = pytestconfig.rootpath.joinpath("testdata/media/file.txt")
    with responses.RequestsMock() as requests_mock:
        requests_mock.add(
            responses.GET,
            url,
            body=fixture.read_bytes(),
            status=200,
            content_type="text/plain",
        )
        successful, failed = load_files()
        assert successful == 1 and failed == 0

    # Precondition: the file exists both in minio and in the database
    assert minio_client().get_object(minio_file_bucket, str(file_id))
    assert models.File.objects.filter(pk=file_id).first()

    # The operation under test
    models.File.objects.get(pk=file_id).manually_delete()

    assert_gone_from_minio()

    # Importing the same data again must respect the manual deletion
    import_data(body, data)

    assert not models.File.objects.filter(pk=file_id).first()
    # No response is registered here, so any download attempt would fail
    with responses.RequestsMock():
        successful, failed = load_files()
        assert successful == 0 and failed == 0

    assert_gone_from_minio()

    # Only the first import logs anything; the second run must stay silent
    expected_messages = [
        "File 1 has an unknown mime type: 'text/plain'",
        "File 1: Couldn't get any text",
    ]
    assert caplog.messages == expected_messages
    def get_pgp_key(self) -> Optional[bytes]:
        """
        Return the stored PGP key, or None when no fingerprint is set.

        The key is read from the minio PGP-keys bucket under the object name
        ``self.pgp_key_fingerprint``. Only the key bytes are returned; the
        fingerprint stays available as ``self.pgp_key_fingerprint``.
        """
        if not self.pgp_key_fingerprint:
            return None

        response = minio_client().get_object(
            minio_pgp_keys_bucket, self.pgp_key_fingerprint
        )
        try:
            return response.read()
        finally:
            # get_object hands out a live HTTP response: it must be closed and
            # its connection released, or the connection leaks from the pool
            response.close()
            response.release_conn()
def file_serve(request, id):
    """
    Serve a file from the minio file bucket through django.

    The object named ``id`` is read completely into memory and returned with
    the content type minio reports for it. When ``settings.SITE_SEO_NOINDEX``
    is set, the response additionally carries ``X-Robots-Tag: noindex``.
    """
    logger.warning("Serving media files through django is slow")
    minio_file = minio_client().get_object(minio_file_bucket, id)
    try:
        response = HttpResponse(minio_file.read())
        response["Content-Type"] = minio_file.headers["Content-Type"]
    finally:
        # get_object hands out a live HTTP response: it must be closed and
        # its connection released, or the connection leaks from the pool
        minio_file.close()
        minio_file.release_conn()

    if settings.SITE_SEO_NOINDEX:
        response["X-Robots-Tag"] = "noindex"

    return response
    def handle(self, *args, **options):
        """
        Write database and storage statistics to stdout.

        Prints the row count of every model class found in ``models``, file
        analysis coverage, body and alert figures, and finally the number of
        files that have a filesize but no matching object in minio.
        """
        # One line per model class exported by the models module
        for name, obj in inspect.getmembers(models):
            is_model = inspect.isclass(obj) and issubclass(obj, Model)
            if is_model and name not in ["DefaultFields"]:
                self.stdout.write(f"{name}: {obj.objects.count()}")

        files = File.objects
        total = files.count()
        with_text = files.filter(parsed_text__isnull=False).count()
        located = files.annotate(location_count=Count("locations"))
        with_locations = located.filter(location_count__gte=1).count()
        mentioning = files.annotate(persons_count=Count("mentioned_persons"))
        with_persons = mentioning.filter(persons_count__gte=1).count()
        not_downloaded = files.filter(
            filesize__isnull=True, oparl_access_url__isnull=False
        ).count()
        without_url = files.filter(oparl_access_url__isnull=True).count()
        self.stdout.write(
            f"Files total: {total}; with text: {with_text}; "
            f"with locations: {with_locations}; with persons: {with_persons}; "
            f"not downloaded: {not_downloaded}; without url: {without_url}"
        )

        with_outline = Body.objects.filter(outline__isnull=False).count()
        with_ags = Body.objects.filter(ags__isnull=False).count()
        self.stdout.write(
            f"Bodies with an outline: {with_outline}; with an ags: {with_ags}"
        )

        alerting_users = UserAlert.objects.values("user").distinct().count()
        user_count = User.objects.count()
        alert_count = UserAlert.objects.count()
        self.stdout.write(
            f"There are {alert_count} alerts by {alerting_users} of {user_count} users"
        )

        # Find files that count as imported but have no object in minio.
        # Ids are compared as strings since minio may hold non-numeric names.
        in_minio = {
            entry.object_name
            for entry in minio_client().list_objects(minio_file_bucket)
        }
        in_database = {
            str(pk)
            for pk in File.objects.filter(filesize__gt=0).values_list(
                "id", flat=True)
        }
        missing_count = len(in_database - in_minio)
        if missing_count > 0:
            self.stdout.write(
                f"{missing_count} files are marked as imported but aren't available in minio"
            )
def file_serve(request, id):
    """
    Serve a file from the minio file bucket through django.

    Responds with 404 when no ``File`` row with this id can be fetched.
    Otherwise the object named ``id`` is read completely into memory and
    returned with the content type minio reports for it. When
    ``settings.SITE_SEO_NOINDEX`` is set, the response additionally carries
    ``X-Robots-Tag: noindex``.
    """
    logger.warning("Serving media files through django is slow")
    # Ensure that the file is not deleted in the database
    get_object_or_404(File, id=id)

    minio_file = minio_client().get_object(minio_file_bucket, id)
    try:
        response = HttpResponse(minio_file.read())
        response["Content-Type"] = minio_file.headers["Content-Type"]
    finally:
        # get_object hands out a live HTTP response: it must be closed and
        # its connection released, or the connection leaks from the pool
        minio_file.close()
        minio_file.release_conn()

    if settings.SITE_SEO_NOINDEX:
        response["X-Robots-Tag"] = "noindex"

    return response
示例#10
0
 def handle(self, *args, **options):
     """
     Reset files that are marked as downloaded but are missing in minio.

     Every file with a positive filesize is expected to have an object named
     after its id in the minio file bucket. For the files where that object
     is missing, the count is written to stdout and ``filesize`` is set back
     to None.
     """
     existing_files: Set[int] = set()
     for stored in minio_client().list_objects(minio_file_bucket):
         try:
             existing_files.add(int(stored.object_name))
         except ValueError:
             # An object that isn't named after a numeric id can't belong to
             # any File row, so it is irrelevant for this check
             continue
     expected_files: Set[int] = set(
         File.objects.filter(filesize__gt=0).values_list("id", flat=True)
     )
     missing_files = expected_files - existing_files
     if missing_files:
         self.stdout.write(
             f"{len(missing_files)} files are marked as imported but aren't available in minio"
         )
         File.objects.filter(id__in=missing_files).update(filesize=None)
示例#11
0
 def parse_file(self, file: File, fallback_city: str):
     """
     Run OCR on a stored file and save the recognized text and its entities.

     The file content is read from the minio file bucket and passed to
     ``get_ocr_text_from_pdf``. When text is recognized, the cleaned text is
     stored as ``parsed_text``, persons and locations found in it are attached
     to the file, and the file is saved. Without recognized text only a
     warning is logged and the file is left untouched.
     """
     logging.info("- Parsing: " + str(file.id) + " (" + file.name + ")")
     with minio_client().get_object(minio_file_bucket,
                                    str(file.id)) as file_handle:
         recognized_text = get_ocr_text_from_pdf(file_handle.read())

     # Truthiness also covers None, on which len() would raise
     if not recognized_text:
         logging.warning("Nothing recognized")
         return

     file.parsed_text = cleanup_extracted_text(recognized_text)
     persons = extract_persons(file.name + "\n" + recognized_text + "\n")
     # Many-to-many relations have to be replaced through set(); direct
     # assignment is prohibited by django
     file.mentioned_persons.set(persons)
     # Passed by keyword so it cannot be bound to another positional parameter
     file.locations.set(
         extract_locations(file.parsed_text, fallback_city=fallback_city))
     file.save()
 def manually_delete(self):
     """
     Flag this file as deleted by hand and remove its object from minio.

     Needed for files that have to disappear although the source still
     lists them. The flags are saved before the minio object is removed.
     """
     self.deleted = self.manually_deleted = True
     self.save()

     object_name = str(self.id)
     minio_client().remove_object(minio_file_bucket, object_name)
示例#13
0
    def download_and_analyze_file(self, file_id: int,
                                  address_pipeline: AddressPipeline,
                                  fallback_city: str) -> bool:
        """
        Downloads and analyses a single file, i.e. extracting text, locations and persons.

        The file row is looked up by ``file_id`` and downloaded from its oparl url
        through ``self.loader``. Unless ``settings.PROXY_ONLY_TEMPLATE`` is set, the
        payload is uploaded to the minio file bucket under the file id. Text is only
        extracted when ``self.download_files`` is set and the row has no text yet.

        Returns False when the download fails with a RequestException, when the
        server answers with ``text/html`` (treated as a silent error) or when saving
        the file fails. Returns True otherwise, also when no text could be obtained.
        """
        file = File.objects.get(id=file_id)
        url = file.get_oparl_url()

        with NamedTemporaryFile() as tmp_file:
            try:
                content, content_type = self.loader.load_file(url)
                if content_type and file.mime_type and content_type != file.mime_type:
                    logger.warning(
                        "Diverging mime types: Expected {}, got {}".format(
                            file.mime_type, content_type))
                # Drop parameters such as "; charset=utf-8" before comparing
                if content_type and content_type.split(";")[0] == "text/html":
                    logger.error(
                        f"File {file.id}: Content type was {content_type}, this seems to be a silent error"
                    )
                    return False
                file.mime_type = content_type or file.mime_type
                tmp_file.write(content)
                # Rewind so the upload and the text extraction read from the start
                tmp_file.file.seek(0)
                file.filesize = len(content)
            except RequestException as e:
                # Normal server error.
                # The response must be compared against None: a requests Response
                # is falsy for every 4xx/5xx status, so a plain truth test would
                # never reach this branch
                if e.response is not None and 400 <= e.response.status_code < 600:
                    logger.error(
                        f"File {file.id}: Failed to download {url} with error {e.response.status_code}"
                    )
                else:
                    logger.exception(
                        f"File {file.id}: Failed to download {url}")
                return False

            logger.debug("File {}: Downloaded {} ({}, {})".format(
                file.id, url, file.mime_type, filesizeformat(file.filesize)))

            if not settings.PROXY_ONLY_TEMPLATE:
                minio_client().put_object(
                    minio_file_bucket,
                    str(file.id),
                    tmp_file.file,
                    file.filesize,
                    content_type=file.mime_type,
                )

            # If the api has text, keep that
            if self.download_files and not file.parsed_text:
                file.parsed_text, file.page_count = extract_from_file(
                    tmp_file.file, tmp_file.name, file.mime_type, file.id)

        if file.parsed_text:
            locations = extract_locations(file.parsed_text,
                                          pipeline=address_pipeline,
                                          fallback_city=fallback_city)
            file.locations.set(locations)
            persons = extract_persons(file.name + "\n" +
                                      (file.parsed_text or "") + "\n")
            file.mentioned_persons.set(persons)
            logger.debug("File {}: Found {} locations and {} persons".format(
                file.id, len(locations), len(persons)))
        else:
            logger.warning(f"File {file.id}: Couldn't get any text")

        try:
            db.connections.close_all()
            file.save()
        except (ElasticsearchException, DatabaseError) as e:
            logger.exception(f"File {file.id}: Failed to save: {e}")
            return False

        return True