def test_is_file(self):
    """Test is_file with an empty file, a not existing file and an existing file."""
    empty_file = 'new_empty_file.txt'
    # Create a zero-byte file for the "empty file" case.
    with open(empty_file, 'a'):
        pass
    try:
        # Check the path that was actually created. Without the '.txt' suffix the call would
        # only repeat the "not existing file" case below.
        self.assertEqual(is_file(empty_file, False), False)
    finally:
        # Remove the temporary file even if the assertion above fails.
        os.remove(empty_file)
    self.assertEqual(is_file('not_existing_file.txt', False), False)
    self.assertEqual(is_file(TEST_PDF_FILE, False), True)
예제 #2
0
    def get_templates(self, update=False):
        """
        Get ID and name of any Template in the project.

        When no Templates are loaded yet (or ``update`` is set), the Template data is read from
        ``templates.json5`` in ``self.data_root``. If that file is missing (or ``update`` is set) the data is
        requested via ``get_project_templates`` and, when not empty, written to that file.

        :param update: Update the downloaded information even it is already available
        :return: Templates in the project.
        """
        if not self.templates or update:
            self.templates_file_path = os.path.join(self.data_root, 'templates.json5')
            if not is_file(self.templates_file_path, raise_exception=False) or update:
                # Normalize an empty / missing response to a list so the loop below never iterates None.
                templates_data = get_project_templates(session=self.session) or []
                if templates_data:
                    # Only non-empty responses are cached on disk.
                    with open(self.templates_file_path, 'w') as f:
                        json.dump(templates_data, f, indent=2, sort_keys=True)
            else:
                with open(self.templates_file_path, 'r') as f:
                    templates_data = json.load(f)

            for template_data in templates_data:
                # The instance is not stored here; presumably the Template class registers itself
                # with the project - TODO confirm.
                self.template_class(project=self, **template_data)

        # Make default_template a Template instance: integer references are resolved by ID.
        for template in self.templates:
            if isinstance(template.default_template, int):
                template.default_template = self.get_template_by_id(template.default_template)

        return self.templates
 def test_project(self):
     """Check basic properties of the project and of a second, newly created Project object."""
     project = self.prj
     assert is_file(project.meta_file_path)
     assert project.documents[1].id > project.documents[0].id
     assert len(project.documents)
     assert len(project.documents) == self.document_count
     # a newly initialized project object is expected to work on the same data
     fresh_project = Project()
     assert len(fresh_project.documents) == self.correct_document_count
     assert fresh_project.meta_file_path == project.meta_file_path
예제 #4
0
    def clean_meta(self):
        """
        Clean the meta-information about the Project, Labels, and Templates.

        Deletes the meta, labels and templates files whose path is set and resets the related attributes.
        A path that is unset, or whose file is already gone, is skipped, so the method can be called
        repeatedly.
        """

        def _remove(file_path):
            """Delete ``file_path`` if a path is set and check that the file is gone afterwards."""
            if not file_path:
                # No path known: nothing to delete, and is_file is not handed an empty path.
                return
            try:
                os.remove(file_path)
            except FileNotFoundError:
                # The file was removed before; the desired state is already reached.
                pass
            assert not is_file(file_path, raise_exception=False)

        _remove(self.meta_file_path)
        self.meta_data = None
        self.meta_file_path = None

        _remove(self.labels_file_path)
        self.labels_file_path = None
        self.labels: List[Label] = []

        _remove(self.templates_file_path)
        self.templates_file_path = None
        self.templates: List[Template] = []
예제 #5
0
def upload_file_konfuzio_api(filepath: str, project_id: int, session=None, dataset_status: int = 0):
    """
    Upload file to Konfuzio API.

    :param filepath: Path to file to be uploaded
    :param project_id: Project ID where to upload the document
    :param session: Session to connect to the server; created via konfuzio_session() when not provided
    :param dataset_status: Value sent as "dataset_status" together with the file
    :return: Response of the upload request.
    """
    if session is None:
        # Create the session per call instead of once at import time as a shared default argument.
        session = konfuzio_session()

    url = get_upload_document_url()
    # The return value is not used; presumably is_file raises for a missing file by default - TODO confirm.
    is_file(filepath)

    with open(filepath, "rb") as f:
        file_data = f.read()

    files = {"data_file": (os.path.basename(filepath), file_data, "multipart/form-data")}
    data = {"project": project_id, "dataset_status": dataset_status}

    return session.post(url=url, files=files, data=data)
예제 #6
0
    def get_images(self, update: bool = False):
        """
        Collect the png image path of every page in ``self.image_paths``.

        If ``page['image']`` already points to an available file, that path is used. Otherwise the image
        path is ``page_<number>.png`` in ``self.root``; the image is fetched from the host and written
        there when the file is missing or ``update`` is set.

        :param update: Update the downloaded images even they are already available
        """
        session = konfuzio_session()

        collected_paths = []
        self.image_paths = collected_paths
        for page in self.pages:
            image_reference = page['image']

            # the page already refers to an available file
            if is_file(image_reference, raise_exception=False):
                collected_paths.append(image_reference)
                continue

            local_path = os.path.join(self.root, f'page_{page["number"]}.png')
            collected_paths.append(local_path)

            if is_file(local_path, raise_exception=False) and not update:
                continue

            # here the image reference is used as a path on the host
            response = retry_get(session, f'{KONFUZIO_HOST}{page["image"]}')
            with open(local_path, 'wb') as image_file:
                image_file.write(response.content)
예제 #7
0
    def get_file(self, update: bool = False):
        """
        Get OCR version of the original file.

        The file is stored as ``ocr.pdf`` in ``self.root``. It is downloaded when the document has no
        errors, has at least one page, and the file is either missing or ``update`` is set.

        :param update: Update the downloaded file even if it is already available
        :return: Path to OCR file.
        """
        wants_file = self.is_without_errors and (not self.ocr_file_path or update)
        # One pass is enough: repeating this per page would download the same file again for
        # every page when update is set. Documents without pages are skipped.
        if wants_file and self.number_of_pages > 0:
            self.ocr_file_path = os.path.join(self.root, 'ocr.pdf')
            if not is_file(self.ocr_file_path, raise_exception=False) or update:
                pdf_content = download_file_konfuzio_api(self.id, session=self.session)
                with open(self.ocr_file_path, 'wb') as f:
                    f.write(pdf_content)

        return self.ocr_file_path
예제 #8
0
    def get_meta(self, update=False):
        """
        Get the list of all documents in the project and their information.

        The information is read from ``meta.json5`` in ``self.data_root`` when that file is available.
        If it is missing, or ``update`` is set, the information is requested via ``get_meta_of_files``
        and written to that file.

        :param update: Update the downloaded information even it is already available
        :return: Information of the documents in the project.
        """
        if not self.meta_data or update:
            self.meta_file_path = os.path.join(self.data_root, 'meta.json5')

            # "or update" forces a fresh download even if a cached file exists.
            if not is_file(self.meta_file_path, raise_exception=False) or update:
                self.meta_data = get_meta_of_files(self.session)
                with open(self.meta_file_path, 'w') as f:
                    json.dump(self.meta_data, f, indent=2, sort_keys=True)
            else:
                with open(self.meta_file_path, 'r') as f:
                    self.meta_data = json.load(f)

        return self.meta_data
예제 #9
0
    def get_labels(self, update=False):
        """
        Return the labels of the project, loading them first if necessary.

        Already loaded labels are returned directly unless ``update`` is set. Otherwise the label data is
        read from ``labels.json5`` in ``self.data_root``, or requested via ``get_project_labels`` and
        written to that file when it is missing or ``update`` is set.

        :param update: Update the downloaded information even it is already available
        :return: Labels in the project.
        """
        if self.labels and not update:
            return self.labels

        labels_path = os.path.join(self.data_root, 'labels.json5')
        self.labels_file_path = labels_path

        if not is_file(labels_path, raise_exception=False) or update:
            labels_data = get_project_labels(session=self.session)
            with open(labels_path, 'w') as out_file:
                json.dump(labels_data, out_file, indent=2, sort_keys=True)
        else:
            with open(labels_path, 'r') as in_file:
                labels_data = json.load(in_file)

        for label_data in labels_data:
            # The project is handed over explicitly, so a 'project' entry in the data is dropped.
            label_data.pop('project', None)
            self.label_class(project=self, **label_data)

        return self.labels
 def test_update_prj(self):
     """Test number of documents and presence of the meta file after updating a project."""
     assert len(self.prj.documents) == self.document_count
     self.prj.update()
     assert len(self.prj.documents) == self.correct_document_count
     # Assert the result instead of discarding it, so a False return value fails the test as well.
     assert is_file(self.prj.meta_file_path)
예제 #11
0
    def get_document_details(self, update):
        """
        Load the details of the document from local files or request them.

        If ``update`` is set, or one of the annotation, section, text, bbox or pages files is not
        available, the details are requested via ``get_document_details`` and written to files in
        ``self.root``. Otherwise the details are read from those files. If a project is set, the raw
        annotations are added to the document afterwards.

        :param update: Update the downloaded information even it is already available
        :return: The document itself.
        """
        root = self.root
        self.annotation_file_path = os.path.join(root, 'annotations.json5')
        self.section_file_path = os.path.join(root, 'sections.json5')
        self.txt_file_path = os.path.join(root, 'document.txt')
        self.hocr_file_path = os.path.join(root, 'document.hocr')
        self.bbox_file_path = os.path.join(root, 'bbox.json5')

        # NOTE(review): self.pages_file_path is not assigned in this method; it has to be set elsewhere.
        # No file is checked when update is set; the hocr file is not part of this check.
        offline_copy_available = (
            not update
            and is_file(self.annotation_file_path, raise_exception=False)
            and is_file(self.section_file_path, raise_exception=False)
            and is_file(self.txt_file_path, raise_exception=False)
            and is_file(self.bbox_file_path, raise_exception=False)
            and is_file(self.pages_file_path, raise_exception=False)
        )

        if offline_copy_available:
            with open(self.txt_file_path, 'r', encoding="utf-8") as txt_file:
                self.text = txt_file.read()

            with open(self.annotation_file_path, 'rb') as annotation_file:
                raw_annotations = json.load(annotation_file)

            with open(self.section_file_path, 'rb') as section_file:
                self._sections = json.load(section_file)

            with open(self.pages_file_path, 'rb') as pages_file:
                self.pages = json.load(pages_file)

            # hocr might not be available (depends on the project settings)
            if is_file(self.hocr_file_path, raise_exception=False):
                with open(self.hocr_file_path, 'r', encoding="utf-8") as hocr_file:
                    self.hocr = hocr_file.read()
        else:
            data = get_document_details(document_id=self.id, session=self.session)
            raw_annotations = data['annotations']
            self.number_of_pages = data['number_of_pages']
            self.text = data['text']
            self.hocr = data['hocr'] or ''
            self.pages = data['pages']
            self._sections = data['sections']

            # the annotation file is written even without annotations to support offline work
            with open(self.annotation_file_path, 'w') as annotation_file:
                json.dump(raw_annotations, annotation_file, indent=2, sort_keys=True)

            with open(self.section_file_path, 'w') as section_file:
                json.dump(data['sections'], section_file, indent=2, sort_keys=True)

            with open(self.txt_file_path, 'w', encoding="utf-8") as txt_file:
                txt_file.write(data['text'])

            with open(self.bbox_file_path, 'w') as bbox_file:
                json.dump(data['bbox'], bbox_file, indent=2, sort_keys=True)

            with open(self.pages_file_path, 'w') as pages_file:
                json.dump(data['pages'], pages_file, indent=2, sort_keys=True)

            # an hocr file is only written if hocr content was delivered
            if self.hocr != '':
                with open(self.hocr_file_path, 'w', encoding="utf-8") as hocr_file:
                    hocr_file.write(data['hocr'])

        # Annotations are only added if the document belongs to a project.
        if not (hasattr(self, 'project') and self.project):
            return self

        for raw_annotation in raw_annotations:
            if raw_annotation['custom_offset_string']:
                # A custom string is only accepted if it equals the text at its offsets, ignoring spaces.
                text_span = self.text[raw_annotation['start_offset']:raw_annotation['end_offset']]
                if text_span.replace(' ', '') != raw_annotation['offset_string'].replace(' ', ''):
                    logger.warning(
                        f'Annotation {raw_annotation["id"]} is a custom string and, therefore, it will not be used '
                        f'in training {KONFUZIO_HOST}/a/{raw_annotation["id"]}.'
                    )
                    continue

            annotation = self.annotation_class(document=self, **raw_annotation)
            self.add_annotation(annotation)

        return self