def export_document_files(self, ids: List[int], target_path: str):
    storage = get_file_storage()
    file_paths = self.document_repository.get_document_source_paths_by_id(ids)
    for doc_id, file_path in file_paths:
        if not file_path:
            self.log_message(
                f"Document #{doc_id} doesn't have a link to the original file")
            continue
        doc_file_name = os.path.basename(file_path)
        new_name = f'{doc_id}_{doc_file_name}'
        target_filepath = os.path.join(target_path, new_name)
        try:
            doc_stor_path = storage.sub_path_join(storage.documents_path,
                                                  file_path)
            file_obj = storage.read(doc_stor_path)
            if not file_obj:
                self.log_message(
                    f'Document file "{file_path}" to export was not found')
                continue
            with open(target_filepath, 'wb') as fw:
                fw.write(file_obj)
        except Exception as e:
            self.log_message(f'Error storing "{doc_file_name}": {e}')

def move_classifier(apps, schema_editor):
    # move "models/en/contract_class" folder's content
    # to models/en/contract_type_classifier/document
    src = 'models/en/contract_class'
    dest = 'models/en/contract_type_classifier/document'
    file_storage = get_file_storage()
    file_storage.ensure_folder_exists(dest)
    files_moved = 0
    for file in file_storage.list(src):
        file_name_only = os.path.basename(file)
        dest_path = os.path.join(dest, file_name_only)
        file_storage.rename_file(file, dest_path, move_file=True)
        files_moved += 1
    print(f'{files_moved} files moved to "{dest}"')

    # create MLModel record
    with connection.cursor() as cursor:
        cursor.execute(
            f"""
            INSERT INTO analyze_mlmodel (name, version, vector_name, 
                model_path, is_active, "default", apply_to, target_entity, 
                language, project_id) 
            VALUES
                ('Document contract class classifier (en)', '', '',
                 %s, true, true, 'document', 'contract_type_classifier',
                 'en', null) ON CONFLICT DO NOTHING;""", [dest])
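A minimal sketch of how a data-migration function like move_classifier is typically registered, assuming a standard Django migration module (the app label and dependency tuple below are hypothetical placeholders):

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ('analyze', '0001_initial'),  # hypothetical predecessor migration
    ]

    operations = [
        # RunPython.noop keeps the migration reversible without trying
        # to move the classifier files back
        migrations.RunPython(move_classifier, migrations.RunPython.noop),
    ]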

def rename_old_document(self, doc_id) -> None:
    doc = Document.all_objects.get(pk=doc_id)  # type: Document
    new_name, new_path = self.make_new_doc_name(doc)
    # rename file and document itself
    from apps.common.file_storage import get_file_storage
    stor = get_file_storage()
    try:
        stor.rename_document(doc.source_path, new_path)
        self.log_func(
            f'ForceUnique: "{doc.source_path}" is renamed to "{new_path}"')
    except Exception as ex:
        self.log_func(
            f'ForceUnique: error while renaming "{doc.source_path}" to "{new_path}":\n'
            + str(ex))
        # return  # zombie document detected
    try:
        doc.source_path = new_path
        doc.name = os.path.basename(doc.source_path)
        doc.save()
    except Exception as ex:
        msg = f'ForceUnique: error while saving renamed doc at {doc.source_path}:\n' + \
              str(ex)
        self.log_func(msg)
        raise Exception(msg)
    try:
        # "reindex" - update the document's name in cache
        from apps.rawdb.field_value_tables import update_document_name
        update_document_name(doc.pk, doc.name)
    except Exception as ex:
        msg = f'ForceUnique: error updating RawDB cache (name) {doc.name}:\n' + \
              str(ex)
        self.log_func(msg)

def delete_document_files(paths: List[str],
                          logger: Callable = None) -> None:
    stor = get_file_storage()
    for path in paths:
        try:
            stor.delete_document(path)
        except Exception as e:
            msg = f'Unable to delete file "{path}" in {type(stor).__name__}'
            DocumentFilesCleaner.log_error(msg, e, logger)
Example #5
    def delete_document_files(paths: List[str]):
        stor = get_file_storage()
        for path in paths:
            try:
                stor.delete_document(path)
            except Exception as e:
                raise Exception(f'DocumentFilesCleaner: error deleting '
                                f'"{path}": {e}') from e
Example #6
def download_file_data(self, request, *_args, **kwargs):
    exp_file = ExportFile.objects.get(
        pk=kwargs['object_id'])  # type: ExportFile
    storage = get_file_storage()
    file_data = storage.read(exp_file.file_path)
    file_name = os.path.basename(exp_file.file_path)
    response = HttpResponse(file_data, content_type='application/zip')
    response['Content-Disposition'] = f'attachment; filename="{file_name}"'
    response['Content-Length'] = len(file_data)
    response['filename'] = file_name
    exp_file.downloaded = True
    exp_file.save()
    return response
Example #7
def get_notification_template_resource(rfn: str) -> Optional[bytes]:
    fn = os.path.normpath(os.path.join(settings.NOTIFICATION_CUSTOM_TEMPLATES_PATH_IN_MEDIA, rfn))
    if not fn.startswith(settings.NOTIFICATION_CUSTOM_TEMPLATES_PATH_IN_MEDIA):
        raise RuntimeError('File name should be inside its parent dir: {0}'.format(rfn))

    res = get_file_storage().read(fn)
    if res:
        return res
    fn = os.path.normpath(os.path.join(settings.NOTIFICATION_EMBEDDED_TEMPLATES_PATH, rfn))
    if not fn.startswith(settings.NOTIFICATION_EMBEDDED_TEMPLATES_PATH):
        raise RuntimeError('File name should be inside its parent dir: {0}'.format(rfn))

    with open(fn, 'br') as f:
        return f.read()
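A brief usage sketch, assuming a hypothetical template name. Note the normpath + startswith guards above: they reject path-traversal attempts such as '../../settings.py' for both the media lookup and the embedded fallback:

# 'digest.html' is a hypothetical relative template name
body = get_notification_template_resource('digest.html')
html = body.decode('utf-8') if body else ''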
Example #8
def normalize(task_id, key, value):
    DB_CACHED_FILE_LIMIT = 1024 * 1024 * 100
    try:
        json.dumps(value)
        return value
    except TypeError:
        if isinstance(value, models.Model):
            return SimpleObjectSerializer().serialize([value]).pop()
        elif isinstance(value, QuerySet):
            return SimpleObjectSerializer().serialize(value)
        elif isinstance(value, (dict, list, tuple, set)):
            return pre_serialize(task_id, key, value)
        elif isinstance(value, UploadedFile):
            uploaded_file = value  # type: UploadedFile
            if uploaded_file.size < DB_CACHED_FILE_LIMIT:
                cache_key = f'{task_id}__{key}' if key else str(task_id)
                DbCache.put_to_db(cache_key, uploaded_file.read())
                return {
                    'file_name': uploaded_file.name,
                    'cache_key': cache_key
                }
            else:
                file_ref = ExportFile()
                file_ref.created_time = datetime.datetime.utcnow()
                file_ref.expires_at = datetime.datetime.utcnow() + \
                    datetime.timedelta(hours=1)
                file_ref.comment = f'Import documents from "{uploaded_file.name}" file'
                time_part = str(datetime.datetime.utcnow()).replace(
                    '.', '_').replace(':', '_').replace(' ', '_')
                file_name = f'doc_export_{os.path.splitext(uploaded_file.name)[0]}_{time_part}.zip'

                storage = get_file_storage()
                docs_subfolder = storage.sub_path_join(storage.export_path,
                                                       'documents')
                try:
                    storage.mkdir(docs_subfolder)
                except Exception:
                    # the folder may already exist
                    pass
                file_ref.file_path = storage.sub_path_join(
                    docs_subfolder, file_name)
                storage.write_file(file_ref.file_path, uploaded_file,
                                   uploaded_file.size)
                file_ref.file_created = True
                file_ref.stored_time = datetime.datetime.utcnow()
                file_ref.save()
                return {'file_ref_id': file_ref.pk}

        return str(value)
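For illustration, a hedged sketch of the two payload shapes normalize() produces for uploaded files, depending on whether the file fits under the 100 MB DB_CACHED_FILE_LIMIT (the UploadedFile variables are hypothetical):

# small_upload / large_upload are hypothetical UploadedFile instances
small = normalize('task-1', 'file', small_upload)
# -> {'file_name': 'doc.json', 'cache_key': 'task-1__file'}
large = normalize('task-1', 'file', large_upload)
# -> {'file_ref_id': <pk of the newly stored ExportFile record>}
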
    def __init__(self):
        self.calc = RatingCalculator()

        fstor = get_file_storage()
        extra_language_paths = fstor.list(CUSTOM_LANG_STORAGE_FOLDER)
        for file_path in extra_language_paths:
            file_data = fstor.read(file_path)
            with tempfile.NamedTemporaryFile() as fw:
                fw.write(file_data)
                fw.flush()  # ensure all bytes hit disk before pandas reads the file
                lang_df = pandas.read_pickle(fw.name)
                lang, _ = os.path.splitext(os.path.basename(file_path))
                self.calc.distribution_by_lang[lang] = lang_df
        # load default lang features
        self.calc.init_language_data([
            os.path.join(os.path.dirname(lexnlp_ocr_path.__file__),
                         './reference_vectors')
        ])
Example #10
    def create_document(task: ExtendedTask, uri: str, project_id, run_detect_field_values):
        file_storage = get_file_storage()
        with file_storage.get_document_as_local_fn(uri) as (fn, file_name):
            task.task.title = 'Load Document: {0}'.format(uri)
            task.log_extra = {'log_document_name': uri}

            with open(fn, encoding='utf-8') as data_file:
                data = json.loads(data_file.read())
                project = Project.objects.get(pk=project_id)
                document_type = project.type
                document = Document(
                    name=file_name,
                    project=project,
                    document_type=document_type,
                    metadata={'parsed_by': None}
                )
                LoadDocumentWithFields.load_doc(task, document, data, run_detect_field_values)
Example #11
    def batch_upload(self, request, **kwargs):
        """
        Upload files from given sub-folder in media/data/documents folder\n
            Params:
                - source_path: relative path to a folder with documents
                - send_email_notifications: bool (optional) - send a notification email that batch uploading has started
        """
        session = self.get_object()
        session_id = session.pk
        project = session.project
        folder_name = request.POST.get('folder') or request.POST.get(
            'source_path')

        if not session_id or not folder_name:
            raise ValidationError('Provide session id and folder name.')

        file_list = get_file_storage().list_documents(folder_name)
        # TODO: limit file size - see def upload()
        for file_path in file_list:
            file_name = os.path.basename(file_path)

            # Code for running locators and detecting field values has been moved to LoadDocuments task
            # for the unification purposes between old and new ui.

            call_task(task_name='LoadDocuments',
                      source_data=file_path,
                      user_id=request.user.id,
                      session_id=session_id,
                      metadata={
                          'session_id': session_id,
                          'file_name': file_name
                      },
                      run_standard_locators=True,
                      linked_tasks=None)

        if project.send_email_notification and \
                request.POST.get('send_email_notifications') == 'true' and \
                not session.notified_upload_started:
            self._notify_upload_started(session)

        return Response('Started')
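A client-side sketch of invoking this endpoint, assuming token authentication; the URL pattern and variable names are hypothetical and depend on the project's router configuration:

import requests

# base_url, session_id and api_token are hypothetical placeholders
requests.post(
    f'{base_url}/api/v1/project/upload-session/{session_id}/batch_upload/',
    data={'source_path': 'new_contracts', 'send_email_notifications': 'true'},
    headers={'Authorization': f'Token {api_token}'},
)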
Example #12
def download_task_attached_file(
        document_import_file: Dict[str, Any]) -> Generator[str, None, None]:
    if 'cache_key' in document_import_file:
        # download from DB cache
        zip_bytes = DbCache.get(document_import_file['cache_key'])
        # keep the leading dot so mkstemp() appends a proper extension
        ext = os.path.splitext(document_import_file['file_name'])[1].lower()
        fd, fn = tempfile.mkstemp(suffix=ext)
        os.close(fd)  # only the path is needed; the file is reopened below
        try:
            with open(fn, 'wb') as fw:
                fw.write(zip_bytes)
                yield fn  # TODO: fix yield ...
        finally:
            DbCache.clean_cache(document_import_file['cache_key'])
    else:
        # download from file storage cache
        file_ref_id = document_import_file['file_ref_id']
        file_ref = ExportFile.objects.get(pk=file_ref_id)  # type: ExportFile
        storage = get_file_storage()
        with storage.get_as_local_fn(file_ref.file_path) as f_path:
            yield f_path[0]
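Because the function yields exactly once, it is intended to be consumed as a context manager; a minimal usage sketch (the payload dict and the consumer function are hypothetical):

from contextlib import contextmanager

# document_import_file and process_archive are hypothetical here
with contextmanager(download_task_attached_file)(document_import_file) as local_path:
    process_archive(local_path)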
Example #13
    def import_doc_files(self):
        storage = get_file_storage()
        file_ptrn = re.compile(r'^\d+_.*')
        for name_only in os.listdir(self.source_path):
            if not file_ptrn.match(name_only):
                continue
            doc_id = int(name_only.split('_')[0])
            doc_id = self.document_ids.get(doc_id)
            if not doc_id:
                self.log_error(
                    f'File "{name_only}" - migrated doc was not found')
                continue
            dest_file_path = self.document_src_paths.get(doc_id)
            if not dest_file_path:
                self.log_error(
                    f'File "{name_only}", #{doc_id} - document source path was not found'
                )
                continue

            if storage.document_exists(dest_file_path):
                self.log_info(f'Document "{dest_file_path}" already exists')
                continue

            src_file_path = os.path.join(self.source_path, name_only)
            with open(src_file_path, 'rb') as fr:
                content = fr.read()
            # ensure the subfolder exists
            doc_folder = os.path.dirname(dest_file_path)
            if doc_folder:
                try:
                    storage.mk_doc_dir(doc_folder)
                except Exception:
                    # the folder might already exist
                    pass
            try:
                storage.write_document(dest_file_path, content, len(content))
            except Exception as e:
                self.log_error(f'Error storing file "{dest_file_path}": {e}')
                raise
Example #14
    def process(self, **kwargs):
        self.log_info('Going to load document with fields...')

        document_name = kwargs.get('document_name')
        project = Project.objects.get(pk=kwargs.get('project_id'))  # type: Project
        run_detect_field_values = bool(kwargs.get('run_detect_field_values'))

        document_fields = kwargs.get('document_fields') or {}  # type: Dict

        file_storage = get_file_storage()

        if document_fields:
            document = Document(
                name=document_name,
                project=project,
                document_type=project.type,
            )
            LoadDocumentWithFields.load_doc(self, document, document_fields, run_detect_field_values)

        path = kwargs['source_data']

        if path:
            self.log_info('Parse {0} at {1}'.format(path, file_storage))
            file_list = file_storage.list_documents(path)

            self.log_info("Detected {0} files. Added {0} subtasks.".format(len(file_list)))

            if len(file_list) == 0:
                raise RuntimeError('Wrong file or directory name or directory is empty: {}'
                                   .format(path))
            load_docs_args = [(file_path, project.pk, run_detect_field_values)
                              for file_path in file_list]
            self.run_sub_tasks('Load Each Document',
                               LoadDocumentWithFields.create_document,
                               load_docs_args,
                               file_list)
Example #15
# Project imports
from apps.common.file_storage import get_file_storage
from apps.document.repository.document_field_repository import DocumentFieldRepository
from apps.document.repository.document_repository import DocumentRepository
from apps.task.models import Task
from apps.task.tasks import purge_task

__author__ = "ContraxSuite, LLC; LexPredict, LLC"
__copyright__ = "Copyright 2015-2020, ContraxSuite, LLC"
__license__ = "https://github.com/LexPredict/lexpredict-contraxsuite/blob/1.7.0/LICENSE"
__version__ = "1.7.0"
__maintainer__ = "LexPredict, LLC"
__email__ = "*****@*****.**"

file_storage = get_file_storage()


def cleanup_document_relations(document):

    # 1. delete history
    document_repo = DocumentRepository()
    field_repo = DocumentFieldRepository()
    document_repo.delete_document_history_by_ids([document.pk])
    field_repo.delete_document_history_values(document.pk)

    # INFO: skip "delete step" (set delete=False) since we clean tasks periodically now
    # 2. delete Tasks, Task history, TaskResults, child tasks
    if document.metadata and document.metadata.get('cascade_delete_tasks',
                                                   True):
        task_kwargs = dict(file_name=document.name)
Example #16
def get_file_storage(cls) -> ContraxsuiteFileStorage:
    if not cls.file_storage:
        cls.file_storage = get_file_storage()
    return cls.file_storage


def delete_document_files(paths: List[str]):
    stor = get_file_storage()
    for path in paths:
        stor.delete_document(path)
Example #18
    def sync_imanage_document(task: ExtendedTask, imanage_config_id: int, imanage_doc_id: str):
        task.log_info('Synchronizing iManage document #{0} of config #{1}'.format(imanage_doc_id, imanage_config_id))
        imanage_doc = IManageDocument.objects \
            .filter(imanage_config_id=imanage_config_id, imanage_doc_id=imanage_doc_id) \
            .select_related('imanage_config').get()
        file_storage = get_file_storage()
        try:
            imanage_config = imanage_doc.imanage_config
            log = CeleryTaskLogger(task)
            project = imanage_config.resolve_dst_project(imanage_doc.imanage_doc_data, log)
            project_id = project.pk

            assignee = imanage_config.resolve_assignee(imanage_doc.imanage_doc_data, log)
            assignee_id = assignee.pk if assignee else None
            task.log_info('Assignee resolved to: {0}'.format(assignee.get_full_name() if assignee else '<no assignee>'))

            task.log_info('Downloading iManage document contents into a temp file...')
            auth_token = imanage_config.login()
            filename, response = imanage_config.load_document(auth_token, imanage_doc_id)

            upload_session_id = str(uuid.uuid4())
            filename = get_valid_filename(filename)
            rel_filepath = os.path.join(upload_session_id, filename)

            _, ext = os.path.splitext(filename) if filename else ('', '')
            with buffer_contents_into_temp_file(response, ext) as temp_fn:

                # upload file to file storage
                with open(temp_fn, 'rb') as f:
                    file_storage.mk_doc_dir(upload_session_id)
                    file_storage.write_document(rel_filepath, f)

                kwargs = {
                    'document_type_id': imanage_config.document_type_id,
                    'project_id': project_id,
                    'assignee_id': assignee_id,
                    'user_id': get_main_admin_user().pk,
                    'propagate_exception': True,
                    'run_standard_locators': True,
                    'metadata': {},
                    'do_not_check_exists': True
                }

                pre_defined_fields = None
                if imanage_doc.imanage_doc_data and imanage_config.imanage_to_contraxsuite_field_binding:
                    pre_defined_fields = dict()
                    for imanage_field_code, contraxsuite_field_code \
                            in dict(imanage_config.imanage_to_contraxsuite_field_binding).items():
                        imanage_field_value = imanage_doc.imanage_doc_data.get(imanage_field_code)
                        if imanage_field_value:
                            pre_defined_fields[contraxsuite_field_code] = imanage_field_value
                            task.log_info('Assigning iManage field {0} to Contraxsuite field {1}: {2}'
                                          .format(imanage_field_code, contraxsuite_field_code, imanage_field_value))
                        else:
                            task.log_info('iManage field {0} has no value assigned.'
                                          .format(imanage_field_code))
                else:
                    task.log_info('No binding of iManage fields to Contraxsuite fields.')

                document_id = LoadDocuments \
                    .create_document_local(task, temp_fn, rel_filepath, kwargs,
                                           return_doc_id=True,
                                           pre_defined_doc_fields_code_to_python_val=pre_defined_fields)

                if document_id:
                    task.log_info('Created Contraxsuite document #{0}'.format(document_id))
                    imanage_doc.document_id = document_id
                    imanage_doc.last_sync_date = timezone.now()
                    imanage_doc.save(update_fields=['document_id', 'last_sync_date'])
                else:
                    task.log_error('Unable to create Contraxsuite document for '
                                   'iManage document #{0}'.format(imanage_doc_id))
                    raise RuntimeError('No document loaded.')
        except Exception as ex:
            msg = render_error('Unable to synchronize iManage document #{0}'.format(imanage_doc_id), ex)
            task.log_error(msg)
            imanage_doc.import_problem = True
            imanage_doc.save(update_fields=['import_problem'])