Пример #1
0
def list_docs(project):
    """Reset the project's active document and build a document listing.

    The active document is deleted and set back to None, the project is
    saved, and the archive at docs_path() is opened read-only so that
    get_doc_infos can describe its documents.

    Args:
        project: Project object whose documents are listed. It must provide
            ``active_doc``, ``name`` and ``to_dict()``.

    Returns:
        list: ``[project, instruction]`` where ``instruction`` is the
        ClientInstruction directing the client to the '#doc-list' page.
    """
    archive_path = docs_path()

    # Delete the attribute first, then re-create it as None, before saving.
    del project.active_doc
    project.active_doc = None
    save_project(project)

    with ZipFile(archive_path, 'r') as archive:
        # Evaluated in the same order as the keys appear in the payload.
        doc_infos = get_doc_infos(project, archive)
        project_dict = project.to_dict()
        listing_message = "Listing documents of project %s" % project.name
        payload = {
            'pass_project': True,
            'pass_docs': True,
            'docs': doc_infos,
            'project': project_dict,
            'page': '#doc-list',
            'message': listing_message
        }
        instruction = ClientInstruction(payload)
    return [project, instruction]
Пример #2
0
def load_doc(name, doctype=None, project=None):
    """Load a document from the docs archive and make it the active document.

    Args:
        name: Name of document (file must exist in :/username/project/docs.zip: file).
        doctype: Type of document. 'pdf', 'docx', etc.
        project: reference to ArthurProject object.

    Returns:
        tuple: ``(project, instruction)``. ``instruction`` is a
        ClientInstruction that either points the client at the '#doc-view'
        page or carries an error message. ``project`` is returned unchanged
        (possibly None) when loading fails.
    """
    if project is None:
        return (project, ClientInstruction({"message": "Please load a project."}))

    try:
        path = docs_path()
        with ZipFile(path, "r") as docs:
            text = docs.read(name)
        doc = ArthurDocument(text=text, doctype=doctype, name=name)
        project.active_doc = doc
        save_project(project)
        instruction = ClientInstruction(
            {
                "pass_project": True,
                "data_fields": get_docblocks(project, name),
                "page": "#doc-view",
                "message": 'Document "%s" loaded' % name,
            }
        )
    except (OSError, IOError) as e:
        # IOError is a separate class on Python 2 and an alias of OSError on
        # Python 3, so both are handled here.
        if e.errno == errno.ENOENT:
            # ENOENT is "no such file or directory"; EEXIST would mean the
            # opposite (file already exists).
            message = "File %s does not exist." % name
        else:
            # Exceptions cannot be indexed on Python 3; strerror carries the
            # same text and may be None, hence the str(e) fallback.
            prefix = "OSError: " if isinstance(e, OSError) else "IOError: "
            message = prefix + (e.strerror or str(e))
        return (project, ClientInstruction({"message": message}))
    except KeyError as e:
        # Usually for when active_doc not found.
        return (project, ClientInstruction({"message": "KeyError: " + str(e)}))
    return (project, instruction)
Пример #3
0
def run(project = None, args = [], **kwargs):
    """Load all documents inside a zip file in server into currently active project. Runs 'upload_zip' instead if file does not exist.

    usage: load_zip [--keep] [--nuke] name

    Args:
        name: Zip file to load.

    Optional arguments:
      --keep, -k               By default, load_zip will remove uploaded zip file. Add this option to keep that file.
      --nuke, -n               ☠ Destroy all previous documents in this project and recreate them. ☠
      --overwrite_corpus, -o   Overwrite corpus as they are created.

    """
    # NOTE(review): the docstring above is left untouched because
    # get_docs('load_zip') below presumably serves it as help text - confirm.
    # The ``args`` default list is only read, never mutated.

    # Returns [project, instruction] on every path.
    if project is None:
        return [project, ClientInstruction({'message': 'Please load a project.'})]

    if len(args) == 0:
        # Called without arguments: reply with the lines from get_docs.
        docs = get_docs('load_zip')
        return [project, ClientInstruction({'message': "\n".join(docs)})]

    parser = ArgumentParser(add_help=False)
    parser.add_argument('name')
    parser.add_argument('--keep', '-k', action='store_true')
    parser.add_argument('--nuke', '-n', action='store_true')
    # NOTE(review): --overwrite_corpus is parsed but not forwarded to
    # load_zip here; confirm whether it should be passed on.
    parser.add_argument('--overwrite_corpus', '-o', action='store_true')
    parsed_args = parser.parse_args(args)

    frompath = os.path.join(uploaded_path(), parsed_args.name)
    # ``mongo`` is not a local name; it is resolved from the enclosing scope.
    # kwargs.get() is used so a caller that supplies no connection gets
    # connection=None instead of a KeyError.
    project, instruction = load_zip(
        project,
        docs_path(),
        frompath,
        keep=parsed_args.keep,
        nuke=parsed_args.nuke,
        connection=kwargs.get('connection'),
        mongo=mongo,
    )
    save_project(project)

    return [project, instruction]
Пример #4
0
def _import_document(project, tozip, docname, content, mongo, found, added):
    """Store one archive member in the project and create its data fields.

    Appends ``docname`` to ``found`` or ``added`` and returns the progress
    message describing what was done for that document.
    """
    try:
        tozip.getinfo(docname)
    except KeyError:
        # getinfo raises KeyError when the archive has no such member:
        # add it to project's list of documents.
        tozip.writestr(docname, content)
        added.append(docname)
        message = "Loaded document \"%s\"." % docname
    else:
        found.append(docname)
        message = "Found document \"%s\"." % docname

    # Checks if data fields have been created for this document, add if they haven't.
    # A list (not a lazy filter object) is needed for the emptiness test and indexing.
    documents = [d for d in project.docs if d.name == docname]
    if not documents:
        document = ArthurDocument(content, name=docname, project_id=project._id, _id=ObjectId())
        project.docs.append(document)
        if mongo is not None:
            # Todo: Use transaction and save in bulk instead.
            save_project(project)
        have_data_fields = False
    else:
        document = documents[0]
        have_data_fields = document.num_data_fields != 0

    if have_data_fields:
        # Same attribute as the check above.
        return message + " Already has %i data fields." % document.num_data_fields

    new_data_fields = read(document, project_id=project._id)
    if mongo is None:
        return message + " Created %i data fields (not entered into database)." % (len(new_data_fields))

    if len(new_data_fields) == 0:
        total_inserted_data_fields = 0
    else:
        total_inserted_data_fields = len(mongo.db.data_fields.insert_many(new_data_fields).inserted_ids)
        save_project(project)
    return message + " Created %i data fields and entered them into database." % (total_inserted_data_fields)


def load_zip(project, docs_path, zip_path, keep=False, nuke=False, connection=None, mongo=None,
             overwrite_corpus=False):
    """Loads documents from zip_path into project.

    While loading each document, its content will be clustered into several data_fields which will
    then be used for learning algorithms.
    >>> from libs.arthur import ArthurProject
    >>> docs_path = os.path.join(base_path, 'test', 'unit', 'test_project', 'docs.zip')
    >>> project = ArthurProject('test_project')
    >>> zip_path = os.path.join(base_path, 'test', 'unit', 'test.zip')
    >>> project, instruction = load_zip(project, docs_path, zip_path, keep=True)
    Found document "11758.docx". Created 0 data fields (not entered into database).
    Found document "348418.pdf". Created 50 data fields (not entered into database).
    Found document "348608.pdf". Created 53 data fields (not entered into database).

    >>> print(instruction.get_value('message')) # doctest:+ELLIPSIS
    Loaded ...

    Data fields should then be available for all documents.

    Args:
        project: ArthurProject instance to be updated.
        docs_path (str): Path to docs.zip of a project.
        zip_path (str): Path to zip file containing documents to be loaded.
        keep (bool): If True, keep loaded zip file, otherwise delete it after all documents
                     loaded into project. Defaults to False.
        nuke (bool): If True, call project.nuke_docs, clear the active document, delete the
                     project's data_fields from mongo (when given) and rewrite docs.zip
                     instead of appending to it. Defaults to False.
        connection: Pass a sockjs.tornado.SockJSConnection object to update progress dynamically,
                    otherwise print will be used.
        mongo (MongoClient): If not empty, will store blocks into mongodb database.
        overwrite_corpus (bool): Forwarded to create_corpus as ``overwrite``. Defaults to False.
                     NOTE(review): the body previously read this name without defining it;
                     confirm no module-level value was intended.

    Returns:
        [ArthurProject instance, ClientInstruction instance] as a list on success; the same
        pair as a tuple when a KeyError, OSError or IOError was handled.
    """
    found = []
    added = []
    # File name used in user-facing messages.
    zip_name = os.path.basename(zip_path)
    try:
        # Context managers close both archives on every path, including the
        # early error returns below.
        with ZipFile(zip_path, 'r') as fromzip:
            try:
                mode = 'a'

                if nuke:
                    mode = 'w'
                    project.nuke_docs(corpus_dir=corpus_path())
                    project.active_doc = None
                    message = "☠ - Nuked project's documents - No Survivor!"
                    if isinstance(connection, sockjs.tornado.SockJSConnection):
                        connection.send(message)
                    else:
                        print(message)
                    if mongo:
                        mongo.db.data_fields.delete_many({'project_id': project._id})

                # Create corpus
                send(connection, "Attempt to create corpus...")
                create_corpus(fromzip, corpus_path(), stdout=connection, overwrite=overwrite_corpus)

                with ZipFile(docs_path, mode) as tozip:
                    for docname in fromzip.namelist():
                        content = fromzip.read(docname)
                        message = _import_document(
                            project, tozip, docname, content, mongo, found, added)
                        send(connection, message)

            except KeyError as e:
                # Exceptions cannot be indexed on Python 3 (and a KeyError has
                # no second argument), so report its string form.
                instruction = ClientInstruction({'message': str(e)})
                return (project, instruction)
    except (OSError, IOError) as e:
        # IOError is a separate class on Python 2 and an alias of OSError on Python 3.
        if e.errno == errno.ENOENT:
            # ENOENT is "no such file or directory"; EEXIST would mean the opposite.
            instruction = ClientInstruction({'message': "File %s does not exist." % zip_name})
        else:
            # strerror may be None, hence the str(e) fallback.
            instruction = ClientInstruction({'message': e.strerror or str(e)})
        return (project, instruction)

    del_msg = ''
    if not keep:
        # Both archives are closed by now, so the upload can be removed.
        os.remove(zip_path)
        del_msg = " Zip file \"%s\" deleted." % zip_name

    instruction = ClientInstruction({
        'detail': {'found': found, 'added': added},
        'message': "Loaded %i files. %i files already found in storage.%s" % (len(added), len(found), del_msg)
    })
    return [project, instruction]