Example #1
 def delete(self, corpusId, bucketId):
     try:
         envId = get_env_id()
         authorization = get_autorisation(envId, None, None)
         corpus = get_master_document_corpus_list(
             envId, authorization).get_corpus(corpusId)
         corpus.delete_bucket(bucketId)
         self.write_and_set_status(None, HTTPStatus.NO_CONTENT)
     except BucketNotFoundException as err:
         self.write_and_set_status(
             {
                 MESSAGE:
                 "Bucket does not exist.Extra info: '{0}'".format(err)
             }, HTTPStatus.NOT_FOUND)
     except CorpusNotFoundException as err:
         self.write_and_set_status(
             {
                 MESSAGE:
                 "Corpus does not exist.Extra info: '{0}'".format(err)
             }, HTTPStatus.NOT_FOUND)
     except Exception:
         trace = traceback.format_exc().splitlines()
         self.write_and_set_status(
             {
                 MESSAGE: "Internal server error",
                 TRACE: trace
             }, HTTPStatus.INTERNAL_SERVER_ERROR)
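
Every handler in these examples reports through a write_and_set_status helper that the excerpts never define. A minimal sketch of what such a helper could look like, assuming a Tornado RequestHandler base class and JSON responses (both assumptions, not confirmed by the RACS source):

import json
from tornado.web import RequestHandler

class BaseHandler(RequestHandler):
    def write_and_set_status(self, body, status):
        # Hypothetical helper: set the HTTP status, then emit the body
        # (if any) as JSON. The real RACS implementation may differ.
        self.set_status(status)
        if body is not None:
            self.set_header("Content-Type", "application/json")
            self.write(json.dumps(body))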
Example #2
File: document.py Project: crim-ca/RACS
    def get(self, corpusId, documentId):
        """Get a single document from corpus"""
        try:
            envId = get_env_id()
            authorization = get_autorisation(envId, None, None)
            corpus = get_master_document_corpus_list(
                envId, authorization).get_corpus(corpusId)
            document = corpus.get_text_document(documentId)

            if document is None:
                raise DocumentNotFoundException(documentId)

            self.write_and_set_status(document, HTTPStatus.OK)
        except CorpusNotFoundException:
            self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                      HTTPStatus.NOT_FOUND)
        except DocumentNotFoundException:
            self.write_and_set_status(
                {MESSAGE: "Specified document not found"},
                HTTPStatus.NOT_FOUND)
        except Exception:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status(
                {
                    MESSAGE: "Internal server error",
                    TRACE: trace
                }, HTTPStatus.INTERNAL_SERVER_ERROR)
Example #3
File: document.py Project: crim-ca/RACS
    def delete(self, corpusId, documentId):
        """Delete a single document an optionally its annotations"""
        try:
            delete_annotations_argument = self.get_query_argument(
                "deleteAnnotations", None)
            if not delete_annotations_argument:
                self.missing_required_field("deleteAnnotations")
                return

            delete_annotations = 'true' == delete_annotations_argument

            envId = get_env_id()
            authorization = get_autorisation(envId, None, None)
            corpus = get_master_document_corpus_list(
                envId, authorization).get_corpus(corpusId)
            document = corpus.delete_document(documentId, delete_annotations)
            self.write_and_set_status(document, HTTPStatus.OK)
        except CorpusNotFoundException:
            self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                      HTTPStatus.NOT_FOUND)
        except DocumentNotFoundException:
            self.write_and_set_status(
                {MESSAGE: "Specified document not found"},
                HTTPStatus.NOT_FOUND)
        except Exception:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status(
                {
                    MESSAGE: "Internal server error",
                    TRACE: trace
                }, HTTPStatus.INTERNAL_SERVER_ERROR)
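
A hypothetical client call against this handler; the host, port, and route are illustrative assumptions. The deleteAnnotations query argument is required, and any value other than the string 'true' is treated as false:

import requests

resp = requests.delete(
    "http://localhost:8888/corpora/corpus1/documents/doc1",
    params={"deleteAnnotations": "true"},
)
print(resp.status_code, resp.json())  # 200 with the deleted document on success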
Example #4
File: document.py Project: crim-ca/RACS
    def get(self, corpusId):
        """Get documents from corpus according to pagination"""
        try:
            fromIndexArgument = self.get_query_argument("from")
            fromIndex = int(fromIndexArgument)
            if fromIndex < 0:
                self.write_and_set_status(
                    {MESSAGE: "'from' must cannot be less than zero"},
                    HTTPStatus.UNPROCESSABLE_ENTITY)
                return

            sizeArgument = self.get_query_argument("size")
            size = int(sizeArgument)

            if size < 1:
                self.write_and_set_status(
                    {MESSAGE: "'size' cannot be less than 1"},
                    HTTPStatus.UNPROCESSABLE_ENTITY)
                return

            size = min(size, MAX_DOCUMENT_SIZE)

            envId = get_env_id()
            authorization = get_autorisation(envId, None, None)

            corpus = get_master_document_corpus_list(
                envId, authorization).get_corpus(corpusId)
            filterTitle = self.get_query_argument("filterTitle", default=None)
            filterSource = self.get_query_argument("filterSource",
                                                   default=None)
            filterJoin = self.get_query_argument("filterJoin", default=None)
            sortBy = self.get_query_argument("sortBy", default=None)
            sortOrder = self.get_query_argument("sortOrder", default=None)
            documents = corpus.get_text_documents(fromIndex, size, sortBy,
                                                  sortOrder, filterTitle,
                                                  filterSource, filterJoin)

            self.write_and_set_status({"documents": documents}, HTTPStatus.OK)
        except CorpusNotFoundException:
            self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                      HTTPStatus.NOT_FOUND)
        except ValueError:
            self.write_and_set_status(
                {MESSAGE: "Invalid 'from' or 'size' parameter"},
                HTTPStatus.UNPROCESSABLE_ENTITY)
        except TransportError as te:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status(
                {
                    MESSAGE: "ES TransportError",
                    TRACE: trace
                }, te.status_code)
        except Exception:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status(
                {
                    MESSAGE: "Internal server error",
                    TRACE: trace
                }, HTTPStatus.INTERNAL_SERVER_ERROR)
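
A hypothetical client call exercising the pagination and filter arguments above (host and route are assumptions): 'from' must be at least 0, 'size' at least 1, and 'size' is silently capped at MAX_DOCUMENT_SIZE:

import requests

resp = requests.get(
    "http://localhost:8888/corpora/corpus1/documents",
    params={
        "from": 0,
        "size": 20,
        "sortBy": "title",
        "sortOrder": "asc",
        "filterTitle": "alice",  # optional, as are filterSource and filterJoin
    },
)
if resp.status_code == 200:
    print(resp.json()["documents"])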
Example #5
    def test_remove_all_remove_annotations(self):
        self.recreate_read_write_env()
        jsonSchema1 = {
            "$schema": "http://json-schema.org/draft-04/schema#",
            "targetType": "document_surface1d",
            "schemaType": "schema1",
            "type": "object",
            "required": ["_schemaType", "_corpusID", "_documentID", "offsets"],
            "properties": {
                "_schemaType": {
                    "type": "string",
                    "description": "Schema type",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                },
                "_documentID": {
                    "type": "string",
                    "description": "Internal document GUID",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                },
                "_corpusID": {
                    "type": "string",
                    "description": "Internal Corpus GUID",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                }
            }
        }

        corpus = get_master_document_corpus_list(
            self.envId, self.authorization).create_corpus("corpusx")
        corpus.add_text_document("Another doc with auto id", "do1", "english")
        bucket1 = corpus.create_bucket("bucket1", "bucket1")
        schemaId1 = get_schema_list(
            self.envId, self.authorization).add_json_schema_as_hash(
                jsonSchema1, False, {})
        time.sleep(1)
        bucket1.add_or_update_schema_to_bucket(
            schemaId1, "schema1", TargetType("document_surface1d"), {})
        time.sleep(1)
        anno1 = {
            "_schemaType": "schema1",
            "_documentID": "document1",
            "_corpusID": "corpusx"
        }
        bucket1.add_annotation(anno1, "schema1", "1")
        time.sleep(1)
        documentSearch = DocumentSearch(self.envId, self.authorization, None,
                                        "corpusx")
        documentSearch.delete_annotations_for_types("bucket1", ["schema1"])
        time.sleep(1)
        bucket1.add_annotation(anno1, "schema1", "1")
Example #6
File: corpus.py Project: crim-ca/RACS
 def get(self):
     try:
         envId = get_env_id()
         authorization = get_autorisation(envId, None, None)
         corpora = get_master_document_corpus_list(envId, authorization)
         corporaInfos = corpora.get_corpuses_list()
         self.write_and_set_status({"data": corporaInfos},
                                   HTTPStatus.OK)
     except Exception:
         trace = traceback.format_exc().splitlines()
         self.write_and_set_status({MESSAGE: "Internal server error", TRACE: trace},
                                   HTTPStatus.INTERNAL_SERVER_ERROR)
Example #7
 def set_up_corpus(self):
     corpus = get_master_document_corpus_list(
         self.envId, self.authorization).create_corpus(CORPUS_ID,
                                                       languages=["en-US"])
     files = glob.iglob(os.path.join(JASS_TEST_DATA_PATH, "*.txt"))
     self.contentById = dict()
     for filePath in files:
         with open(filePath, 'r', encoding="utf8") as f:
             fileName = os.path.basename(filePath)
             contents = f.read()
             self.contentById[fileName + ".txt"] = contents
             corpus.add_text_document(contents, filePath, "en-US", fileName)
     time.sleep(1)
Example #8
File: corpus.py Project: crim-ca/RACS
    def post(self):
        body = self.request.body.decode("utf-8")
        try:
            envId = get_env_id()
            authorization = get_autorisation(envId, None, None)
            json_args = json.loads(body)
            for requiredField in [CORPUS_LANGUAGES]:
                if requiredField not in json_args:
                    self.write_and_set_status({MESSAGE: "Missing required parameters. {0}".format(requiredField)},
                                              HTTPStatus.UNPROCESSABLE_ENTITY)
                    return

            languages = json_args.get(CORPUS_LANGUAGES, None)
            try:
                languageManager = get_language_manager()
                for language in languages:
                    if not languageManager.has_es_analyser(language):
                        self.write_and_set_status({MESSAGE: "Invalid language: " + language},
                                                  HTTPStatus.UNPROCESSABLE_ENTITY)
                        return
            except Exception:
                self.write_and_set_status({MESSAGE: "Invalid languages field: " + str(languages)},
                                          HTTPStatus.UNPROCESSABLE_ENTITY)
                return

            corpusId = json_args.get(CORPUS_ID, None)

            if corpusId and not valid_es_id(corpusId):
                self.write_and_set_status(
                    {MESSAGE: "Invalid corpus id '{0}'. A corpus id may only "
                              "contain lowercase alphanumeric characters, "
                              "'-' and '_'.".format(corpusId)},
                    HTTPStatus.UNPROCESSABLE_ENTITY)
                return

            corpora = get_master_document_corpus_list(envId, authorization)
            corpus = corpora.create_corpus(corpusId, languages)
            self.write_and_set_status({"id": corpus.id},
                                      HTTPStatus.OK)
        except CorpusNotFoundException:
            self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                      HTTPStatus.NOT_FOUND)
        except CorpusInvalidFieldException as ci:
            self.write_and_set_status({MESSAGE: "Invalid field: {0}".format(ci)},
                                      HTTPStatus.UNPROCESSABLE_ENTITY)
        except CorpusAlreadyExistsException:
            self.write_and_set_status({MESSAGE: "Corpus with the same id already exists"},
                                      HTTPStatus.CONFLICT)
        except Exception:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status({MESSAGE: "Internal server error", TRACE: trace},
                                      HTTPStatus.INTERNAL_SERVER_ERROR)
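
A hypothetical request against this handler, assuming CORPUS_ID and CORPUS_LANGUAGES resolve to the JSON keys "id" and "languages" (consistent with the response shape in Example #18, but not confirmed here) and that the route is /corpora:

import requests

resp = requests.post(
    "http://localhost:8888/corpora",
    json={"id": "corpus1", "languages": ["fr-xx", "en-xx"]},
)
# Expect 200 with {"id": ...}, 409 if the id is taken,
# or 422 for an invalid id or language.
print(resp.status_code, resp.json())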
Example #9
File: corpus.py Project: crim-ca/RACS
 def delete(self, corpusId):
     try:
         envId = get_env_id()
         authorization = get_autorisation(envId, None, None)
         corpora = get_master_document_corpus_list(envId, authorization)
         corpora.delete_corpus(corpusId)
         self.write_and_set_status(None, HTTPStatus.NO_CONTENT)
     except CorpusNotFoundException:
         self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                   HTTPStatus.NOT_FOUND)
     except Exception:
         trace = traceback.format_exc().splitlines()
         self.write_and_set_status({MESSAGE: "Internal server error", TRACE: trace},
                                   HTTPStatus.INTERNAL_SERVER_ERROR)
Example #10
    def test_create_bucket_latencies2(self):
        """
        In reference to https://www.crim.ca/jira/browse/PSC-558
        Testing a 10 second timeout to elasticsearch
        REQUIRES dev/compose_with_toxiproxy.yml to be running in order to be executed
        :return:
        """

        from elasticsearch import ConnectionTimeout
        self.setup_unittest_environment()
        corpus = get_master_document_corpus_list(
            self.envId, self.authorization).create_corpus("corpus1")

        #######  Latency less than timeout #########
        try:
            self.create_es_toxiproxy()
            self.toxiproxy_add_timeout("proxy_es", timeout=1)
            # set the ES timeout to be greater than the latency delay
            self.set_es_to_use_toxic(9201, timeout=2)
            # leave some time to add toxics

            time.sleep(1)
            bucket1 = corpus.create_bucket("bucket1")

        except Exception as e:
            self.reset_es_settings()
            self.destroy_ex_toxiproxy()
            self.fail("Exception has occurred: " + str(e))

        self.reset_es_settings()
        self.destroy_ex_toxiproxy()

        #######  Latency more than timeout #########
        try:
            self.create_es_toxiproxy()
            self.toxiproxy_add_timeout("proxy_es", timeout=2)
            # set the ES timeout to be less than the latency delay
            self.set_es_to_use_toxic(9201, timeout=1)
            # leave some time to add toxics
            time.sleep(1)

            self.assertRaises(ConnectionTimeout, corpus.create_bucket,
                              "bucket2")
        except Exception as e:
            self.reset_es_settings()
            self.destroy_ex_toxiproxy()
            self.fail("Exception has occurred: " + str(e))

        self.reset_es_settings()
        self.destroy_ex_toxiproxy()
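
The timeout knob this test exercises can be sketched with elasticsearch-py directly; the host and port mirror the toxiproxy setup above and are assumptions:

from elasticsearch import Elasticsearch

# Hypothetical: route traffic through the toxic proxy on port 9201 with a
# 2 second client-side timeout, as set_es_to_use_toxic(9201, timeout=2)
# presumably does internally.
es = Elasticsearch(hosts=[{"host": "localhost", "port": 9201}], timeout=2)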
Example #11
    def test_bind_schema_with_string_array(self):
        schema = json.loads(JSON_SCHEMA_WITH_STRING_ARRAY)

        corpus = get_master_document_corpus_list(
            self.envId, self.authorization).create_corpus("corpus1")
        bucket1 = corpus.create_bucket("bucket1")
        schema_id = get_schema_list(
            self.envId, self.authorization).add_json_schema_as_hash(
                schema, False, nestedFields=["offsets"])
        time.sleep(1)

        bucket1.add_or_update_schema_to_bucket(schema_id, "schema1",
                                               TargetType("document"), {})
        time.sleep(1)
        res = bucket1.get_schemas_info(True)
        self.assertEqual(len(res["data"]), 1)
Example #12
    def upload_documents(self,
                         url: str = None,
                         zipFileName: str = None,
                         isSendPut=False,
                         isMultipart: bool = True,
                         multipartFieldName: str = "file"):
        """
        Uploads all document for the current corpus
        :param url: Url to which to upload files
        :param zipFileName: Url to which to upload files
        :return:
        """

        # creates a zip file
        logger = logging.getLogger(__name__)
        fileStorage = HttpPostFileStorage(url, zipFileName)
        fileStorage.create_zip_file()
        es = get_es_conn()
        corpus = get_master_document_corpus_list(
            self.envId, self.authorization).get_corpus(self.corpusId)
        search = Search(using=es, index=corpus.languages_indices())
        search = search.source(["text"])
        search = search.params(scroll=get_scan_scroll_duration(),
                               size=get_nb_documents_per_scan_scroll())

        start = time.time()
        count = 0
        logger.info("Adding documents to zip: {0}".format(self.corpusId))
        for result in search.scan():
            fileStorage.add_utf8_file(result.text[0],
                                      str(result.meta.id) + ".txt")
            count += 1
            if count % NB_OF_DOCUMENTS_TO_ADD_BEFORE_LOGGING == 0:
                end = time.time()
                logger.info(
                    "Time to add documents {0} to {1} : {2} seconds".format(
                        count - NB_OF_DOCUMENTS_TO_ADD_BEFORE_LOGGING, count,
                        end - start))
                start = end

        end = time.time()
        logger.info("Time to add documents {0} to {1} : {2} seconds".format(
            count - count % NB_OF_DOCUMENTS_TO_ADD_BEFORE_LOGGING, count,
            end - start))

        fileStorage.flush(True, isSendPut, isMultipart, multipartFieldName)
Example #13
File: document.py Project: crim-ca/RACS
    def post(self, corpusId):
        try:
            body = json.loads(self.request.body.decode("utf-8"))

            language = body.get("language")
            if not language:
                self.write_and_set_status(
                    {MESSAGE: "Missing required parameters"},
                    HTTPStatus.UNPROCESSABLE_ENTITY)
                return

            envId = get_env_id()
            authorization = get_autorisation(envId, None, None)

            docId = body.get(
                "id")  # Note: 'get' defaults to None when key does not exist
            text = body.get("text", "")
            title = body.get("title", "")
            source = body.get("source", "")

            corpus = get_master_document_corpus_list(
                envId, authorization).get_corpus(corpusId)
            if language not in corpus.languages:
                self.write_and_set_status(
                    {
                        MESSAGE:
                        "Document language does not correspond to corpus language"
                    }, HTTPStatus.UNPROCESSABLE_ENTITY)
                return

            docId = corpus.add_text_document(text, title, language, docId,
                                             source)

            self.write_and_set_status({"id": docId}, HTTPStatus.OK)
        except DocumentAlreadyExistsException:
            self.write_and_set_status(
                {MESSAGE: "Document with the same id already exists"},
                HTTPStatus.CONFLICT)
        except Exception:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status(
                {
                    MESSAGE: "Internal server error",
                    TRACE: trace
                }, HTTPStatus.INTERNAL_SERVER_ERROR)
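
A hypothetical client call for this handler (host and route are assumptions); "language" is required and must be one of the corpus languages, while "id", "text", "title", and "source" are optional:

import requests

resp = requests.post(
    "http://localhost:8888/corpora/corpus1/documents",
    json={
        "language": "en-US",
        "title": "doc title",
        "source": "unit test",
        "text": "Some document text.",
    },
)
print(resp.status_code, resp.json())  # 200 with {"id": ...} on success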
Example #14
    def get(self, corpusId):
        try:
            includeSchemaJson = 'true' == self.get_query_argument(
                INCLUDE_SCHEMA_JSON, default=False)

            envId = get_env_id()
            authorization = get_autorisation(envId, None, None)
            buckets = get_master_document_corpus_list(
                envId, authorization).get_corpus(corpusId).get_buckets()
            augmentedBuckets = [
                getBucketWithSchema(bucket, includeSchemaJson)
                for bucket in buckets
            ]

            self.write_and_set_status({"buckets": augmentedBuckets},
                                      HTTPStatus.OK)
        except CorpusNotFoundException:
            self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                      HTTPStatus.NOT_FOUND)
        except Exception:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status({MESSAGE: "Internal server error", TRACE: trace},
                                      HTTPStatus.INTERNAL_SERVER_ERROR)
Example #15
 def set_up_corpus(self):
     corpus = get_master_document_corpus_list(
         self.envId, self.authorization).create_corpus("corpus1")
     time.sleep(1)
     bucket1 = corpus.create_bucket("bucket1", "bucket1")
     setting = get_settings()
     self.schemaList = get_schema_list(self.envId, self.authorization)
     schemaNormalId = self.schemaList.add_json_schema_as_hash(SCHEMA_NORMAL)
     schemaOffsetsId = self.schemaList.add_json_schema_as_hash(
         SCHEMA_OFFSETS, False, nestedFields=["offsets"])
     time.sleep(1)
     bucket1.add_or_update_schema_to_bucket(schemaNormalId, "sentence",
                                            TargetType.document_surface1d,
                                            {})
     bucket1.add_or_update_schema_to_bucket(schemaOffsetsId, "token",
                                            TargetType.document_surface1d,
                                            {})
     time.sleep(1)
Example #16
    def post(self, corpusId):
        try:
            body = json.loads(self.request.body.decode("utf-8"))
            envId = get_env_id()
            authorization = get_autorisation(envId, None, None)
            bucketId = None
            bucketName = None

            if "id" in body:
                bucketId = body["id"]
            if "name" in body:
                bucketName = body["name"]

            if bucketId and not valid_es_id(bucketId):
                self.write_and_set_status(
                    {
                        MESSAGE:
                        "Invalid bucket id '{0}'. A bucket id may only contain "
                        "lowercase alphanumeric characters, '-' and '_'."
                        .format(bucketId)
                    }, HTTPStatus.UNPROCESSABLE_ENTITY)
                return

            bucket = get_master_document_corpus_list(envId, authorization). \
                get_corpus(corpusId).create_bucket(bucketName, bucketId)
            self.write_and_set_status({"id": bucket.id}, HTTPStatus.OK)
        except BucketAlreadyExistsException:
            self.write_and_set_status(
                {MESSAGE: "Bucket with the same id already exists"},
                HTTPStatus.CONFLICT)
        except CorpusNotFoundException as err:
            self.write_and_set_status(
                {
                    MESSAGE:
                    "Corpus does not exist.Extra info: '{0}'".format(err)
                }, HTTPStatus.UNPROCESSABLE_ENTITY)
        except Exception:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status(
                {
                    MESSAGE: "Internal server error",
                    TRACE: trace
                }, HTTPStatus.INTERNAL_SERVER_ERROR)
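
The id validation used here and in Example #8 goes through valid_es_id, which the excerpts do not show. Based on the error messages (lowercase alphanumeric plus '-' and '_'), a minimal sketch of such a check; the actual RACS implementation may differ:

import re

_ES_ID_PATTERN = re.compile(r"^[a-z0-9_-]+$")

def valid_es_id_sketch(candidate: str) -> bool:
    # Accept only lowercase alphanumeric characters, '-' and '_'.
    return bool(_ES_ID_PATTERN.match(candidate))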
Example #17
    def get_documents_zip(self, zipFileName: str = None):
        """
        Creates a zip of all documents of the corpus and returns the path to them.

        :param zipFileName: Name of the created zip file. If not supplied, it will be
                generated automatically. If it already exists, the existing file is replaced.
        :return: path to the created zip file
        """
        logger = logging.getLogger(__name__)
        self.tmpFileStorage = TmpFileStorage(zipFileName)
        self.tmpFileStorage.create_zip_file()
        es = get_es_conn()
        corpus = get_master_document_corpus_list(
            self.envId, self.authorization).get_corpus(self.corpusId)
        search = Search(using=es, index=corpus.languages_indices())
        search = search.source(["text"])
        search = search.params(scroll=get_scan_scroll_duration(),
                               size=get_nb_documents_per_scan_scroll())

        start = time.time()
        count = 0
        logger.info("Adding documents to zip: {0}".format(self.corpusId))
        for result in search.scan():
            self.tmpFileStorage.add_utf8_file(result.text,
                                              str(result.meta.id) + ".txt")
            count += 1
            if count % NB_OF_DOCUMENTS_TO_ADD_BEFORE_LOGGING == 0:
                end = time.time()
                logger.info(
                    "Time to add documents {0} to {1} : {2} seconds".format(
                        count - NB_OF_DOCUMENTS_TO_ADD_BEFORE_LOGGING, count,
                        end - start))
                start = end

        end = time.time()
        logger.info("Time to add documents {0} to {1} : {2} seconds".format(
            count - count % NB_OF_DOCUMENTS_TO_ADD_BEFORE_LOGGING, count,
            end - start))
        self.tmpFileStorage.close()

        return self.tmpFileStorage.zipPath
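
A hypothetical call site, where exporter stands in for whatever object owns get_documents_zip (the owning class and its constructor are not shown in these excerpts):

# Hypothetical usage; 'exporter' was built with envId, authorization and corpusId.
zip_path = exporter.get_documents_zip("corpus_docs.zip")
print("Documents archived at", zip_path)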
Example #18
File: corpus.py Project: crim-ca/RACS
    def get(self, corpusId):
        try:
            envId = get_env_id()
            authorization = get_autorisation(envId, None, None)
            corpora = get_master_document_corpus_list(envId, authorization)
            corpus = corpora.get_corpus(corpusId)
            info = {
                CORPUS_ID: corpus.id,
                CORPUS_LANGUAGES: corpus.languages,
                CORPUS_MODIFICATION_DATE: datetime_to_json_str(corpus.modificationDate),
                CORPUS_DOCUMENT_COUNT: corpus.get_documents_count()
            }
            self.write_and_set_status(info, HTTPStatus.OK)

        except CorpusNotFoundException:
            self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                      HTTPStatus.NOT_FOUND)

        except Exception:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status({MESSAGE: "Internal server error", TRACE: trace},
                                      HTTPStatus.INTERNAL_SERVER_ERROR)
Example #19
    def delete(self, corpusId, bucketId, schemaType):
        try:
            envId = get_env_id()
            authorization = get_autorisation(envId, None, None)

            bucket = get_master_document_corpus_list(
                envId, authorization).get_corpus(corpusId).get_bucket(bucketId)
            schemas = bucket.get_schemas_info(False)
            schemaTypes = [schema['schemaType'] for schema in schemas['data']]
            if schemaType not in schemaTypes:
                self.write_and_set_status(
                    {
                        MESSAGE:
                        "Schema Type: {0} does not exist".format(schemaType)
                    }, HTTPStatus.NOT_FOUND)
                return

            bucket.delete_schema_type(schemaType)
            self.write_and_set_status(None, HTTPStatus.NO_CONTENT)
        except CorpusNotFoundException as err:
            self.write_and_set_status(
                {
                    MESSAGE:
                    "Corpus does not exist.Extra info: '{0}'".format(err)
                }, HTTPStatus.NOT_FOUND)
        except BucketNotFoundException as err:
            self.write_and_set_status(
                {
                    MESSAGE:
                    "Bucket does not exist.Extra info: '{0}'".format(err)
                }, HTTPStatus.NOT_FOUND)
        except Exception:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status(
                {
                    MESSAGE: "Internal server error",
                    TRACE: trace
                }, HTTPStatus.INTERNAL_SERVER_ERROR)
Example #20
File: corpus.py Project: crim-ca/RACS
    def put(self, corpusId):
        try:
            body = self.request.body.decode("utf-8")
            envId = get_env_id()
            authorization = get_autorisation(envId, None, None)
            json_args = json.loads(body)

            try:
                languages = json_args.get(CORPUS_LANGUAGES, None)
                if languages:
                    languageManager = get_language_manager()
                    for language in languages:
                        if not languageManager.has_es_analyser(language):
                            self.write_and_set_status({MESSAGE: "Invalid language: " + language},
                                                      HTTPStatus.UNPROCESSABLE_ENTITY)
                            return
            except Exception:
                self.write_and_set_status({MESSAGE: "Invalid languages field: " + str(languages)},
                                          HTTPStatus.UNPROCESSABLE_ENTITY)
                return

            corpora = get_master_document_corpus_list(envId, authorization)
            corpus = corpora.update_corpus(corpusId, languages)
            self.write_and_set_status(None, HTTPStatus.NO_CONTENT)

        except CorpusInvalidFieldException as ci:
            self.write_and_set_status({MESSAGE: "Invalid field: {0}".format(ci)},
                                      HTTPStatus.UNPROCESSABLE_ENTITY)

        except CorpusNotFoundException:
            self.write_and_set_status({MESSAGE: "Specified corpus not found"},
                                      HTTPStatus.NOT_FOUND)

        except Exception:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status({MESSAGE: "Internal server error", TRACE: trace},
                                      HTTPStatus.INTERNAL_SERVER_ERROR)
Example #21
    def test_count_annotations_for_type_basic(self):
        """
            Test annotation count for a schemaType indexed as basic instead of noop.
            Note: it is unclear whether a schemaType whose main index differs from noop should be permitted at all.
        """
        global envIdReadOnly
        global authorizationReadOnly
        schema = json.loads(JSON_SCHEMA_WITH_SCHEMA_TYPE_BASIC)

        corpus = get_master_document_corpus_list(
            envIdReadOnly, authorizationReadOnly).create_corpus()
        bucket = corpus.create_bucket("bucket")
        schema_id = get_schema_list(
            envIdReadOnly, authorizationReadOnly).add_json_schema_as_hash(
                schema, False, nestedFields=["offsets"])
        time.sleep(1)

        schema_type = "CHUNK_ap"
        bucket.add_or_update_schema_to_bucket(schema_id, schema_type,
                                              TargetType("document_surface1d"),
                                              {})
        time.sleep(1)

        annotations = [{
            "_documentID": "98ff06a6-02dd-11e8-b82a-0242ac12001f",
            "_corpusID": "rqgbf20180126",
            "length": 14,
            "string": "contemporaines",
            "schemaType": "CHUNK_ap",
            "offsets": [{
                "end": 449,
                "begin": 435
            }],
        }, {
            "_documentID": "98ff06a6-02dd-11e8-b82a-0242ac12001f",
            "_corpusID": "rqgbf20180126",
            "length": 13,
            "string": "plus anciens,",
            "schemaType": "CHUNK_ap",
            "offsets": [{
                "end": 593,
                "begin": 580
            }],
        }, {
            "_documentID": "98ff06a6-02dd-11e8-b82a-0242ac12001f",
            "_corpusID": "rqgbf20180126",
            "length": 9,
            "string": "coloniale",
            "schemaType": "CHUNK_ap",
            "offsets": [{
                "end": 693,
                "begin": 684
            }],
        }]

        for annotation in annotations:
            bucket.add_annotation(annotation, schema_type)

        time.sleep(1)

        ds = DocumentSearch(envIdReadOnly, authorizationReadOnly, "doc1",
                            corpus.id)
        counts = ds.count_annotations_for_types(bucket.id, [schema_type])
        self.assertEqual(counts[schema_type], len(annotations))
Example #22
    def populateData(cls):
        global envIdReadOnly
        global authorizationReadOnly
        # Copied from the corpus test setup
        corpus = get_master_document_corpus_list(envIdReadOnly, authorizationReadOnly). \
            create_corpus(CORPUS_ID, languages=["fr-xx", "en-xx"])
        bucket1 = corpus.create_bucket("bucket1", "bucket1")
        bucket2 = corpus.create_bucket("bucket2", "bucket2")

        sentencesS = {
            "$schema": "http://json-schema.org/draft-04/schema#",
            "targetType": "document",
            "schemaType": "sentence",
            "type": "object",
            "required": ["schemaType", "_corpusID", "_documentID", "sentence"],
            "properties": {
                "schemaType": {
                    "type": "string",
                    "description": "Schema type",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                },
                "_documentID": {
                    "type": "string",
                    "description": "Internal document GUID",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                },
                "_corpusID": {
                    "type": "string",
                    "description": "Internal Corpus GUID",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                },
                "sentence": {
                    "type": "string",
                    "description": "Sentence in a document",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                }
            }
        }

        tokenS = {
            "$schema": "http://json-schema.org/draft-04/schema#",
            "targetType": "document_surface1d",
            "schemaType": "token",
            "type": "object",
            "required": ["schemaType", "_corpusID", "_documentID", "sentence"],
            "properties": {
                "schemaType": {
                    "type": "string",
                    "description": "Schema type",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                },
                "_documentID": {
                    "type": "string",
                    "description": "Internal document GUID",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                },
                "_corpusID": {
                    "type": "string",
                    "description": "Internal Corpus GUID",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                },
                "word": {
                    "type": "string",
                    "description": "Word in a document",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                },
                "length": {
                    "type": "integer",
                    "description": "Length of a word",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                },
                "category": {
                    "type": "string",
                    "description": "category of the word",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                },
                "offsets": {
                    "searchable": True,
                    "locked": True,
                    "type": "array",
                    "minItems": 1,
                    "items": {
                        "type": "object",
                        "properties": {
                            "begin": {
                                "type": "integer",
                                "minimum": 0
                            },
                            "end": {
                                "type": "integer",
                                "minimum": 0
                            }
                        }
                    }
                }
            }
        }

        tokenwithlemmaS = {
            "$schema": "http://json-schema.org/draft-04/schema#",
            "targetType": "document_surface1d",
            "schemaType": "tokenwithlemma",
            "type": "object",
            "required": ["schemaType", "_corpusID", "_documentID", "sentence"],
            "properties": {
                "schemaType": {
                    "type": "string",
                    "description": "Schema type",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                },
                "_documentID": {
                    "type": "string",
                    "description": "Internal document GUID",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                },
                "_corpusID": {
                    "type": "string",
                    "description": "Internal Corpus GUID",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                },
                "word": {
                    "type": "string",
                    "description": "Word in a document",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                },
                "length": {
                    "type": "integer",
                    "description": "Length of a word",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                },
                "lemma": {
                    "type": "string",
                    "description": "Lemma of a word",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                },
                "category": {
                    "type": "string",
                    "description": "category of the word",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                },
                "offsets": {
                    "searchable": True,
                    "locked": True,
                    "type": "array",
                    "minItems": 1,
                    "items": {
                        "type": "object",
                        "properties": {
                            "begin": {
                                "type": "integer",
                                "minimum": 0
                            },
                            "end": {
                                "type": "integer",
                                "minimum": 0
                            }
                        }
                    }
                }
            }
        }
        schemaList = get_schema_list(envIdReadOnly, authorizationReadOnly)
        sentenceSID = schemaList.add_json_schema(jsonSchema=sentencesS)
        tokenSID = schemaList.add_json_schema(jsonSchema=tokenS,
                                              nestedFields="offsets")
        tokenwithlemmaSID = schemaList.add_json_schema(
            jsonSchema=tokenwithlemmaS, nestedFields="offsets")

        bucket1.add_or_update_schema_to_bucket(
            sentenceSID, sentencesS["schemaType"],
            TargetType(sentencesS["targetType"]), {})
        bucket1.add_or_update_schema_to_bucket(
            tokenSID, tokenS["schemaType"], TargetType(tokenS["targetType"]),
            {})
        bucket1.add_or_update_schema_to_bucket(
            tokenwithlemmaSID, tokenwithlemmaS["schemaType"],
            TargetType(tokenwithlemmaS["targetType"]), {})
        bucket2.add_or_update_schema_to_bucket(
            tokenSID, tokenS["schemaType"], TargetType(tokenS["targetType"]),
            {})
        bucket2.add_or_update_schema_to_bucket(
            tokenwithlemmaSID, tokenwithlemmaS["schemaType"],
            TargetType(tokenwithlemmaS["targetType"]), {})
        time.sleep(1)
        #  sentences
        # bucket 1
        bucket1.add_annotation(
            {
                "_documentID":
                "doc1",
                "_corpusID":
                CORPUS_ID,
                "schemaType":
                "sentence",
                "sentence":
                "Les algorithmes de colonies de fourmis sont des algorithmes inspirés du comportement des fourmis."
            }, "sentence")

        bucket1.add_annotation(
            {
                "_documentID":
                ALICE_FR_DOC_ID,
                "_corpusID":
                CORPUS_ID,
                "schemaType":
                "sentence",
                "sentence":
                "Le café liégeois doit son appellation à la résistance de l’armée belge lors de la bataille des forts de Liège d’août 1914."
            }, "sentence")

        # bucket1
        # token
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "Les",
                "offsets": [{
                    "begin": 0,
                    "end": 3
                }],
                "length": 3,
                "category": "DET:ART"
            }, "token")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "algorithmes",
                "offsets": [{
                    "begin": 4,
                    "end": 15
                }],
                "length": 11,
                "category": "NOM"
            }, "token")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "de",
                "offsets": [{
                    "begin": 28,
                    "end": 30
                }, {
                    "begin": 16,
                    "end": 18
                }],
                "length": 2,
                "category": "PRP"
            }, "token")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "colonies",
                "offsets": [{
                    "begin": 19,
                    "end": 27
                }],
                "length": 8,
                "category": "NOM"
            }, "token")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "fourmis",
                "offsets": [{
                    "begin": 31,
                    "end": 38
                }],
                "length": 7,
                "category": "NOM"
            }, "token")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "sont",
                "offsets": [{
                    "begin": 39,
                    "end": 43
                }],
                "length": 4,
                "category": "VER:pres"
            }, "token")
        # some doc 2 annotations
        bucket1.add_annotation(
            {
                "_documentID": "doc2",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "des",
                "offsets": [{
                    "begin": 44,
                    "end": 47
                }],
                "length": 3,
                "category": "PRP:det"
            }, "token")
        bucket1.add_annotation(
            {
                "_documentID": "doc2",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "algorithmes",
                "offsets": [{
                    "begin": 48,
                    "end": 59
                }],
                "length": 11,
                "category": "NOM"
            }, "token")

        # tokenwithlemma
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "Le",
                "offsets": [{
                    "begin": 98,
                    "end": 100
                }],
                "length": 2,
                "lemma": "le",
                "category": "DET:ART"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "café",
                "offsets": [{
                    "begin": 101,
                    "end": 105
                }],
                "length": 4,
                "lemma": "café",
                "category": "NOM"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "liégeois",
                "offsets": [{
                    "begin": 106,
                    "end": 114
                }],
                "length": 8,
                "lemma": "liégeois",
                "category": "ADJ"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "doit",
                "offsets": [{
                    "begin": 115,
                    "end": 119
                }],
                "length": 4,
                "lemma": "devoir",
                "category": "VER:pres"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "son",
                "offsets": [{
                    "begin": 120,
                    "end": 123
                }],
                "length": 3,
                "lemma": "son",
                "category": "DET:POS"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "appellation",
                "offsets": [{
                    "begin": 124,
                    "end": 135
                }],
                "length": 11,
                "lemma": "appellation",
                "category": "NOM"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "à",
                "offsets": [{
                    "begin": 136,
                    "end": 137
                }],
                "length": 1,
                "lemma": "à",
                "category": "PRP"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "la",
                "offsets": [{
                    "begin": 138,
                    "end": 140
                }],
                "length": 2,
                "lemma": "le",
                "category": "DET:ART"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "résistance",
                "offsets": [{
                    "begin": 141,
                    "end": 151
                }],
                "length": 10,
                "lemma": "résistance",
                "category": "NOM"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "de",
                "offsets": [{
                    "begin": 152,
                    "end": 154
                }],
                "length": 2,
                "lemma": "de",
                "category": "PRP"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "l",
                "offsets": [{
                    "begin": 155,
                    "end": 156
                }],
                "length": 1,
                "lemma": None,
                "category": "NOM"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "armée",
                "offsets": [{
                    "begin": 157,
                    "end": 162
                }],
                "length": 5,
                "lemma": "armer",
                "category": "VER:pper"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "belge",
                "offsets": [{
                    "begin": 163,
                    "end": 168
                }],
                "length": 5,
                "lemma": "belge",
                "category": "ADJ"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "lors",
                "offsets": [{
                    "begin": 169,
                    "end": 173
                }],
                "length": 4,
                "lemma": "lors",
                "category": "ADV"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "de",
                "offsets": [{
                    "begin": 174,
                    "end": 176
                }],
                "length": 2,
                "lemma": "de",
                "category": "PRP"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "la",
                "offsets": [{
                    "begin": 177,
                    "end": 179
                }],
                "length": 2,
                "lemma": "le",
                "category": "DET:ART"
            }, "tokenwithlemma")
        bucket1.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "bataille",
                "offsets": [{
                    "begin": 180,
                    "end": 188
                }],
                "length": 8,
                "lemma": "bataille",
                "category": "NOM"
            }, "tokenwithlemma")

        # bucket2
        # token
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "algorithmes",
                "offsets": [{
                    "begin": 48,
                    "end": 59
                }],
                "length": 11,
                "category": "NOM"
            }, "token")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "inspirés",
                "offsets": [{
                    "begin": 60,
                    "end": 68
                }],
                "length": 8,
                "category": "VER:pper"
            }, "token")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "du",
                "offsets": [{
                    "begin": 69,
                    "end": 71
                }],
                "length": 2,
                "category": "PRP:det"
            }, "token")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "comportement",
                "offsets": [{
                    "begin": 72,
                    "end": 84
                }],
                "length": 12,
                "category": "NOM"
            }, "token")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "des",
                "offsets": [{
                    "begin": 85,
                    "end": 88
                }],
                "length": 3,
                "category": "PRP:det"
            }, "token")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": "fourmis",
                "offsets": [{
                    "begin": 89,
                    "end": 96
                }],
                "length": 7,
                "category": "NOM"
            }, "token")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "token",
                "word": ".",
                "offsets": [{
                    "begin": 96,
                    "end": 97
                }],
                "length": 1,
                "category": "SENT"
            }, "token")

        # tokenwithlemma
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "armée",
                "offsets": [{
                    "begin": 157,
                    "end": 162
                }],
                "length": 5,
                "lemma": "armer",
                "category": "VER:pper"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "belge",
                "offsets": [{
                    "begin": 163,
                    "end": 168
                }],
                "length": 5,
                "lemma": "belge",
                "category": "ADJ"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "lors",
                "offsets": [{
                    "begin": 169,
                    "end": 173
                }],
                "length": 4,
                "lemma": "lors",
                "category": "ADV"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "de",
                "offsets": [{
                    "begin": 174,
                    "end": 176
                }],
                "length": 2,
                "lemma": "de",
                "category": "PRP"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "la",
                "offsets": [{
                    "begin": 177,
                    "end": 179
                }],
                "length": 2,
                "lemma": "le",
                "category": "DET:ART"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "bataille",
                "offsets": [{
                    "begin": 180,
                    "end": 188
                }],
                "length": 8,
                "lemma": "bataille",
                "category": "NOM"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "des",
                "offsets": [{
                    "begin": 189,
                    "end": 192
                }],
                "length": 3,
                "lemma": "du",
                "category": "PRP:det"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "forts",
                "offsets": [{
                    "begin": 193,
                    "end": 198
                }],
                "length": 5,
                "lemma": "fort",
                "category": "NOM"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "de",
                "offsets": [{
                    "begin": 199,
                    "end": 201
                }],
                "length": 2,
                "lemma": "de",
                "category": "PRP"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "Liège",
                "offsets": [{
                    "begin": 202,
                    "end": 207
                }],
                "length": 5,
                "lemma": "Liège",
                "category": "NAM"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "d",
                "offsets": [{
                    "begin": 208,
                    "end": 209
                }],
                "length": 1,
                "lemma": None,
                "category": "VER:futu"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "août",
                "offsets": [{
                    "begin": 210,
                    "end": 214
                }],
                "length": 4,
                "lemma": "août",
                "category": "NOM"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": "1914",
                "offsets": [{
                    "begin": 215,
                    "end": 219
                }],
                "length": 4,
                "lemma": "@card@",
                "category": "NUM"
            }, "tokenwithlemma")
        bucket2.add_annotation(
            {
                "_documentID": "doc1",
                "_corpusID": CORPUS_ID,
                "schemaType": "tokenwithlemma",
                "word": ".",
                "offsets": [{
                    "begin": 219,
                    "end": 220
                }],
                "length": 1,
                "lemma": ".",
                "category": "SENT"
            }, "tokenwithlemma")
        time.sleep(1)

        corpus.add_text_document(
            id=ALICE_FR_DOC_ID,
            language="fr-xx",
            title="AU FOND DU TERRIER",
            source="https://www.gutenberg.org/files/55456/55456-0.txt",
            text=
            "ALICE, assise auprès de sa sœur sur le gazon, commençait à s'ennuyer de rester là à ne rien faire; "
            "une ou deux fois elle avait jeté les yeux sur le livre que lisait sa sœur; mais quoi! pas d'images, "
            "pas de dialogues! \"La belle avance,\" pensait Alice, \"qu'un livre sans images, sans causeries!\"."
        )

        corpus.add_text_document(
            id=ALICE_EN_DOC_ID,
            language="en-xx",
            title="Down the Rabbit-Hole",
            source="http://www.gutenberg.org/files/11/11-0.txt",
            text=
            "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing "
            "to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or "
            "conversations in it, ‘and what is the use of a book,’ thought Alice ‘without pictures or "
            "conversations?’")
        time.sleep(1)
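
The long run of add_annotation calls above is pure fixture data. When building similar fixtures, the repetition can be folded into a small helper; a minimal sketch, assuming only the add_annotation signature used above (the helper name and the sample tuples are illustrative, not part of the project):

def add_token_annotations(bucket, corpus_id, doc_id, tokens):
    # tokens: iterable of (word, begin, end, category) tuples
    for word, begin, end, category in tokens:
        bucket.add_annotation(
            {
                "_documentID": doc_id,
                "_corpusID": corpus_id,
                "schemaType": "token",
                "word": word,
                "offsets": [{"begin": begin, "end": end}],
                "length": end - begin,
                "category": category
            }, "token")

add_token_annotations(bucket2, CORPUS_ID, "doc1",
                      [("fourmis", 89, 96, "NOM"), (".", 96, 97, "SENT")])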
Example #23
    def test_get_schemas_info(self):
        jsonSchema1 = {
            "$schema": "http://json-schema.org/draft-04/schema#",
            "targetType": "document_surface1d",
            "schemaType": "schema1",
            "type": "object",
            "required": ["_schemaType", "_corpusID", "_documentID", "offsets"],
            "properties": {
                "_schemaType": {
                    "type": "string",
                    "description": "Schema type",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                },
                "_documentID": {
                    "type": "string",
                    "description": "Internal document GUID",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                },
                "_corpusID": {
                    "type": "string",
                    "description": "Internal Corpus GUID",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                }
            }
        }

        jsonSchema2 = {
            "$schema": "http://json-schema.org/draft-04/schema#",
            "targetType": "document_surface1d",
            "schemaType": "schema2",
            "type": "object",
            "required": [
                "_schemaType",
            ],
            "properties": {
                "_schemaType": {
                    "type": "string",
                    "description": "Schema type",
                    "searchable": True,
                    "searchModes": ["noop"],
                    "locked": True
                }
            }
        }

        corpus = get_master_document_corpus_list(
            self.envId, self.authorization).create_corpus("corpus1")
        bucket1 = corpus.create_bucket("bucket1")
        schemaId1 = get_schema_list(
            self.envId, self.authorization).add_json_schema_as_hash(
                jsonSchema1, False, {})
        schemaId2 = get_schema_list(
            self.envId, self.authorization).add_json_schema_as_hash(
                jsonSchema2, False, {})
        time.sleep(1)

        bucket1.add_or_update_schema_to_bucket(schemaId1, "schema1",
                                               TargetType("document"), {})
        bucket1.add_or_update_schema_to_bucket(schemaId2, "schema2",
                                               TargetType("document"), {})
        time.sleep(1)
        res = bucket1.get_schemas_info(True)
        self.assertEqual(len(res["data"]), 2)
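
The assertion above only checks the count. Each entry of res["data"] also carries the schemaType it was bound under (the PUT and POST handlers in Examples #25 and #26 read the same field), so the test could end with a slightly stronger check; a hedged sketch of two extra lines:

        # assumed shape: res["data"] entries are dicts with a "schemaType"
        # key, matching what the handlers in Examples #25 and #26 read
        schema_types = sorted(entry["schemaType"] for entry in res["data"])
        self.assertEqual(schema_types, ["schema1", "schema2"])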
Example #24
    def test_delete_schema_type(self):
        jsonSchema1 = {
            "$schema": "http://json-schema.org/draft-04/schema#",
            "targetType": "document_surface1d",
            "schemaType": "schema1",
            "type": "object",
            "properties": {
                "name": {
                    "type": "string",
                    "description": "Internal document GUID",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                }
            }
        }

        jsonSchema2 = {
            "$schema": "http://json-schema.org/draft-04/schema#",
            "targetType": "document_surface1d",
            "schemaType": "schema2",
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "Internal document GUID",
                    "searchable": True,
                    "searchModes": ["basic"],
                    "locked": True
                }
            }
        }

        corpus = get_master_document_corpus_list(
            self.envId, self.authorization).create_corpus("corpus1")
        bucket1 = corpus.create_bucket("bucket1")
        schemaId1 = get_schema_list(
            self.envId, self.authorization).add_json_schema_as_hash(
                jsonSchema1, False, {})
        schemaId2 = get_schema_list(
            self.envId, self.authorization).add_json_schema_as_hash(
                jsonSchema2, False, {})
        time.sleep(1)

        bucket1.add_or_update_schema_to_bucket(schemaId1, "schema1",
                                               TargetType("document"), {})
        bucket1.add_or_update_schema_to_bucket(schemaId2, "schema2",
                                               TargetType("document"), {})
        bucket1.add_annotation({"name": "Anton"}, "schema1", "1")
        bucket1.add_annotation({"name": "JF"}, "schema1", "2")
        bucket1.add_annotation({"city": "Montreal"}, "schema2", "1")
        bucket1.add_annotation({"city": "Quebec"}, "schema2", "2")
        time.sleep(1)

        anno1 = bucket1.get_annotation("1", "schema1")

        bucket1.delete_schema_type("schema1")
        time.sleep(1)

        info = bucket1.get_schemas_info()
        self.assertEqual(len(info["data"]), 1)
        # making sure the annotations remain in schema2
        anno1 = bucket1.get_annotation("1", "schema2")
        self.assertEqual(anno1["city"], "Montreal")
        anno2 = bucket1.get_annotation("2", "schema2")
        self.assertEqual(anno2["city"], "Quebec")

        # create the same schema type but with different data
        bucket1.add_or_update_schema_to_bucket(schemaId1, "schema1",
                                               TargetType("document"), {})
        bucket1.add_annotation({"name": "Yolo"}, "schema1", "3")
        bucket1.add_annotation({"name": "Rage"}, "schema1", "4")
        time.sleep(1)
        # make sure the old annotations no longer exist
        anno1 = bucket1.get_annotation("3", "schema1")
        self.assertEqual(anno1["name"], "Yolo")
        anno2 = bucket1.get_annotation("4", "schema1")
        self.assertEqual(anno2["name"], "Rage")
        self.assertRaises(DocumentNotFoundException, bucket1.get_annotation,
                          "1", "schema1")
Example #25
    def put(self, corpusId, bucketId):
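        """Update the schema bound to the bucket under body["schemaType"]."""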
        try:
            body = self.strip_body_bom()
            envId = get_env_id()
            authorization = get_autorisation(envId, None, None)

            if is_missing_required_fields(
                    body, ["targetType", "schemaType", "properties"]):
                self.write_and_set_status(
                    {
                        MESSAGE:
                        missing_fields_message(
                            body, ["targetType", "schemaType", "properties"])
                    }, HTTPStatus.UNPROCESSABLE_ENTITY)
                return

            schemaType = body["schemaType"]
            targetTypeName = body["targetType"]
            if not TargetType.has(targetTypeName):
                self.write_and_set_status(
                    {
                        MESSAGE:
                        "Target type {0} not supported".format(targetTypeName)
                    }, HTTPStatus.UNPROCESSABLE_ENTITY)
                return

            # Is there currently a schema of schemaType associated with the bucket?
            bucket = get_master_document_corpus_list(
                envId, authorization).get_corpus(corpusId).get_bucket(bucketId)
            schemas = bucket.get_schemas_info(False)
            schemaTypes = [schema['schemaType'] for schema in schemas['data']]
            if schemaType not in schemaTypes:
                self.write_and_set_status(
                    {
                        MESSAGE:
                        "There is no schema with the schemaType '{0}' currently bound to the bucket."
                        .format(schemaType)
                    }, HTTPStatus.NOT_FOUND)
                return

            # check whether a schema with the same hash as the submitted one already exists:
            targetType = TargetType(targetTypeName)
            nestedFields = []
            if targetType == TargetType.document_surface1d:
                nestedFields.append("offsets")
            schemaId = get_schema_list(envId,
                                       authorization).add_json_schema_as_hash(
                                           body, False, nestedFields)
            bucket.add_or_update_schema_to_bucket(schemaId, schemaType,
                                                  targetType, {})

            self.write_and_set_status(None, HTTPStatus.NO_CONTENT)
        except EsSchemaMigrationInvalidException as err:
            self.write_and_set_status(
                {
                    MESSAGE:
                    "Cannot update the schema: the changes are not compatible "
                    "with documents stored under the old schema. Extra info: '{0}'"
                    .format(err)
                }, HTTPStatus.UNPROCESSABLE_ENTITY)
        except EsSchemaMigrationDeleteFieldsNotSupportedException as err:
            self.write_and_set_status(
                {
                    MESSAGE:
                    "Cannot delete fields from an existing schema. "
                    "Missing fields: '{0}'".format(err)
                }, HTTPStatus.UNPROCESSABLE_ENTITY)
        except CorpusNotFoundException as err:
            self.write_and_set_status(
                {
                    MESSAGE:
                    "Corpus does not exist. Extra info: '{0}'".format(err)
                }, HTTPStatus.UNPROCESSABLE_ENTITY)
        except BucketNotFoundException as err:
            self.write_and_set_status(
                {
                    MESSAGE:
                    "Bucket does not exist. Extra info: '{0}'".format(err)
                }, HTTPStatus.UNPROCESSABLE_ENTITY)
        except Exception:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status(
                {
                    MESSAGE: "Internal server error",
                    TRACE: trace
                }, HTTPStatus.INTERNAL_SERVER_ERROR)
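
For reference, a body accepted by this handler must carry targetType, schemaType and properties, and the targetType must pass TargetType.has. An illustrative body, modeled on the jsonSchema1 fixture from Example #23 rather than taken from any project documentation:

body = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "targetType": "document_surface1d",  # makes the handler nest "offsets"
    "schemaType": "schema1",  # must already be bound to the bucket, else 404
    "type": "object",
    "properties": {
        "word": {
            "type": "string",
            "searchable": True,
            "searchModes": ["basic"]
        }
    }
}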
Example #26
    def post(self, corpusId, bucketId):
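        """Bind a new schema to the bucket; body["schemaType"] must not already be bound."""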
        try:
            body = self.strip_body_bom()
            envId = get_env_id()
            authorization = get_autorisation(envId, None, None)

            if is_missing_required_fields(
                    body, ["targetType", "schemaType", "properties"]):
                self.write_and_set_status(
                    {
                        MESSAGE:
                        missing_fields_message(
                            body, ["targetType", "schemaType", "properties"])
                    }, HTTPStatus.UNPROCESSABLE_ENTITY)
                return

            schemaType = body["schemaType"]
            targetTypeName = body["targetType"]
            if not TargetType.has(targetTypeName):
                self.write_and_set_status(
                    {
                        MESSAGE:
                        "Target type {0} not supported".format(targetTypeName)
                    }, HTTPStatus.UNPROCESSABLE_ENTITY)
                return

            bucket = get_master_document_corpus_list(
                envId, authorization).get_corpus(corpusId).get_bucket(bucketId)
            schemas = bucket.get_schemas_info(False)
            schemaTypes = [schema['schemaType'] for schema in schemas['data']]
            if schemaType in schemaTypes:
                self.write_and_set_status(
                    {
                        MESSAGE:
                        "A schema with the schemaType '{0}' is already bound to the bucket."
                        .format(schemaType)
                    }, HTTPStatus.FORBIDDEN)
                return

            # check whether a schema with the same hash as the submitted one already exists:
            targetType = TargetType(targetTypeName)
            nestedFields = []
            if targetType == TargetType.document_surface1d:
                nestedFields.append("offsets")
            schemaId = get_schema_list(envId,
                                       authorization).add_json_schema_as_hash(
                                           body, False, nestedFields)
            bucket.add_or_update_schema_to_bucket(schemaId, schemaType,
                                                  targetType, {})

            self.write_and_set_status(None, HTTPStatus.NO_CONTENT)
        except CorpusNotFoundException as err:
            self.write_and_set_status(
                {
                    MESSAGE:
                    "Corpus does not exist. Extra info: '{0}'".format(err)
                }, HTTPStatus.UNPROCESSABLE_ENTITY)
        except BucketNotFoundException as err:
            self.write_and_set_status(
                {
                    MESSAGE:
                    "Bucket does not exist. Extra info: '{0}'".format(err)
                }, HTTPStatus.UNPROCESSABLE_ENTITY)
        except SchemaBindingInvalid as err:
            self.write_and_set_status(
                {MESSAGE: "Schema Binding Invalid: '{0}'".format(err)},
                HTTPStatus.UNPROCESSABLE_ENTITY)
        except Exception:
            trace = traceback.format_exc().splitlines()
            self.write_and_set_status(
                {
                    MESSAGE: "Internal server error",
                    TRACE: trace
                }, HTTPStatus.INTERNAL_SERVER_ERROR)
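
Taken together, this POST handler mirrors the PUT handler from Example #25 but inverts the binding check: POST refuses a schemaType that is already bound to the bucket (HTTP 403 FORBIDDEN), while PUT refuses one that is not yet bound (HTTP 404 NOT FOUND). Both delegate deduplication to add_json_schema_as_hash which, judging by its name, stores each schema keyed by a hash of its contents.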