示例#1
0
def audit_phase_2(audit, kind=None):
    """Run the processing pass over the documents attached to an audit.

    Fetches the ids of the audit's documents that are in the ``Preprocessed``
    or ``Error`` state, appends the ids listed under the audit's ``charters``
    key, and calls ``processor.process`` for every document for which
    ``need_analysis(jdoc)`` and ``jdoc.isPreprocessed()`` both hold. Documents
    whose type has no registered processor are logged and skipped. Finally
    ``change_audit_status(audit, "Finalizing")`` is called.

    Args:
        audit: Audit record; ``audit["_id"]`` and
            ``audit["subsidiary"]["name"]`` are read, plus the optional
            ``charters`` entry.
        kind: Optional document-kind filter forwarded to
            ``get_docs_by_audit_id``.
    """
    ctx = AuditContext(audit["subsidiary"]["name"])

    # Goes through the logger (not print) so it follows the same logging
    # configuration as every other message of this function and of
    # audit_phase_1.
    logger.info(f'.....processing audit {audit["_id"]}')

    document_ids = get_docs_by_audit_id(
        audit["_id"],
        states=[DocumentState.Preprocessed.value, DocumentState.Error.value],
        kind=kind,
        id_only=True)

    # `or []` also covers an audit whose "charters" entry is present but None.
    document_ids.extend(audit.get("charters") or [])
    total = len(document_ids)

    for k, document_id in enumerate(document_ids):
        jdoc = DbJsonDoc(finalizer.get_doc_by_id(document_id))

        processor: BaseProcessor = document_processors.get(jdoc.documentType)
        if processor is None:
            logger.warning(
                f'unknown/unsupported doc type: {jdoc.documentType}, cannot process {document_id}'
            )
            continue

        if need_analysis(jdoc) and jdoc.isPreprocessed():
            logger.info(
                f'.....processing  {k} of {total}   {jdoc.documentType} {document_id}'
            )
            processor.process(jdoc, audit, ctx)

    # TODO: check ALL docs in proper state
    change_audit_status(audit, "Finalizing")
示例#2
0
def audit_phase_1(audit, kind=None):
    """Run the pre-processing pass over the documents attached to an audit.

    Fetches the ids of the audit's documents in the ``New`` state, appends the
    ids listed under the audit's ``charters`` key, and calls
    ``processor.preprocess`` for every document for which
    ``need_analysis(jdoc)`` and ``jdoc.isNew()`` both hold. Documents whose
    type has no registered processor are logged and skipped.

    Args:
        audit: Audit record; ``audit["_id"]`` and
            ``audit["subsidiary"]["name"]`` are read, plus the optional
            ``charters`` entry.
        kind: Optional document-kind filter forwarded to
            ``get_docs_by_audit_id``.
    """
    logger.info(f'.....processing audit {audit["_id"]}')
    context = AuditContext(audit["subsidiary"]["name"])

    doc_ids = get_docs_by_audit_id(
        audit["_id"],
        states=[DocumentState.New.value],
        kind=kind,
        id_only=True,
    )
    doc_ids.extend(audit.get("charters", []))
    total = len(doc_ids)

    for index, doc_id in enumerate(doc_ids):
        jdoc = DbJsonDoc(finalizer.get_doc_by_id(doc_id))
        doc_type = jdoc.documentType

        processor: BaseProcessor = document_processors.get(doc_type)
        if processor is None:
            logger.warning(
                f'unknown/unsupported doc type: {doc_type}, cannot process {doc_id}'
            )
            continue

        # Progress is logged for every supported document, even when the
        # check below decides that no pre-processing is needed.
        logger.info(
            f'......pre-processing {index} of {total}  {doc_type}:{doc_id}'
        )
        if need_analysis(jdoc) and jdoc.isNew():
            processor.preprocess(jdoc=jdoc, context=context)
示例#3
0
    def embedd_large(self, text_map, max_tokens=6000, log_addon=''):
        """Embed a long token sequence window by window and join the results.

        ``text_map`` is walked in steps of ``max_tokens``. For each step the
        slice handed to ``self.embedd_tokens`` extends ``max_tokens // 20``
        tokens past the window, and only the first ``max_tokens`` items of the
        returned embeddings are kept. The kept pieces are concatenated in
        order.

        Args:
            text_map: Token sequence supporting ``len()`` and slicing.
            max_tokens: Window size in tokens; must be positive.
            log_addon: Prefix added to every log message.

        Returns:
            The concatenated embeddings; the single window's embeddings
            unchanged when only one window is needed; ``None`` when
            ``text_map`` is empty.

        Raises:
            ValueError: If ``max_tokens`` is not positive.
        """
        if max_tokens <= 0:
            # A zero window divides by zero below and a negative one would
            # never advance through text_map.
            raise ValueError(f'max_tokens must be positive, got {max_tokens}')

        total = len(text_map)
        elmo_logger.info(f'{log_addon} {total} max_tokens={max_tokens}')
        overlap = max_tokens // 20

        # Ceiling division: the number of windows the loop below really visits
        # (a length that is an exact multiple must not be counted one extra).
        number_of_windows = -(-total // max_tokens)

        msg = f"{log_addon} Document is too large for embedding: {total} tokens. Splitting into {number_of_windows} windows overlapping with {overlap} tokens "
        elmo_logger.warning(msg)

        # Collect the pieces and join them once at the end instead of
        # re-copying the growing array on every window.
        chunks = []
        for start in range(0, total, max_tokens):
            subtokens: Tokens = text_map[start:start + max_tokens + overlap]
            elmo_logger.debug(
                f"{log_addon} Embedding region: {start}, {len(subtokens)}")

            # Drop the embeddings of the overlap tail; the next window
            # starts exactly there.
            chunks.append(self.embedd_tokens(subtokens)[0:max_tokens])

        if not chunks:
            return None
        if len(chunks) == 1:
            # Hand back the single result as-is, without an extra copy.
            return chunks[0]
        return np.concatenate(chunks)
示例#4
0
    def test_get_docs_by_audit_id(self):
        """Print the id and filename of each PROTOCOL document of the first audit.

        Only logs a warning and returns when ``get_audits()`` yields nothing.
        """
        all_audits = get_audits()
        if not all_audits:
            logger.warning('no audits')
            return

        first_audit_id = all_audits[0]['_id']
        for doc in get_docs_by_audit_id(first_audit_id, kind='PROTOCOL'):
            print(doc['_id'], doc['filename'])
示例#5
0
    def test_process_charters_phase_1(self):
        """Pre-process every CHARTER document of the first audit.

        Only logs a warning and returns when ``get_audits()`` yields nothing.
        """
        all_audits = get_audits()
        if not all_audits:
            logger.warning('no audits')
            return

        first_audit_id = all_audits[0]['_id']
        charter_docs = get_docs_by_audit_id(first_audit_id, kind='CHARTER')
        charter_processor = document_processors.get('CHARTER')

        for raw_doc in charter_docs:
            jdoc = DbJsonDoc(raw_doc)
            # A fresh AuditContext is created for each document.
            charter_processor.preprocess(jdoc, AuditContext())
示例#6
0
    def test_process_contracts_phase_1(self):
        """Pre-process every CONTRACT document of the first audit.

        Only logs a warning and returns when ``get_audits()`` yields nothing.
        """
        all_audits = get_audits()
        if not all_audits:
            logger.warning('no audits')
            return

        first_audit_id = all_audits[0]['_id']
        contract_docs = get_docs_by_audit_id(first_audit_id, kind='CONTRACT')
        contract_processor = document_processors.get('CONTRACT')

        for raw_doc in contract_docs:
            jdoc = DbJsonDoc(raw_doc)
            # A fresh AuditContext is created for each document.
            contract_processor.preprocess(jdoc, AuditContext())
示例#7
0
    def test_get_org_names(self):
        """Run ``find_org_date_number`` on each charter of the first audit.

        For every document returned for ``kind=CHARTER`` the filename is
        printed, a ``CharterDocument`` is built from the stored ``parse``
        data, the parser is run on it, and the document's tags are printed.
        Only logs a warning and returns when ``get_audits()`` yields nothing.
        """
        charter_parser = CharterParser()

        all_audits = get_audits()
        if not all_audits:
            logger.warning('no audits')
            return

        first_audit_id = all_audits[0]['_id']
        charter_docs = get_docs_by_audit_id(first_audit_id, kind=CHARTER)

        for db_doc in charter_docs:
            print(db_doc['filename'])

            charter: CharterDocument = join_paragraphs(
                db_doc['parse'], doc_id=db_doc['_id'])

            # TODO: mind, this could be slow if embedding is required
            charter_parser.find_org_date_number(charter, AuditContext())

            for found_tag in charter.get_tags():
                print(found_tag)