예제 #1
0
def sync_document_terms(db_session=None):
    """Performs term extraction from known documents.

    For every document returned by ``get_all``, the document text is fetched
    through the active storage plugin, terms are extracted with
    ``route_service.get_terms`` and the ``Term`` rows whose text matches one
    of the extracted terms (compared upper-cased) are assigned to
    ``doc.terms`` and committed.

    Failures are isolated per document: the exception is logged, the session
    is rolled back and processing continues with the next document.

    Args:
        db_session: Database session used for the queries and commits.
    """
    documents = get_all(db_session=db_session)

    for doc in documents:
        log.debug(f"Processing document. Name: {doc.name}")
        p = plugin_service.get_active(db_session=db_session,
                                      plugin_type="storage")

        try:
            # resource types containing "sheet" are requested as CSV,
            # everything else as plain text
            mime_type = "text/csv" if "sheet" in doc.resource_type else "text/plain"

            doc_text = p.instance.get(doc.resource_id, mime_type)
            extracted_terms = route_service.get_terms(db_session=db_session,
                                                      text=doc_text)

            # log before querying so the extracted terms are visible even if
            # the lookup below fails
            log.debug(
                f"Extracted the following terms from {doc.weblink}. Terms: {extracted_terms}"
            )

            # case-insensitive match of extracted terms against known terms
            matched_terms = (db_session.query(Term).filter(
                func.upper(Term.text).in_(
                    [func.upper(t) for t in extracted_terms])).all())

            if matched_terms:
                doc.terms = matched_terms
                db_session.commit()

        except Exception as e:
            # even if one document fails we don't want them to all fail
            log.exception(e)
            # a failed flush/commit leaves the session unusable until it is
            # rolled back; without this every following document would fail
            if db_session is not None:
                db_session.rollback()
예제 #2
0
def sync_document_terms(db_session=None):
    """Performs term extraction from known documents.

    For every document returned by ``get_all``, the document text is fetched
    through the storage plugin, terms are extracted with
    ``route_service.get_terms`` and the ``Term`` rows whose text matches one
    of the extracted terms (compared upper-cased) are assigned to
    ``doc.terms`` and committed.

    Failures are isolated per document: the exception is reported to Sentry
    and logged, the session is rolled back and processing continues with the
    next document.

    Args:
        db_session: Database session used for the queries and commits.
    """
    documents = get_all(db_session=db_session)

    for doc in documents:
        log.debug(f"Processing document. Name: {doc.name}")
        p = plugins.get(
            INCIDENT_PLUGIN_STORAGE_SLUG
        )  # this may need to be refactored if we support multiple document types

        try:
            # resource types containing "sheet" are requested as CSV,
            # everything else as plain text
            mime_type = "text/csv" if "sheet" in doc.resource_type else "text/plain"

            doc_text = p.get(doc.resource_id, mime_type)
            extracted_terms = route_service.get_terms(
                db_session=db_session, model=Term, text=doc_text
            )

            # log before querying so the extracted terms are visible even if
            # the lookup below fails
            log.debug(f"Extracted the following terms from {doc.weblink}. Terms: {extracted_terms}")

            # case-insensitive match of extracted terms against known terms
            matched_terms = (
                db_session.query(Term)
                .filter(func.upper(Term.text).in_([func.upper(t) for t in extracted_terms]))
                .all()
            )

            if matched_terms:
                doc.terms = matched_terms
                db_session.commit()

        except Exception as e:
            # even if one document fails we don't want them to all fail
            sentry_sdk.capture_exception(e)
            log.exception(e)
            # a failed flush/commit leaves the session unusable until it is
            # rolled back; without this every following document would fail
            if db_session is not None:
                db_session.rollback()