Exemplo n.º 1
0
def stop_matched_holdingpen_wfs(obj, eng):
    """Halt every holdingpen workflow matched by the current one.

    Each workflow listed in ``obj.extra_data['holdingpen_matches']`` gets its
    steps replaced by a short sequence built here: a ``mark`` step storing the
    current workflow's id under ``'stopped-by-wf'`` (for traceability),
    followed by ``stop_processing``. That sequence is then run on the matched
    workflow.

    When the same article is harvested twice, this is used to stop the first
    workflow and let the current one be processed, since it has the latest
    metadata.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None
    """
    current_wf_id = int(obj.id)
    halting_steps = [mark('stopped-by-wf', current_wf_id), stop_processing]

    # Save the current workflow before running anything on the matched ones.
    save_workflow(obj, eng)

    matched_ids = obj.extra_data['holdingpen_matches']
    for matched_id in matched_ids:
        matched_wf = workflow_object_class.get(matched_id)
        matched_eng = WorkflowEngine.from_uuid(matched_wf.id_workflow)

        # Swap the matched workflow's steps for the halting ones and run them.
        matched_eng.callbacks.replace(halting_steps)
        matched_eng.process([matched_wf])
Exemplo n.º 2
0
def stop_matched_holdingpen_wfs(obj, eng):
    """Halt every holdingpen workflow matched by the current one.

    Each workflow listed in ``obj.extra_data['holdingpen_matches']`` gets its
    steps replaced by a short sequence built here: a ``mark`` step storing the
    current workflow's id under ``'stopped-by-wf'`` (for traceability),
    followed by ``stop_processing``. That sequence is then run on the matched
    workflow.

    When the same article is harvested twice, this is used to stop the first
    workflow and let the current one be processed, since it has the latest
    metadata.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used by this step).

    Returns:
        None
    """
    current_wf_id = int(obj.id)
    halting_steps = [mark('stopped-by-wf', current_wf_id), stop_processing]

    # Save the current workflow object before running the matched ones.
    obj.save()

    matched_ids = obj.extra_data['holdingpen_matches']
    for matched_id in matched_ids:
        matched_wf = workflow_object_class.get(matched_id)
        matched_eng = WorkflowEngine.from_uuid(matched_wf.id_workflow)

        # Swap the matched workflow's steps for the halting ones and run them.
        matched_eng.callbacks.replace(halting_steps)
        matched_eng.process([matched_wf])
def test_mark_overwrites():
    """A ``mark`` step replaces an existing ``extra_data`` value and returns None."""
    workflow_obj = MockObj({}, {'foo': 'bar'})
    engine = MockEng()

    overwrite_step = mark('foo', 'baz')
    outcome = overwrite_step(workflow_obj, engine)

    assert outcome is None
    assert workflow_obj.extra_data == {'foo': 'baz'}
def test_mark():
    """A ``mark`` step adds the key/value to empty ``extra_data`` and returns None."""
    workflow_obj = MockObj({}, {})
    engine = MockEng()

    mark_step = mark('foo', 'bar')
    outcome = mark_step(workflow_obj, engine)

    assert outcome is None
    assert workflow_obj.extra_data == {'foo': 'bar'}
def test_mark_overwrites():
    """Check that ``mark`` overwrites a key already present in ``extra_data``."""
    target = MockObj({}, {'foo': 'bar'})
    fake_engine = MockEng()

    step = mark('foo', 'baz')
    returned = step(target, fake_engine)

    # The step signals nothing back; it only mutates ``extra_data``.
    assert returned is None
    assert target.extra_data == {'foo': 'baz'}
def test_mark():
    """Check that ``mark`` stores a new key in an empty ``extra_data``."""
    target = MockObj({}, {})
    fake_engine = MockEng()

    step = mark('foo', 'bar')
    returned = step(target, fake_engine)

    # The step signals nothing back; it only mutates ``extra_data``.
    assert returned is None
    assert target.extra_data == {'foo': 'bar'}
Exemplo n.º 7
0
            ticket_id_key="ticket_id"
        ),
    ),
    do_not_repeat('reply_ticket_user_new_submission')(
        reply_ticket(
            template="literaturesuggest/tickets/user_submitted.html",
            context_factory=reply_ticket_context,
            keep_new=True
        ),
    )
]

# Steps that record the 'auto-approved' mark for the record being processed.
# NOTE(review): assumes IF_ELSE(condition, steps_if_true, steps_if_false) --
# confirm against the workflow library.
CHECK_AUTO_APPROVE = [
    IF_ELSE(
        is_submission,
        # Submission branch: the mark is set to False.
        mark('auto-approved', False),
        # Non-submission branch: the mark depends on ``auto_approve``.
        IF_ELSE(
            auto_approve,
            [
                mark('auto-approved', True),
                set_core_in_extra_data,
            ],
            mark('auto-approved', False),
        ),
    ),
]

ENHANCE_RECORD = [
    IF(
        is_arxiv_paper,
        [
Exemplo n.º 8
0
# Ticket steps for a new submission: one ticket-creation step and one
# ticket-reply step, each wrapped with ``do_not_repeat`` under its own key.
NOTIFY_SUBMISSION = [
    do_not_repeat('create_ticket_curator_new_submission')(
        create_ticket(
            template="literaturesuggest/tickets/curator_submitted.html",
            queue="HEP_add_user",
            context_factory=new_ticket_context,
            ticket_id_key="ticket_id"
        ),
    ),
    do_not_repeat('reply_ticket_user_new_submission')(
        reply_ticket(
            template="literaturesuggest/tickets/user_submitted.html",
            context_factory=reply_ticket_context,
            keep_new=True
        ),
    )
]

# Steps that record the 'auto-approved' mark for the record being processed.
# NOTE(review): assumes IF_ELSE(condition, steps_if_true, steps_if_false) --
# confirm against the workflow library.
CHECK_AUTO_APPROVE = [
    IF_ELSE(
        is_submission,
        # Submission branch: the mark is set to False.
        mark('auto-approved', False),
        # Non-submission branch: the mark depends on ``auto_approve``.
        IF_ELSE(
            auto_approve,
            [
                mark('auto-approved', True),
                set_core_in_extra_data,
            ],
            mark('auto-approved', False),
        ),
    ),
]

ENHANCE_RECORD = [
    IF(is_arxiv_paper, [
        populate_arxiv_document,
        arxiv_package_download,
Exemplo n.º 9
0
class Article(object):
    """Article ingestion workflow for Literature collection.

    ``workflow`` is the ordered list of steps applied to each record. It is
    made of three stages:

    1. Matching: mark records already known (``'match-found'``,
       ``'already-in-holding-pen'``), open tickets for submissions, and
       delete or stop harvested duplicates.
    2. Processing: arXiv-specific extraction, journal info, classification
       and the category/keyword/coreness guesses.
    3. Decision: halt for approval or reject, then upload or store accepted
       records and send the ticket replies.

    Attributes:
        name: workflow name (``"HEP"``).
        data_type: data type handled by the workflow (``"hep"``).
        workflow: the list of workflow steps.
    """
    name = "HEP"
    data_type = "hep"

    workflow = [
        # Make sure schema is set for proper indexing in Holding Pen
        set_schema,
        # Emit record signals to receive metadata enrichment
        emit_record_signals,
        # Query locally or via legacy search API to see if article
        # is already ingested and this is an update
        IF(article_exists, [
            mark('match-found', True),
        ]),
        IF_ELSE(
            is_submission,
            [
                # Article matching for submissions
                # ================================
                IF(pending_in_holding_pen, [
                    mark('already-in-holding-pen', True),
                ]),
                # Special RT integration for submissions
                # ======================================
                create_ticket(
                    template="literaturesuggest/tickets/curator_submitted.html",
                    queue="HEP_add_user",
                    context_factory=new_ticket_context,
                    ticket_id_key="ticket_id"),
                reply_ticket(
                    template="literaturesuggest/tickets/user_submitted.html",
                    context_factory=reply_ticket_context,
                    keep_new=True),
            ],
            [
                # Article matching for non-submissions
                # ====================================
                # Query holding pen to see if we already have this article ingested
                #
                # NOTE on updates:
                #     If the same article has been harvested before and the
                #     ingestion has been completed, process is continued
                #     to allow for updates.
                IF(pending_in_holding_pen, [
                    mark('already-in-holding-pen', True),
                    mark('delete', True),
                ]),
                IF(
                    is_arxiv_paper,
                    [
                        # FIXME: This filtering step should be removed when this
                        #        workflow includes arXiv CORE harvesting
                        IF(already_harvested, [
                            mark('already-ingested', True),
                            mark('stop', True),
                        ]),
                        # FIXME: This filtering step should be removed when:
                        #        old previously rejected records are treated
                        #        differently e.g. good auto-reject heuristics or better
                        #        time based filtering (5 days is quite random now).
                        IF(previously_rejected(), [
                            mark('already-ingested', True),
                            mark('stop', True),
                        ]),
                    ]),
                # Act on the marks set above: 'delete' takes precedence
                # because it is checked first.
                IF(is_marked('delete'),
                   [update_old_object, delete_self_and_stop_processing]),
                IF(is_marked('stop'), [stop_processing]),
            ]),
        #
        # Article Processing
        # ==================
        IF(is_arxiv_paper, [
            arxiv_fulltext_download,
            arxiv_plot_extract,
            arxiv_refextract,
            arxiv_author_list("authorlist2marcxml.xsl"),
        ]),
        extract_journal_info,
        classify_paper(
            taxonomy="HEPont.rdf",
            only_core_tags=False,
            spires=True,
            with_author_keywords=True,
        ),
        filter_core_keywords,
        guess_categories,
        IF(is_experimental_paper, [
            guess_experiments,
        ]),
        guess_keywords,
        # Predict action for a generic HEP paper based only on title
        # and abstract.
        guess_coreness,  # ("arxiv_skip_astro_title_abstract.pickle)
        # Check if we shall halt or auto-reject
        # =====================================
        # NOTE: User submissions are always relevant
        IF_ELSE(is_record_relevant, [
            halt_record(action="hep_approval"),
        ], [reject_record("Article automatically rejected"), stop_processing]),
        IF_ELSE(is_record_accepted, [
            IF(article_exists, [
                IF_ELSE(is_submission, [
                    reject_record('Article was already found on INSPIRE'),
                    # Notify the user and close the ticket *before* stopping:
                    # steps listed after ``stop_processing`` are not expected
                    # to run, so it has to be the last step of this branch.
                    reply_ticket(
                        template=(
                            "literaturesuggest/tickets/"
                            "user_rejected_exists.html"
                        ),
                        context_factory=reply_ticket_context),
                    close_ticket(ticket_id_key="ticket_id"),
                    stop_processing,
                ], [
                    halt_record(action="merge_approval"),
                ]),
            ]),
            add_core,
            add_note_entry,
            filter_keywords,
            user_pdf_get,
            IF_ELSE(shall_push_remotely, [
                # Existing article: send a correction; otherwise insert it.
                IF_ELSE(article_exists, [
                    prepare_update_payload(extra_data_key="update_payload"),
                    send_robotupload(marcxml_processor=hep2marc,
                                     mode="correct",
                                     extra_data_key="update_payload"),
                ], [
                    send_robotupload(marcxml_processor=hep2marc,
                                     mode="insert"),
                ])
            ], [store_record]),
            IF(is_submission, [
                IF(curation_ticket_needed, [
                    create_ticket(
                        template="literaturesuggest/tickets/curation_core.html",
                        queue="HEP_curation",
                        context_factory=curation_ticket_context,
                        ticket_id_key="curation_ticket_id")
                ]),
                reply_ticket(
                    template="literaturesuggest/tickets/user_accepted.html",
                    context_factory=reply_ticket_context),
            ]),
        ], [
            # Record not accepted: only submissions get a ticket reply.
            IF(is_submission,
               [reply_ticket(context_factory=reply_ticket_context)])
        ]),
        close_ticket(ticket_id_key="ticket_id")
    ]
Exemplo n.º 10
0
        arxiv_author_list("authorlist2marcxml.xsl"),
    ]),
    IF(
        is_submission,
        populate_submission_document,
    ),
    download_documents,
    normalize_journal_titles,
    refextract,
    count_reference_coreness,
    extract_journal_info,
    populate_journal_coverage,
]

# Set each of the marks below to ``None``, then run ``save_workflow``.
INIT_MARKS = [
    mark('auto-approved', None),
    mark('already-in-holding-pen', None),
    mark('previously_rejected', None),
    mark('is-update', None),
    mark('stopped-matched-holdingpen-wf', None),
    mark('approved', None),
    mark('unexpected-workflow-path', None),
    # Final step of the list, kept on its own line so it is not overlooked.
    save_workflow
]

# Pre-processing steps, listed in the order they appear in the workflow.
PRE_PROCESSING = [
    # Make sure schema is set for proper indexing in Holding Pen
    set_schema,
    # Nested list of steps (the whole INIT_MARKS list is one entry here).
    INIT_MARKS,
    # NOTE(review): presumably validates the record against the 'hep'
    # schema -- confirm against validate_record.
    validate_record('hep')
]
Exemplo n.º 11
0
                  ticket_id_key="ticket_id"),
    reply_ticket(template="literaturesuggest/tickets/user_submitted.html",
                 context_factory=reply_ticket_context,
                 keep_new=True),
]

ADD_INGESTION_MARKS = [
    # Article matching for non-submissions
    # ====================================
    # Query holding pen to see if we already have this article ingested
    #
    # NOTE on updates:
    #     If the same article has been harvested before and the
    #     ingestion has been completed, process is continued
    #     to allow for updates.
    IF(pending_in_holding_pen, [mark('delete', True)]),
    IF(
        is_arxiv_paper,
        [
            # FIXME: This filtering step should be removed when this
            #        workflow includes arXiv CORE harvesting
            IF(already_harvested, [
                mark('already-ingested', True),
                mark('stop', True),
            ]),
            # FIXME: This filtering step should be removed when:
            #        old previously rejected records are treated
            #        differently e.g. good auto-reject heuristics or better
            #        time based filtering (5 days is quite random now).
            IF(previously_rejected(), [
                mark('already-ingested', True),
Exemplo n.º 12
0
    ),
]


ADD_INGESTION_MARKS = [
    # Article matching for non-submissions
    # ====================================
    # Query holding pen to see if we already have this article ingested
    #
    # NOTE on updates:
    #     If the same article has been harvested before and the
    #     ingestion has been completed, process is continued
    #     to allow for updates.
    IF(
        pending_in_holding_pen,
        [mark('delete', True)]
    ),
    IF(
        is_arxiv_paper,
        [
            # FIXME: This filtering step should be removed when this
            #        workflow includes arXiv CORE harvesting
            IF(
                already_harvested,
                [
                    mark('already-ingested', True),
                    mark('stop', True),
                ]
            ),
            # FIXME: This filtering step should be removed when:
            #        old previously rejected records are treated
Exemplo n.º 13
0
    ),
]


ADD_INGESTION_MARKS = [
    # Article matching for non-submissions
    # ====================================
    # Query holding pen to see if we already have this article ingested
    #
    # NOTE on updates:
    #     If the same article has been harvested before and the
    #     ingestion has been completed, process is continued
    #     to allow for updates.
    IF(
        is_marked('already-in-holding-pen'),
        [mark('delete', True)]
    ),
    IF(
        is_arxiv_paper,
        [
            # FIXME: This filtering step should be removed when this
            #        workflow includes arXiv CORE harvesting
            IF(
                already_harvested,
                [
                    mark('already-ingested', True),
                    mark('stop', True),
                ]
            ),
            # FIXME: This filtering step should be removed when:
            #        old previously rejected records are treated
Exemplo n.º 14
0
        taxonomy="HEPont.rdf",
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
    ),
    filter_core_keywords,
    guess_categories,
    IF(
        is_experimental_paper,
        guess_experiments,
    ),
    guess_keywords,
    guess_coreness,
    IF_ELSE(
        is_submission,
        mark('auto-approved', False),
        IF_ELSE(
            belongs_to_relevant_category,
            [
                mark('auto-approved', True),
                set_coreness_in_extra_data,
            ],
            mark('auto-approved', False),
        ),
    ),
]

NOTIFY_NOT_ACCEPTED = [
    IF(
        is_submission,
        reply_ticket(context_factory=reply_ticket_context),
Exemplo n.º 15
0
        taxonomy="HEPont.rdf",
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
    ),
    filter_core_keywords,
    guess_categories,
    IF(
        is_experimental_paper,
        guess_experiments,
    ),
    guess_keywords,
    guess_coreness,
    IF_ELSE(
        is_submission,
        mark('auto-approved', False),
        IF_ELSE(
            auto_approve,
            [
                mark('auto-approved', True),
                set_core_in_extra_data,
            ],
            mark('auto-approved', False),
        ),
    ),
]


NOTIFY_NOT_ACCEPTED = [
    IF(
        is_submission,