Пример #1
0
def _get_art_context(record):
    """Assemble the rendering context for an article record.

    Args:
        record (dict): a literature record.

    Returns:
        dict: metadata fields extracted from ``record``.
    """
    reader = LiteratureReader(record)

    abstract = reader.abstract
    # Language detection can fail (e.g. empty or ambiguous text);
    # fall back to an empty language code.
    try:
        abstract_language = detect(abstract)
    except LangDetectException:
        abstract_language = ""

    return dict(
        abstract=abstract,
        abstract_language=abstract_language,
        arxiv_id=reader.arxiv_id,
        authors=get_authors(record),
        collaborations=reader.collaborations,
        divulgation=get_divulgation(record),
        doi=reader.doi,
        domains=get_domains(record),
        inspire_id=reader.inspire_id,
        journal_issue=reader.journal_issue,
        journal_title=reader.journal_title,
        journal_volume=reader.journal_volume,
        keywords=reader.keywords,
        language=reader.language,
        page_artid=reader.get_page_artid(),
        peer_reviewed=1 if reader.peer_reviewed else 0,
        publication_date=reader.publication_date,
        subtitle=reader.subtitle,
        title=reader.title,
    )
Пример #2
0
def _get_art_context(record):
    """Build the context dict used to render an article record.

    Args:
        record (dict): a literature record.

    Returns:
        dict: metadata fields extracted from ``record``.
    """
    reader = LiteratureReader(record)

    abstract = reader.abstract
    try:
        abstract_language = detect(abstract)
    except LangDetectException:
        # Detection fails on empty/ambiguous text; default to no language.
        abstract_language = ''

    return {
        'abstract': abstract,
        'abstract_language': abstract_language,
        'arxiv_id': reader.arxiv_id,
        'authors': get_authors(record),
        'collaborations': reader.collaborations,
        'divulgation': get_divulgation(record),
        'doi': reader.doi,
        'domains': get_domains(record),
        'inspire_id': reader.inspire_id,
        'journal_issue': reader.journal_issue,
        'journal_title': reader.journal_title,
        'journal_volume': reader.journal_volume,
        'keywords': reader.keywords,
        'language': reader.language,
        'page_artid': reader.get_page_artid(),
        'peer_reviewed': 1 if reader.peer_reviewed else 0,  # int flag, not bool
        'publication_date': reader.publication_date,
        'subtitle': reader.subtitle,
        'title': reader.title,
    }
Пример #3
0
def start_merger(head_id, update_id, current_user_id=None):
    """Start a new ManualMerge workflow to merge two records manually.

    Args:
        head_id: the id of the first record to merge. This record is the one
            that will be updated with the new information.
        update_id: the id of the second record to merge. This record is the
            one that is going to be deleted and replaced by `head`.
        current_user_id: Id of the current user provided by the Flask app.

    Returns:
        (int): the current workflow object's id.
    """
    data = {
        'pid_type': 'lit',  # TODO: support
        'recid_head': head_id,
        'recid_update': update_id,
    }

    head = get_db_record('lit', head_id)
    update = get_db_record('lit', update_id)

    # The payload lives entirely in extra_data (populated below),
    # hence data=None here.
    workflow_object = workflow_object_class.create(
        data=None,
        id_user=current_user_id,
        data_type='hep'
    )

    wf_id = workflow_object.id    # to retrieve it later
    workflow_object.extra_data.update(data)

    # Fall back to 'arxiv' when the update record carries no source.
    update_source = LiteratureReader(update).source
    update_source = update_source if update_source else 'arxiv'

    workflow_object.extra_data['update_source'] = update_source.lower()

    workflow_object.extra_data['head_control_number'] = head_id
    workflow_object.extra_data['update_control_number'] = update_id

    workflow_object.extra_data['head_uuid'] = str(head.id)
    workflow_object.extra_data['update_uuid'] = str(update.id)

    workflow_object.extra_data['head'] = head
    workflow_object.extra_data['update'] = update

    # Persist before starting so the engine sees the populated extra_data.
    workflow_object.save()

    start('manual_merge', object_id=wf_id)

    return wf_id
Пример #4
0
def merge_articles(obj, eng):
    """Merge two articles.

    The workflow payload is overwritten by the merged record, the conflicts are
    stored in ``extra_data.conflicts``. Also, it adds a ``callback_url`` which
    contains the endpoint which resolves the merge conflicts.

    Note:
        When the feature flag ``FEATURE_FLAG_ENABLE_MERGER`` is ``False`` it
        will skip the merge.

    """
    if not current_app.config.get('FEATURE_FLAG_ENABLE_MERGER'):
        return None

    matched_control_number = obj.extra_data['matches']['approved']

    head_uuid = PersistentIdentifier.get('lit',
                                         matched_control_number).object_uuid

    head_record = InspireRecord.get_record(head_uuid)
    update = obj.data
    update_source = LiteratureReader(obj.data).source
    # Root previously ingested from the same source, if any; it acts as the
    # common ancestor passed to merge() below.
    head_root = read_wf_record_source(record_uuid=head_record.id,
                                      source=update_source.lower())
    head_root = head_root.json if head_root else {}

    # Bookkeeping stored for later steps (e.g. conflict resolution / restore).
    obj.extra_data['head_uuid'] = str(head_uuid)
    obj.extra_data['head_version_id'] = head_record.model.version_id
    obj.extra_data['merger_head_revision'] = head_record.revision_id
    obj.extra_data['merger_original_root'] = deepcopy(head_root)

    # Merge the current record (head) with the incoming payload (update),
    # using head_root as the shared ancestor.
    merged, conflicts = merge(
        head=head_record.to_dict(),
        root=head_root,
        update=update,
    )

    obj.data = merged

    if conflicts:
        obj.extra_data['conflicts'] = conflicts
        obj.extra_data['conflicts_metadata'] = {
            'datetime': datetime.now().strftime("%b %d, %Y, %H:%M:%S %p"),
            'update_source': update_source,
        }
        obj.extra_data['callback_url'] = \
            get_resolve_merge_conflicts_callback_url()
    obj.save()
Пример #5
0
def start_merger(head_id, update_id, current_user_id=None):
    """Start a new ManualMerge workflow to merge two records manually.

    Args:
        head_id: the id of the first record to merge. This record is the one
            that will be updated with the new information.
        update_id: the id of the second record to merge. This record is the
            one that is going to be deleted and replaced by `head`.
        current_user_id: Id of the current user provided by the Flask app.

    Returns:
        (int): the current workflow object's id.
    """
    head = get_db_record('lit', head_id)
    update = get_db_record('lit', update_id)

    workflow_object = workflow_object_class.create(data=None,
                                                   id_user=current_user_id,
                                                   data_type='hep')
    wf_id = workflow_object.id  # kept so it can be returned after save()

    # Records without a source default to 'arxiv'.
    source = LiteratureReader(update).source or 'arxiv'

    workflow_object.extra_data.update({
        'pid_type': 'lit',  # TODO: support
        'recid_head': head_id,
        'recid_update': update_id,
        'update_source': source.lower(),
        'head_control_number': head_id,
        'update_control_number': update_id,
        'head_uuid': str(head.id),
        'update_uuid': str(update.id),
        'head': head,
        'update': update,
    })

    workflow_object.save()

    start('manual_merge', object_id=wf_id)

    return wf_id
Пример #6
0
def populate_arxiv_document(obj, eng):
    """Attach the arXiv PDF of the workflow record as a hidden preprint document.

    Tries the primary and then the alternative arXiv PDF URL. If arXiv
    explicitly reports that no PDF exists the step returns silently;
    otherwise a failure to obtain a PDF raises ``DownloadError``.
    """
    arxiv_id = LiteratureReader(obj.data).arxiv_id

    for conf_name in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        url = current_app.config[conf_name].format(arxiv_id=arxiv_id)
        is_valid_pdf_link = is_pdf_link(url)
        if is_valid_pdf_link:
            break
        try:
            # arXiv serves a placeholder page containing this marker when
            # no PDF is available for the eprint.
            if NO_PDF_ON_ARXIV in requests.get(url).content:
                obj.log.info('No PDF is available for %s', arxiv_id)
                return
        except requests.exceptions.RequestException:
            raise DownloadError("Error accessing url {url}".format(url=url))

    # After the loop, is_valid_pdf_link/url refer to the last URL tried
    # (or to the one that passed the check, if we broke out early).
    if not is_valid_pdf_link:
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    filename = secure_filename('{0}.pdf'.format(arxiv_id))
    # Drop any previously attached document with the same key so the step
    # can be re-run without duplicating entries.
    obj.data['documents'] = [
        document for document in obj.data.get('documents', ())
        if document.get('key') != filename
    ]

    lb = LiteratureBuilder(source='arxiv', record=obj.data)
    lb.add_document(
        filename,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = lb.record
Пример #7
0
def get_note(data, doc_type):
    """Write an addendum/erratum information to the BibTeX note field.

    Traverse publication_info looking for erratum and addendum in
    `publication_info.material` field and build a string of references to
    those publication entries.

    Returns:
        string: formatted list of the errata and addenda available for a
        given record, or None when there are none.

    """
    notices = ('erratum', 'addendum')
    publication_info = get_value(data, 'publication_info', [])
    entries = [item for item in publication_info
               if item.get('material') in notices]

    if not entries:
        return None

    note_strings = []
    for entry in entries:
        pages = LiteratureReader.get_page_artid_for_publication_info(
            entry, '--')
        year = '({})'.format(entry['year']) if 'year' in entry else ''
        note = text_type('{field}: {journal} {volume}, {pages} {year}').format(
            field=entry['material'].title(),
            journal=entry.get('journal_title'),
            volume=entry.get('journal_volume'),
            pages=pages,
            year=year,
        )
        note_strings.append(note.strip())

    note_string = '[' + ', '.join(note_strings) + ']'
    # Collapse runs of spaces left by missing fields, then doubled commas.
    note_string = re.sub(' +', ' ', note_string)
    return re.sub(',,', ',', note_string)
Пример #8
0
def get_note(data, doc_type):
    """Write an addendum/erratum information to the BibTeX note field.

    Traverse publication_info looking for erratum and addendum in `publication_info.material`
    field and build a string of references to those publication entries.

    Returns:
        string: formatted list of the errata and addenda available for a given
        record, or None when there are none.

    """
    notices = ('erratum', 'addendum')
    entries = [entry for entry in get_value(data, 'publication_info', []) if entry.get('material') in notices]

    if not entries:
        return None

    # One "Material: journal volume, pages (year)" fragment per entry;
    # missing fields leave gaps that are cleaned up below.
    note_strings = [
        text_type('{field}: {journal} {volume}, {pages} {year}').format(
            field=entry['material'].title(),
            journal=entry.get('journal_title'),
            volume=entry.get('journal_volume'),
            pages=LiteratureReader.get_page_artid_for_publication_info(entry, '--'),
            year='({})'.format(entry['year']) if 'year' in entry else ''
        ).strip()
        for entry in entries
    ]

    note_string = '[' + ', '.join(note_strings) + ']'
    note_string = re.sub(' +', ' ', note_string)  # Remove possible multiple spaces
    return re.sub(',,', ',', note_string)         # ... and commas
Пример #9
0
def arxiv_derive_inspire_categories(obj, eng):
    """Derive ``inspire_categories`` from the arXiv categories.

    Uses side effects to populate the ``inspire_categories`` key
    in ``obj.data`` by converting its arXiv categories.

    Args:
        obj (WorkflowObject): a workflow object.
        eng (WorkflowEngine): a workflow engine.

    Returns:
        None

    """
    # setdefault returns the live list, so appends below mutate obj.data.
    inspire_categories = obj.data.setdefault('inspire_categories', [])

    for arxiv_category in LiteratureReader(obj.data).arxiv_categories:
        term = classify_field(arxiv_category)
        if not term:
            continue

        entry = {
            'source': 'arxiv',
            'term': term,
        }
        if entry not in inspire_categories:
            inspire_categories.append(entry)
Пример #10
0
def _is_art(record):
    """Return whether ``record`` is a published, article-like record."""
    reader = LiteratureReader(record)
    article_like = ARTICLE_LIKE_DOCUMENT_TYPES.intersection(
        reader.document_types)
    return article_like and reader.is_published
Пример #11
0
def reply_ticket_context(user, obj):
    """Context for literature replies."""
    extra_data = obj.extra_data
    return {
        'object': obj,
        'user': user,
        'title': LiteratureReader(obj.data).title,
        'reason': extra_data.get("reason", ""),
        'record_url': extra_data.get("url", ""),
    }
Пример #12
0
    def conference_information(self):
        """Conference information.

        Returns a list with information about conferences related to the
        record.

        Note:
            Reads ``self['publication_info']``; each entry may reference a
            conference record and/or a parent (proceedings) record, which
            are resolved via ``replace_refs(..., 'es')``.
        """
        conf_info = []
        for pub_info in self['publication_info']:
            conference_recid = None
            parent_recid = None
            parent_rec = {}
            conference_rec = {}
            if 'conference_record' in pub_info:
                # Resolve the reference into the full conference record.
                conference_rec = replace_refs(pub_info['conference_record'],
                                              'es')
                if conference_rec and conference_rec.get('control_number'):
                    conference_recid = conference_rec['control_number']
                else:
                    # Unresolvable or incomplete record: treat as missing.
                    conference_rec = {}
            if 'parent_record' in pub_info:
                parent_rec = replace_refs(pub_info['parent_record'], 'es')
                if parent_rec and parent_rec.get('control_number'):
                    parent_recid = parent_rec['control_number']
                else:
                    parent_rec = {}
            conf_info.append({
                "conference_recid":
                conference_recid,
                "conference_title":
                LiteratureReader(conference_rec).title,
                "parent_recid":
                parent_recid,
                "parent_title":
                # Drop the leading "Proceedings, " prefix, if present.
                LiteratureReader(parent_rec).title.replace(
                    "Proceedings, ", "", 1),
                "page_start":
                pub_info.get('page_start'),
                "page_end":
                pub_info.get('page_end'),
                "artid":
                pub_info.get('artid'),
            })

        return conf_info
Пример #13
0
def test_manual_merge_existing_records(workflow_app):
    """End-to-end test of the manual-merge workflow on two stored records."""

    json_head = fake_record('This is the HEAD', 1)
    json_update = fake_record('While this is the update', 2)

    # these two fields will create a merging conflict
    json_head['core'] = True
    json_update['core'] = False

    head = InspireRecord.create_or_update(json_head, skip_files=False)
    head.commit()
    update = InspireRecord.create_or_update(json_update, skip_files=False)
    update.commit()
    head_id = head.id
    update_id = update.id

    obj_id = start_merger(
        head_id=1,
        update_id=2,
        current_user_id=1,
    )

    # Resolve the conflict created above and let the workflow complete.
    do_resolve_manual_merge_wf(workflow_app, obj_id)

    # retrieve it again, otherwise Detached Instance Error
    obj = workflow_object_class.get(obj_id)

    assert obj.status == ObjectStatus.COMPLETED
    assert obj.extra_data['approved'] is True
    assert obj.extra_data['auto-approved'] is False

    # no root present before
    last_root = read_wf_record_source(head_id, 'arxiv')
    assert last_root is None

    update_source = LiteratureReader(update).source
    root_update = read_wf_record_source(update_id, update_source)
    assert root_update is None

    # check that head's content has been replaced by merged
    deleted_record = RecordMetadata.query.filter_by(id=update_id).one()

    latest_record = get_db_record('lit', 1)

    assert deleted_record.json['deleted'] is True

    # check deleted record is linked in the latest one
    deleted_rec_ref = {'$ref': 'http://localhost:5000/api/literature/2'}
    assert [deleted_rec_ref] == latest_record['deleted_records']

    # check the merged record is linked in the deleted one
    new_record_metadata = {'$ref': 'http://localhost:5000/api/literature/1'}
    assert new_record_metadata == deleted_record.json['new_record']

    del latest_record['deleted_records']
    assert latest_record == obj.data  # -> resulted merged record
Пример #14
0
def get_institution_papers_datatables_rows(hits):
    """Row used by datatables to render institution papers.

    Each row is: [title link, rendered authors, journal title,
    citation count, publication year].
    """
    result = []

    title_html = "<a href='/literature/{id}'>{name}</a>"

    for hit in hits:
        row = []
        title = LiteratureReader(hit.to_dict()).title
        row.append(
            title_html.format(
                id=hit.control_number,
                # NOTE(review): encode('utf8') returns bytes on Python 3 —
                # presumably this module runs on Python 2; confirm.
                name=title.encode('utf8')
            )
        )
        ctx = {
            'record': hit.to_dict(),
            'is_brief': 'true',
            'number_of_displayed_authors': 1,
            'show_affiliations': 'false',
            'collaboration_only': 'true'
        }
        row.append(render_macro_from_template(
            name="render_record_authors",
            template="inspirehep_theme/format/record/Inspire_Default_HTML_general_macros.tpl",
            ctx=ctx
        )
        )
        # Hits lacking these fields raise AttributeError; use fallbacks.
        try:
            row.append(hit.publication_info[0].journal_title)
        except AttributeError:
            row.append('')

        try:
            row.append(hit.citation_count)
        except AttributeError:
            row.append(0)

        # Year is the leading component of the ISO-style earliest_date.
        row.append(hit.earliest_date.split('-')[0])

        result.append(row)

    return result
Пример #15
0
def get_institution_papers_datatables_rows(hits):
    """Row used by datatables to render institution papers."""
    title_html = "<a href='/literature/{id}'>{name}</a>"

    result = []
    for hit in hits:
        title = LiteratureReader(hit.to_dict()).title
        title_cell = title_html.format(
            id=hit.control_number,
            name=title.encode('utf8'),
        )

        authors_cell = render_macro_from_template(
            name="render_record_authors",
            template="inspirehep_theme/format/record/Inspire_Default_HTML_general_macros.tpl",
            ctx={
                'record': hit.to_dict(),
                'is_brief': 'true',
                'number_of_displayed_authors': 1,
                'show_affiliations': 'false',
                'collaboration_only': 'true'
            },
        )

        try:
            journal_cell = hit.publication_info[0].journal_title
        except AttributeError:
            journal_cell = ''

        try:
            citations_cell = hit.citation_count
        except AttributeError:
            citations_cell = 0

        year_cell = hit.earliest_date.split('-')[0]

        result.append(
            [title_cell, authors_cell, journal_cell, citations_cell,
             year_cell])

    return result
Пример #16
0
    def _author_list(obj, eng):
        """Extract authors from the record's arXiv tarball, if present.

        Searches XML files in the tarball for an author-list marker,
        converts matches with the XSLT stylesheet, and overwrites
        ``obj.data['authors']`` when any authors were extracted.
        """
        arxiv_id = LiteratureReader(obj.data).arxiv_id
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        try:
            tarball = obj.files[filename]
        except KeyError:
            # No tarball attached to the workflow: nothing to extract.
            obj.log.info(
                'Skipping author list extraction, no tarball with name "%s" found'
                % filename)
            return

        with TemporaryDirectory(prefix='author_list') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                file_list = untar(tarball_file, scratch_space)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return

            obj.log.info('Extracted tarball to: {0}'.format(scratch_space))
            xml_files_list = [
                path for path in file_list if path.endswith('.xml')
            ]
            obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

            extracted_authors = []
            for xml_file in xml_files_list:
                with open(xml_file, 'r') as xml_file_fd:
                    xml_content = xml_file_fd.read()

                match = REGEXP_AUTHLIST.findall(xml_content)
                if match:
                    obj.log.info('Found a match for author extraction')
                    try:
                        authors_xml = convert(xml_content, stylesheet)
                    except XMLSyntaxError:
                        # Probably the %auto-ignore comment exists, so we skip the
                        # first line. See: inspirehep/inspire-next/issues/2195
                        authors_xml = convert(
                            xml_content.split('\n', 1)[1],
                            stylesheet,
                        )

                    extracted_authors.extend(
                        marcxml2record(authors_xml).get('authors', []))

            if extracted_authors:
                # Author names may contain LaTeX escapes; normalize them.
                for author in extracted_authors:
                    author['full_name'] = decode_latex(author['full_name'])

                obj.data['authors'] = extracted_authors
Пример #17
0
def _build_publication(record):
    """Flatten a literature record into the publication payload dict."""
    reader = LiteratureReader(record)
    return dict(
        abstract=reader.abstract,
        authors=_get_authors(record),
        collaborations=reader.collaborations,
        keywords=reader.keywords,
        publication_id=record['control_number'],
        title=reader.title,
        topics=reader.inspire_categories,
    )
Пример #18
0
def test_that_db_changes_are_mirrored_in_es(app):
    """Create, update, and delete a record; assert ES mirrors each change."""
    search = LiteratureSearch()
    json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': [
            'article',
        ],
        'titles': [
            {
                'title': 'foo'
            },
        ],
        '_collections': ['Literature']
    }

    # When a record is created in the DB, it is also created in ES.

    record = InspireRecord.create(json)
    record.commit()
    db.session.commit()
    es_record = search.get_source(record.id)

    assert LiteratureReader(es_record).title == 'foo'

    # When a record is updated in the DB, it is also updated in ES.

    record['titles'][0]['title'] = 'bar'
    record.commit()
    db.session.commit()
    es_record = search.get_source(record.id)

    assert LiteratureReader(es_record).title == 'bar'

    # When a record is deleted in the DB, it is also deleted in ES.

    record._delete(force=True)
    db.session.commit()

    with pytest.raises(NotFoundError):
        es_record = search.get_source(record.id)
Пример #19
0
def is_submission(obj, eng):
    """Check if a workflow contains a submission.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: whether the workflow contains a submission.

    """
    return LiteratureReader(obj.data).method == 'submitter'
Пример #20
0
def refextract(obj, eng):
    """Extract references from various sources and add them to the workflow.

    Runs ``refextract`` on both the PDF attached to the workflow and the
    references provided by the submitter, if any, then chooses the one
    that generated the most and attaches them to the workflow object.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None

    """
    # Raw references already present in the record (e.g. from the submitter)
    # take precedence: extract/match those and stop.
    if 'references' in obj.data:
        extracted_raw_references = dedupe_list(
            extract_references_from_raw_refs(obj.data['references']))
        obj.log.info('Extracted %d references from raw refs.',
                     len(extracted_raw_references))
        obj.data['references'] = match_references_based_on_flag(
            extracted_raw_references)
        return

    matched_pdf_references, matched_text_references = [], []
    source = LiteratureReader(obj.data).source

    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            pdf_references = dedupe_list(
                extract_references_from_pdf(tmp_document, source))
            matched_pdf_references = match_references_based_on_flag(
                pdf_references)

    # Free-text references typed into the submission form, if any.
    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        text_references = dedupe_list(
            extract_references_from_text(text, source))
        matched_text_references = match_references_based_on_flag(
            text_references)

    # Keep whichever source yielded more references; ties go to text.
    if len(matched_pdf_references) == len(matched_text_references) == 0:
        obj.log.info('No references extracted.')
    elif len(matched_pdf_references) > len(matched_text_references):
        obj.log.info('Extracted %d references from PDF.',
                     len(matched_pdf_references))
        obj.data['references'] = matched_pdf_references
    elif len(matched_text_references) >= len(matched_pdf_references):
        obj.log.info('Extracted %d references from text.',
                     len(matched_text_references))
        obj.data['references'] = matched_text_references
Пример #21
0
def _get_comm_context(record):
    """Assemble the rendering context for a communication record."""
    lit_reader = LiteratureReader(record)

    abstract = lit_reader.abstract
    # Language detection fails on empty/ambiguous text; default to "".
    try:
        abstract_language = detect(abstract)
    except LangDetectException:
        abstract_language = ""

    conference_record = get_conference_record(record)
    conference_title = get_value(conference_record, "titles.title[0]")
    conf_reader = ConferenceReader(conference_record)

    return dict(
        abstract=abstract,
        abstract_language=abstract_language,
        arxiv_id=lit_reader.arxiv_id,
        authors=get_authors(record),
        collaborations=lit_reader.collaborations,
        conference_city=conf_reader.city,
        conference_country=conf_reader.country,
        conference_end_date=conf_reader.end_date,
        conference_start_date=conf_reader.start_date,
        conference_title=conference_title,
        divulgation=get_divulgation(record),
        doi=lit_reader.doi,
        domains=get_domains(record),
        inspire_id=lit_reader.inspire_id,
        journal_issue=lit_reader.journal_issue,
        journal_title=lit_reader.journal_title,
        journal_volume=lit_reader.journal_volume,
        keywords=lit_reader.keywords,
        language=lit_reader.language,
        page_artid=lit_reader.get_page_artid(),
        peer_reviewed=1 if lit_reader.peer_reviewed else 0,
        publication_date=lit_reader.publication_date,
        subtitle=lit_reader.subtitle,
        title=lit_reader.title,
    )
Пример #22
0
def _get_comm_context(record):
    """Build the context dict used to render a communication record.

    Args:
        record (dict): a literature record.

    Returns:
        dict: literature metadata plus details of the related conference.
    """
    lit_reader = LiteratureReader(record)
    abstract = lit_reader.abstract
    try:
        abstract_language = detect(abstract)
    except LangDetectException:
        # Detection fails on empty/ambiguous text; default to no language.
        abstract_language = ''

    conference_record = get_conference_record(record)
    conference_title = get_value(conference_record, 'titles.title[0]')
    conf_reader = ConferenceReader(conference_record)

    return {
        'abstract': abstract,
        'abstract_language': abstract_language,
        'arxiv_id': lit_reader.arxiv_id,
        'authors': get_authors(record),
        'collaborations': lit_reader.collaborations,
        'conference_city': conf_reader.city,
        'conference_country': conf_reader.country,
        'conference_end_date': conf_reader.end_date,
        'conference_start_date': conf_reader.start_date,
        'conference_title': conference_title,
        'divulgation': get_divulgation(record),
        'doi': lit_reader.doi,
        'domains': get_domains(record),
        'inspire_id': lit_reader.inspire_id,
        'journal_issue': lit_reader.journal_issue,
        'journal_title': lit_reader.journal_title,
        'journal_volume': lit_reader.journal_volume,
        'keywords': lit_reader.keywords,
        'language': lit_reader.language,
        'page_artid': lit_reader.get_page_artid(),
        'peer_reviewed': 1 if lit_reader.peer_reviewed else 0,  # int flag
        'publication_date': lit_reader.publication_date,
        'subtitle': lit_reader.subtitle,
        'title': lit_reader.title,
    }
Пример #23
0
def new_ticket_context(user, obj):
    """Context for literature new tickets."""
    title = LiteratureReader(obj.data).title
    formdata = obj.extra_data.get('formdata', {})
    identifiers = get_value(obj.data, "external_system_numbers.value") or []
    return {
        'email': user.email,
        'title': title,
        'identifier': identifiers or "",
        'user_comment': formdata.get('extra_comments', ''),
        'references': formdata.get('references'),
        'object': obj,
        'subject': u"Your suggestion to INSPIRE: {0}".format(title),
    }
Пример #24
0
def has_fully_harvested_category(record):
    """Check if the record in `obj.data` has fully harvested categories.

    Arguments:
        record(dict): the ingested article.

    Return:
        bool: True when the record belongs to an arXiv category that is fully
        harvested, otherwise False.
    """
    record_categories = set(LiteratureReader(record).arxiv_categories)
    harvested_categories = current_app.config.get('ARXIV_CATEGORIES', {})
    # Default the per-bucket lookups to []: with the {} fallback above (or a
    # partially filled ARXIV_CATEGORIES) the bare .get() calls returned None,
    # and `None + None` raised TypeError.
    fully_harvested = set(
        harvested_categories.get('core', []) +
        harvested_categories.get('non-core', [])
    )
    return len(record_categories & fully_harvested) > 0
Пример #25
0
def arxiv_package_download(obj, eng):
    """Perform the package download step for arXiv records.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = LiteratureReader(obj.data).arxiv_id
    url = current_app.config['ARXIV_TARBALL_URL'].format(arxiv_id=arxiv_id)
    tarball = download_file_to_workflow(
        workflow=obj,
        name=secure_filename('{0}.tar.gz'.format(arxiv_id)),
        url=url,
    )

    if not tarball:
        obj.log.error('Cannot retrieve tarball from arXiv for %s', arxiv_id)
    else:
        obj.log.info('Tarball retrieved from arXiv for %s', arxiv_id)
Пример #26
0
def render_contributions(hits):
    """Render a list of conferences to HTML."""
    title_html = u"<a href='/literature/{id}'>{name}</a>"

    result = []
    for hit in hits:
        title = LiteratureReader(hit.to_dict()).title
        title_cell = title_html.format(id=hit.control_number, name=title)

        authors_cell = render_macro_from_template(
            name="render_record_authors",
            template="inspirehep_theme/format/record/Inspire_Default_HTML_general_macros.tpl",
            ctx={
                'record': hit.to_dict(),
                'is_brief': 'true',
                'number_of_displayed_authors': 1,
                'show_affiliations': 'false',
                'collaboration_only': 'true'
            },
        )

        try:
            journal_cell = hit.publication_info[0].journal_title
        except AttributeError:
            journal_cell = ''

        try:
            citations_cell = hit.citation_count
        except AttributeError:
            citations_cell = 0

        result.append(
            [title_cell, authors_cell, journal_cell, citations_cell])

    return result, hits.total
Пример #27
0
def is_arxiv_paper(obj, eng):
    """Check if a workflow contains a paper from arXiv.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: whether the workflow contains a paper from arXiv.

    """
    reader = LiteratureReader(obj.data)
    method = reader.method
    source = reader.source

    # A submission counts only when it actually carries arXiv eprints.
    if method == 'submitter':
        return 'arxiv_eprints' in obj.data

    return method == 'hepcrawl' and source.lower() == 'arxiv'
Пример #28
0
    def build(cls, record):
        """Build Publication object from record dictionary

        Args:
            record (dict): dictionary containing record data

        Returns:
            Publication: Object built from provided data

        """
        reader = LiteratureReader(record)
        return cls(
            abstract=reader.abstract,
            authors=get_authors_full_names(record),
            collaborations=reader.collaborations,
            keywords=reader.keywords,
            publication_id=record["control_number"],
            title=reader.title,
            topics=reader.inspire_categories,
        )
Пример #29
0
def is_experimental_paper(obj, eng):
    """Check if a workflow contains an experimental paper.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: whether the workflow contains an experimental paper.

    """
    reader = LiteratureReader(obj.data)
    arxiv_categories = reader.arxiv_categories
    inspire_categories = reader.inspire_categories

    # ``isdisjoint`` short-circuits and avoids materializing the
    # intersection only to test whether it is empty.
    has_experimental_arxiv_category = not set(
        arxiv_categories).isdisjoint(EXPERIMENTAL_ARXIV_CATEGORIES)
    has_experimental_inspire_category = not set(
        inspire_categories).isdisjoint(EXPERIMENTAL_INSPIRE_CATEGORIES)

    return has_experimental_arxiv_category or has_experimental_inspire_category
def test_manual_merge_existing_records(mock_put_record_to_hep,
                                       mock_store_records, workflow_app):
    """Run the manual-merge workflow on two conflicting stored records.

    Creates a HEAD and an UPDATE record that disagree on ``core``, starts the
    merger, resolves it manually, and asserts the workflow completed as a
    manual (not auto) approval with no merge roots stored for either record.
    """
    json_head = fake_record('This is the HEAD', 1)
    json_update = fake_record('While this is the update', 2)

    # these two fields will create a merging conflict
    json_head['core'] = True
    json_update['core'] = False

    head = InspireRecord.create_or_update(json_head, skip_files=False)
    head.commit()
    update = InspireRecord.create_or_update(json_update, skip_files=False)
    update.commit()
    head_id = head.id
    update_id = update.id

    # NOTE(review): literal ids 1/2 here appear to be the fake records'
    # control numbers, not the UUIDs captured above — confirm against
    # ``start_merger``'s signature.
    obj_id = start_merger(
        head_id=1,
        update_id=2,
        current_user_id=1,
    )

    do_resolve_manual_merge_wf(workflow_app, obj_id)
    mock_put_record_to_hep.assert_called()

    # retrieve it again, otherwise Detached Instance Error
    obj = workflow_object_class.get(obj_id)

    # Manual resolution: approved by a human, not by the auto-approver.
    assert obj.status == ObjectStatus.COMPLETED
    assert obj.extra_data['approved'] is True
    assert obj.extra_data['auto-approved'] is False

    # no root present before
    last_root = read_wf_record_source(head_id, 'arxiv')
    assert last_root is None

    update_source = LiteratureReader(update).source
    root_update = read_wf_record_source(update_id, update_source)
    assert root_update is None
Пример #31
0
def _get_preprint_context(record):
    """Gather the template variables needed to render a preprint record."""
    reader = LiteratureReader(record)

    abstract = reader.abstract
    try:
        detected_language = detect(abstract)
    except LangDetectException:
        # Undetectable (e.g. empty) abstracts get an empty language code.
        detected_language = ''

    context = {
        'abstract': abstract,
        'abstract_language': detected_language,
    }
    context['arxiv_id'] = reader.arxiv_id
    context['authors'] = get_authors(record)
    context['collaborations'] = reader.collaborations
    context['divulgation'] = get_divulgation(record)
    context['domains'] = get_domains(record)
    context['inspire_id'] = reader.inspire_id
    context['keywords'] = reader.keywords
    context['language'] = reader.language
    context['subtitle'] = reader.subtitle
    context['title'] = reader.title
    return context
Пример #32
0
def _get_preprint_context(record):
    """Return the rendering context for a preprint record."""
    reader = LiteratureReader(record)
    abstract = reader.abstract

    try:
        language_of_abstract = detect(abstract)
    except LangDetectException:
        # Detection can fail on empty/ambiguous text; fall back to "".
        language_of_abstract = ""

    context = dict()
    context.update(
        {
            "abstract": abstract,
            "abstract_language": language_of_abstract,
            "arxiv_id": reader.arxiv_id,
            "authors": get_authors(record),
            "collaborations": reader.collaborations,
            "divulgation": get_divulgation(record),
            "domains": get_domains(record),
            "inspire_id": reader.inspire_id,
            "keywords": reader.keywords,
            "language": reader.language,
            "subtitle": reader.subtitle,
            "title": reader.title,
        }
    )
    return context
Пример #33
0
def store_root(obj, eng):
    """Insert or update the current record head's root into the ``WorkflowsRecordSources`` table.

    No-op when the merger feature flag is disabled or the root carries
    no source.
    """
    merger_enabled = current_app.config.get('FEATURE_FLAG_ENABLE_MERGER', False)
    if not merger_enabled:
        obj.log.info(
            'skipping storing source root, feature flag ``FEATURE_FLAG_ENABLE_MERGER`` is disabled.'
        )
        return

    merger_root = obj.extra_data['merger_root']
    record_uuid = obj.extra_data['head_uuid']

    root_source = LiteratureReader(merger_root).source.lower()
    if not root_source:
        # A root without a source cannot be attributed; nothing to store.
        return

    entry = WorkflowsRecordSources(
        source=get_source_for_root(root_source),
        record_uuid=record_uuid,
        json=merger_root,
    )
    # ``merge`` performs an upsert keyed on the row's identity.
    db.session.merge(entry)
    db.session.commit()
Пример #34
0
def get_pages(data, doc_type):
    """Return the page range / article ID string for the record's best publication info."""
    best_pub_info = get_best_publication_info(data)
    # '--' is the separator placed between the start and end pages.
    return LiteratureReader.get_page_artid_for_publication_info(
        best_pub_info, '--')