Example #1
0
def _received_in_time(record, extra_data):
    """Check that the record was received within 24 hours of publication.

    :param record: record metadata dict; must contain 'record_creation_date'
        and may contain 'publication_info'.
    :param extra_data: unused here; present to match the common compliance
        task signature ``func(record, extra_data)``.
    :return: tuple ``(check_accepted, (details_message, ), debug_message)``.
    """
    api_url = current_app.config.get('CROSSREF_API_URL')

    api_response = requests.get(api_url % get_first_doi(record))
    if api_response.status_code != 200:
        # Not on Crossref: accept the record, but surface the raw response.
        return True, ('Article is not on crossref.', ), 'Api response: %s' % api_response.text

    details_message = ""
    api_message = api_response.json()['message']

    if 'publication_info' in record and \
            record['publication_info'][0]['journal_title'] == 'Progress of Theoretical and Experimental Physics':
        parts = api_message['published-online']['date-parts'][0]
        # if we don't have month or day substitute it with 1
        if len(parts) < 3:
            parts.extend([1] * (3 - len(parts)))
            # FIX: trailing space so the two detail sentences don't run
            # together when concatenated below.
            details_message += 'Month and/or day is missing, substitute it with "1". '
        # only contains day of publication, check for end of day
        api_time = datetime(*parts, hour=23, minute=59, second=59)
        time_source = '"published online" field'
    else:
        api_time = parse_date(api_message['created']['date-time'], ignoretz=True)
        time_source = 'crossref'
    received_time = parse_date(record['record_creation_date'])
    delta = received_time - api_time

    check_accepted = delta <= timedelta(hours=24)
    # FIX: 'then' -> 'than' in the user-facing message.
    details_message += 'Arrived %d hours later than creation date on crossref.org.' % (delta.total_seconds() / 3600)
    debug = 'Time from %s: %s, Received time: %s' % (time_source, api_time, received_time)

    return check_accepted, (details_message, ), debug
Example #2
0
def _received_in_time(record, extra_data):
    """Check that the record was received within 24 hours of publication.

    :param record: record metadata dict; must contain 'record_creation_date'
        and may contain 'publication_info'.
    :param extra_data: unused here; present to match the common compliance
        task signature ``func(record, extra_data)``.
    :return: tuple ``(check_accepted, (details_message, ), debug_message)``.
    """
    api_url = current_app.config.get('CROSSREF_API_URL')

    api_response = requests.get(api_url % get_first_doi(record))
    if api_response.status_code != 200:
        # Not on Crossref: accept the record, but surface the raw response.
        return True, ('Article is not on crossref.',
                      ), 'Api response: %s' % api_response.text

    api_message = api_response.json()['message']

    if 'publication_info' in record and \
            record['publication_info'][0]['journal_title'] == 'Progress of Theoretical and Experimental Physics':
        parts = api_message['published-online']['date-parts'][0]
        # only contains day of publication, check for end of day
        api_time = datetime(*parts, hour=23, minute=59, second=59)
        time_source = '"published online" field'
    else:
        api_time = parse_date(api_message['created']['date-time'],
                              ignoretz=True)
        time_source = 'crossref'
    received_time = parse_date(record['record_creation_date'])
    delta = received_time - api_time

    check_accepted = delta <= timedelta(hours=24)
    # FIX: 'then' -> 'than' in the user-facing message.
    details_message = 'Arrived %d hours later than creation date on crossref.org.' % (
        delta.total_seconds() / 3600)
    debug = 'Time from %s: %s, Received time: %s' % (time_source, api_time,
                                                     received_time)

    return check_accepted, (details_message, ), debug
Example #3
0
def check_compliance(obj, *args):
    """Run all compliance tasks for a workflow object and persist the result.

    Stores the aggregated check outcome in a ``Compliance`` record and, when
    enabled via ``COMPLIANCE_SEND_FAILED_EMAILS``, notifies the operations
    mailing list if any check failed and the final result changed.

    :param obj: workflow object; ``obj.data`` must contain 'control_number'.
    :param args: ignored; accepted for workflow-task signature compatibility.
    :raises ValueError: if 'control_number' is missing from ``obj.data``.
    """
    if 'control_number' not in obj.data:
        raise ValueError(
            "Object should have a 'control_number' key in 'data' dict to be consistent with article upload."
        )

    pid = PersistentIdentifier.get('recid', obj.data['control_number'])
    record = Record.get_record(pid.object_uuid)

    # Temporary data shared by the individual checks.
    extra_data = {'extracted_text': __extract_article_text(record)}

    checks = {}
    all_checks_accepted = True
    for task_name, task_func in COMPLIANCE_TASKS:
        accepted, details, debug = task_func(record, extra_data)
        all_checks_accepted = all_checks_accepted and accepted
        checks[task_name] = {
            'check': accepted,
            'details': details,
            'debug': debug,
        }

    compliance = Compliance.get_or_create(pid.object_uuid)
    results = {
        'checks': checks,
        'accepted': all_checks_accepted,
        'data': {
            'doi': get_first_doi(record),
            'publisher': get_abbreviated_publisher(record),
            'journal': get_abbreviated_journal(record),
            'arxiv': get_first_arxiv(record),
        },
    }

    compliance.add_results(results)
    compliance.id_record = pid.object_uuid

    db.session.add(compliance)
    db.session.commit()

    # send notification about failed checks
    need_email = current_app.config.get('COMPLIANCE_SEND_FAILED_EMAILS', True)
    if need_email and not all_checks_accepted and compliance.has_final_result_changed():
        msg = TemplatedMessage(
            template_html='scoap3_compliance/admin/failed_email.html',
            subject='SCOAP3 - Compliance check',
            sender=current_app.config.get('MAIL_DEFAULT_SENDER'),
            recipients=current_app.config.get('OPERATIONS_EMAILS'),
            ctx={
                'results': results,
                'id': '%s,%s' % (compliance.id, record.id),
            })
        current_app.extensions['mail'].send(msg)
Example #4
0
    def proc(record):
        # Classify the record's first DOI against the old data set.
        # Matched DOIs are removed from old_data — presumably so the
        # leftovers can later be reported as "only in old"; confirm
        # with the enclosing caller.
        if not record.json:
            return

        doi = get_first_doi(record.json)
        if doi not in old_data:
            result['only_in_new'].append(doi)
        else:
            old_data.pop(doi)
            result['in_both'].append(doi)
Example #5
0
def check_compliance(obj, *args):
    """Run every compliance task for the given workflow object.

    Persists the aggregated results in a ``Compliance`` record and sends a
    notification email if at least one check failed.

    :param obj: workflow object; ``obj.data`` must contain 'control_number'.
    :param args: ignored; accepted for workflow-task signature compatibility.
    :raises ValueError: if 'control_number' is missing from ``obj.data``.
    """
    if 'control_number' not in obj.data:
        raise ValueError("Object should have a 'control_number' key in 'data' dict to be consistent with article upload.")

    recid = obj.data['control_number']
    pid = PersistentIdentifier.get('recid', recid)
    record = Record.get_record(pid.object_uuid)

    checks = {}

    # Add temporary data to evaluation
    extra_data = {'extracted_text': __extract_article_text(record)}

    all_checks_accepted = True
    for name, func in COMPLIANCE_TASKS:
        # Each task returns (accepted flag, details tuple, debug string).
        check_accepted, details, debug = func(record, extra_data)
        all_checks_accepted = all_checks_accepted and check_accepted
        checks[name] = {
            'check': check_accepted,
            'details': details,
            'debug': debug
        }

    c = Compliance.get_or_create(pid.object_uuid)
    results = {
        'checks': checks,
        'accepted': all_checks_accepted,
        'data': {
            'doi': get_first_doi(record),
            'publisher': get_abbreviated_publisher(record),
            'journal': get_abbreviated_journal(record),
            'arxiv': get_first_arxiv(record)
        }
    }

    c.add_results(results)
    c.id_record = pid.object_uuid

    db.session.add(c)
    db.session.commit()

    # send notification about failed checks
    if not all_checks_accepted:
        msg = TemplatedMessage(
            template_html='scoap3_compliance/admin/failed_email.html',
            subject='SCOAP3 - Compliance check',
            sender=current_app.config.get('MAIL_DEFAULT_SENDER'),
            recipients=current_app.config.get('COMPLIANCE_EMAILS'),
            ctx={'results': results}
        )
        current_app.extensions['mail'].send(msg)
Example #6
0
File: tools.py  Project: SCOAP3/scoap3-next
def affiliations_export(country=None, year=None):
    """
    Creates affiliation data filtered by country and year.

    :param country: only affiliations for this country will be included. If None, all countries are included.
    :param year: only articles *published* in this year will be included. If None, all articles are included.
    :return: dict with 'header' (column names) and 'data' (list of rows).
    """

    size = current_app.config.get('TOOL_ELASTICSEARCH_PAGE_SIZE', 100)
    search_index = current_app.config.get('SEARCH_UI_SEARCH_INDEX')
    source_fields = [
        'publication_info.year',
        'publication_info.journal_title',
        'arxiv_eprints',
        'dois',
        'authors',
        'control_number',
    ]

    result_headers = [
        'year', 'journal', 'doi', 'arxiv number', 'primary arxiv category',
        'country', 'affiliation', 'authors with affiliation',
        'total number of authors'
    ]
    result_data = []
    index = 0

    # Initial query is used only to obtain the total hit count for paging.
    query = get_query_string(country=country, year=year)
    search_results = current_search_client.search(q=query,
                                                  index=search_index,
                                                  _source=source_fields,
                                                  size=size,
                                                  from_=index)

    total_hits = search_results['hits']['total']['value']
    logger.info(
        'Searching for affiliations of country: {} and year: {}'.format(
            country if country else 'ALL', year if year else 'ALL'))
    logger.info('Total results from query: {}'.format(total_hits))

    if total_hits == 0:
        return {'header': result_headers, 'data': result_data}

    while index < total_hits:
        # query ElasticSearch for the next page of results
        # FIX: logger.warn is a deprecated alias of logger.warning.
        logger.warning('INDEX NUMBER {}'.format(index))
        search_results = current_search_client.search(q=query,
                                                      index=search_index,
                                                      _source=source_fields,
                                                      size=size,
                                                      from_=index)
        index += len(search_results['hits']['hits'])

        # extract and add data to result list
        for hit in search_results['hits']['hits']:
            record = hit['_source']

            # FIX: use a distinct name so the record's publication year does
            # not shadow the 'year' filter parameter of this function.
            pub_year = record['publication_info'][0]['year']
            journal = get_first_journal(record)
            doi = get_first_doi(record)
            arxiv = get_clean_arXiv_id(record)
            arxiv_category = get_arxiv_primary_category(record)

            authors = record.get('authors', ())
            total_authors = len(authors)
            missing_author_affiliations = 0

            extracted_affiliations = Counter()
            for author in authors:
                # if there are no affiliations, we cannot add this author
                # (this also means the record is not valid according to the schema)
                if 'affiliations' not in author:
                    missing_author_affiliations += 1
                    continue

                # aggregate affiliations, keyed by (value, country) pairs
                for aff in author['affiliations']:
                    aff_country = aff.get('country', 'UNKNOWN')
                    if country in (None, '') or aff_country == country:
                        extracted_affiliations.update(((aff['value'], aff_country), ))

            if not extracted_affiliations:
                logger.warning(
                    'Article with DOI: {} had no extracted affiliations'.
                    format(doi))

            if missing_author_affiliations:
                logger.warning(
                    'Article with DOI: {} had missing affiliations in {} / {} authors'
                    .format(doi, missing_author_affiliations, total_authors))

            # add extracted information to result list
            for (aff_value, aff_country), count in extracted_affiliations.items():
                result_data.append([
                    pub_year, journal, doi, arxiv, arxiv_category, aff_country,
                    aff_value, count, total_authors
                ])

    return {'header': result_headers, 'data': result_data}
Example #7
0
File: tools.py  Project: nyirit/scoap3-next
def authors_export(country=None, year=None):
    """
    Creates author and affiliation data filtered by country and year.

    :param country: only affiliations for this country will be included. If None, all countries are included.
    :param year: only articles *published* in this year will be included. If None, all articles are included.
    :return: dict with 'header' (column names) and 'data' (list of rows).
    """

    size = current_app.config.get('TOOL_ELASTICSEARCH_PAGE_SIZE', 100)
    search_index = current_app.config.get('SEARCH_UI_SEARCH_INDEX')
    source_fields = [
        'publication_info.year',
        'publication_info.journal_title',
        'arxiv_eprints',
        'dois',
        'authors',
        'control_number',
    ]
    query = get_query_string(country=country, year=year)

    result_data = []
    index = 0
    total_hits = None
    while total_hits is None or index < total_hits:
        # query ElasticSearch for result
        search_results = current_search_client.search(q=query,
                                                      index=search_index,
                                                      _source=source_fields,
                                                      size=size,
                                                      from_=index)
        # NOTE(review): treats 'total' as an int (ES < 7 response shape); on
        # ES 7+ this is a dict ({'value': ...}) — confirm the cluster version.
        total_hits = search_results['hits']['total']
        index += len(search_results['hits']['hits'])

        # extract and add data to result list
        for hit in search_results['hits']['hits']:
            record = hit['_source']

            # FIX: use a distinct name so the record's publication year does
            # not shadow the 'year' filter parameter of this function.
            pub_year = record['publication_info'][0]['year']
            journal = get_first_journal(record)
            doi = get_first_doi(record)
            arxiv = get_clean_arXiv_id(record)
            arxiv_category = get_arxiv_primary_category(record)

            authors = record.get('authors', ())
            total_authors = len(authors)

            for author in authors:
                # if there are no affiliations, we cannot add this author
                # (this also means the record is not valid according to the schema)
                if 'affiliations' not in author:
                    # FIX: logger.warn is a deprecated alias of logger.warning.
                    logger.warning('No affiliations for author. doi=%s' % doi)
                    continue

                author_name = author.get('full_name', 'UNKNOWN')
                # add extracted information to result list
                for affiliation in author['affiliations']:
                    aff_country = affiliation.get('country', 'UNKNOWN')
                    aff_value = affiliation['value']
                    result_data.append([
                        pub_year, journal, doi, arxiv, arxiv_category,
                        author_name, aff_country, aff_value, total_authors
                    ])

    return {
        'header': [
            'year', 'journal', 'doi', 'arxiv number', 'primary arxiv category',
            'author', 'country', 'affiliation', 'total number of authors'
        ],
        'data':
        result_data
    }