Exemplo n.º 1
0
def _files(record, extra_data):
    """Check that the record has exactly the file types its journal requires.

    The required file types are read from the ``COMPLIANCE_JOURNAL_FILES``
    config entry, keyed by the record's first journal, and compared with the
    ``filetype`` values of the entries in ``record['_files']``.

    :param record: record to check; only its journal and ``_files`` are read.
    :param extra_data: not used by this check.
    :return: tuple ``(check_accepted, details, None)``. ``check_accepted`` is
        True when the available file types equal the required ones, or when
        nothing is configured for the journal. ``details`` holds the
        human-readable messages (a tuple in the "nothing configured" case, a
        list otherwise).
    """

    def _join(names):
        # Sort so the message is deterministic (set order is not), and render
        # through '%s' so a missing filetype (None) is shown instead of making
        # str.join raise TypeError.
        return ', '.join(sorted('%s' % name for name in names))

    journal = get_first_journal(record)
    required_files = current_app.config.get('COMPLIANCE_JOURNAL_FILES',
                                            {}).get(journal)

    if not required_files:
        return True, ('No required files defined!', ), None

    # The config value may be any iterable (set, list, tuple); normalise it so
    # the equality test and the set differences below are well defined.
    required_files = set(required_files)
    available_files = {f.get('filetype') for f in record.get('_files', ())}

    check_accepted = required_files == available_files
    details = []

    if not check_accepted:
        missing_files = _join(required_files - available_files)
        if missing_files:
            details.append('Missing files: %s' % missing_files)

        extra_files = _join(available_files - required_files)
        if extra_files:
            details.append('Extra files: %s' % extra_files)

    return check_accepted, details, None
Exemplo n.º 2
0
def _arxiv(record, extra_data):
    """Check the record's primary arXiv category against the accepted ones.

    The check only applies to journals listed in the
    ``ARTICLE_CHECK_HAS_TO_BE_HEP`` config entry; records of any other
    journal are accepted without looking at arXiv data.

    :param record: record to check.
    :param extra_data: not used by this check.
    :return: tuple ``(check_accepted, details, None)`` where ``details`` is a
        one-element tuple with a human-readable explanation. A record of a
        journal that requires the check is rejected when it has no primary
        arXiv category, or when that category is not in
        ``ARXIV_HEP_CATEGORIES``.
    """
    config = current_app.config

    # Fall back to an empty collection: an unset (or None) config entry then
    # means "no journal requires the check" instead of `in None` raising
    # TypeError.
    journals_requiring_hep = config.get('ARTICLE_CHECK_HAS_TO_BE_HEP') or ()
    journal = get_first_journal(record)
    if journal not in journals_requiring_hep:
        return True, ("Doesn't have to be hep", ), None

    # get the primary category
    primary = get_arxiv_primary_category(record)
    if primary:
        accepted_categories = config.get('ARXIV_HEP_CATEGORIES') or ()
        check_accepted = primary in accepted_categories
        return check_accepted, ('Primary category: %s' % primary, ), None

    # The check is mandatory for this journal but there is nothing to check.
    return False, ('No arXiv id', ), None
Exemplo n.º 3
0
def affiliations_export(country=None, year=None):
    """
    Creates affiliation data filtered by country and year.

    The search index is read page by page (page size taken from the
    ``TOOL_ELASTICSEARCH_PAGE_SIZE`` config entry, 100 by default). For every
    record, the affiliations of its authors are counted per
    ``(affiliation value, country)`` pair and one row is emitted per pair.

    :param country: only affiliations for this country will be included. If None, all countries are included.
    :param year: only articles *published* in this year will be included. If None, all articles are included.
    :return: dict with ``'header'`` (list of column names) and ``'data'``
        (list of rows). Each row holds: year, journal, DOI, arXiv number,
        primary arXiv category, affiliation country, affiliation value, number
        of occurrences of that affiliation among the record's authors, and the
        record's total number of authors.
    """

    size = current_app.config.get('TOOL_ELASTICSEARCH_PAGE_SIZE', 100)
    search_index = current_app.config.get('SEARCH_UI_SEARCH_INDEX')
    source_fields = [
        'publication_info.year',
        'publication_info.journal_title',
        'arxiv_eprints',
        'dois',
        'authors',
        'control_number',
    ]

    result_headers = [
        'year', 'journal', 'doi', 'arxiv number', 'primary arxiv category',
        'country', 'affiliation', 'authors with affiliation',
        'total number of authors'
    ]
    result_data = []
    index = 0
    # Unknown until the first page has been fetched.
    total_hits = None

    query = get_query_string(country=country, year=year)
    logger.info(
        'Searching for affiliations of country: {} and year: {}'.format(
            country if country else 'ALL', year if year else 'ALL'))

    while total_hits is None or index < total_hits:
        # query ElasticSearch for result
        logger.warning('INDEX NUMBER {}'.format(index))
        search_results = current_search_client.search(q=query,
                                                      index=search_index,
                                                      _source=source_fields,
                                                      size=size,
                                                      from_=index)

        if total_hits is None:
            # The total is taken from the first page only, which also avoids
            # running the first query twice. Accept both response shapes:
            # an object with a 'value' key, or a plain integer.
            total = search_results['hits']['total']
            total_hits = total['value'] if isinstance(total, dict) else total
            logger.info('Total results from query: {}'.format(total_hits))

        hits = search_results['hits']['hits']
        if not hits:
            # An empty page would leave `index` unchanged and loop forever.
            if index < total_hits:
                logger.warning(
                    'Search returned no hits at offset {} although {} were '
                    'expected; stopping'.format(index, total_hits))
            break
        index += len(hits)

        # extract and add data to result list
        for hit in hits:
            record = hit['_source']

            # Named differently from the `year` parameter to avoid shadowing.
            pub_year = record['publication_info'][0]['year']
            journal = get_first_journal(record)
            doi = get_first_doi(record)
            arxiv = get_clean_arXiv_id(record)
            arxiv_category = get_arxiv_primary_category(record)

            authors = record.get('authors', ())
            total_authors = len(authors)
            missing_author_affiliations = 0

            # Maps (affiliation value, country) -> number of occurrences.
            extracted_affiliations = Counter()
            for author in authors:
                # if there are no affiliations, we cannot add this author
                # (this also means the record is not valid according to the schema)
                if 'affiliations' not in author:
                    missing_author_affiliations += 1
                    continue

                # aggregate affiliations
                for aff in author['affiliations']:
                    aff_country = aff.get('country', 'UNKNOWN')
                    if country in (None, '') or aff_country == country:
                        extracted_affiliations[(aff['value'],
                                                aff_country)] += 1

            if not extracted_affiliations:
                logger.warning(
                    'Article with DOI: {} had no extracted affiliations'.
                    format(doi))

            if missing_author_affiliations:
                logger.warning(
                    'Article with DOI: {} had missing affiliations in {} / {} authors'
                    .format(doi, missing_author_affiliations, total_authors))

            # add extracted information to result list
            for (aff_value, aff_country), count in extracted_affiliations.items():
                result_data.append([
                    pub_year, journal, doi, arxiv, arxiv_category,
                    aff_country, aff_value, count, total_authors
                ])

    return {'header': result_headers, 'data': result_data}
Exemplo n.º 4
0
def authors_export(country=None, year=None):
    """
    Creates author and affiliation data for the records matching country and year.

    The search index is read page by page (page size taken from the
    ``TOOL_ELASTICSEARCH_PAGE_SIZE`` config entry, 100 by default) and one row
    is emitted per (author, affiliation) pair of every returned record.

    :param country: passed to ``get_query_string`` to build the search query. The affiliations of the
        returned records are not filtered by country again in this function. If None, no country is given to the query.
    :param year: passed to ``get_query_string`` to build the search query. If None, no year is given to the query.
    :return: dict with ``'header'`` (list of column names) and ``'data'``
        (list of rows). Each row holds: year, journal, DOI, arXiv number,
        primary arXiv category, author name, affiliation country, affiliation
        value, and the record's total number of authors.
    """

    size = current_app.config.get('TOOL_ELASTICSEARCH_PAGE_SIZE', 100)
    search_index = current_app.config.get('SEARCH_UI_SEARCH_INDEX')
    source_fields = [
        'publication_info.year',
        'publication_info.journal_title',
        'arxiv_eprints',
        'dois',
        'authors',
        'control_number',
    ]
    query = get_query_string(country=country, year=year)

    result_data = []
    index = 0
    # Unknown until the first page has been fetched.
    total_hits = None
    while total_hits is None or index < total_hits:
        # query ElasticSearch for result
        search_results = current_search_client.search(q=query,
                                                      index=search_index,
                                                      _source=source_fields,
                                                      size=size,
                                                      from_=index)

        # The total may be an object with a 'value' key or a plain integer;
        # normalise it so the loop condition always compares two integers.
        total_hits = search_results['hits']['total']
        if isinstance(total_hits, dict):
            total_hits = total_hits['value']

        hits = search_results['hits']['hits']
        if not hits:
            # An empty page would leave `index` unchanged and loop forever.
            if index < total_hits:
                logger.warning(
                    'Search returned no hits at offset {} although {} were '
                    'expected; stopping'.format(index, total_hits))
            break
        index += len(hits)

        # extract and add data to result list
        for hit in hits:
            record = hit['_source']

            # Named differently from the `year` parameter to avoid shadowing.
            pub_year = record['publication_info'][0]['year']
            journal = get_first_journal(record)
            doi = get_first_doi(record)
            arxiv = get_clean_arXiv_id(record)
            arxiv_category = get_arxiv_primary_category(record)

            authors = record.get('authors', ())
            total_authors = len(authors)

            for author in authors:
                # if there are no affiliations, we cannot add this author
                # (this also means the record is not valid according to the schema)
                if 'affiliations' not in author:
                    logger.warning('No affiliations for author. doi=%s', doi)
                    continue

                author_name = author.get('full_name', 'UNKNOWN')
                # add extracted information to result list
                for affiliation in author['affiliations']:
                    aff_country = affiliation.get('country', 'UNKNOWN')
                    aff_value = affiliation['value']
                    result_data.append([
                        pub_year, journal, doi, arxiv, arxiv_category,
                        author_name, aff_country, aff_value, total_authors
                    ])

    return {
        'header': [
            'year', 'journal', 'doi', 'arxiv number', 'primary arxiv category',
            'author', 'country', 'affiliation', 'total number of authors'
        ],
        'data':
        result_data
    }