def _received_in_time(record, extra_data):
    """Check that the publication is not older than 24 hours.

    Looks up the record's first DOI on crossref and compares the crossref
    publication time with the record's creation date.

    :param record: record dict; must contain 'record_creation_date'.
    :param extra_data: shared compliance-task scratch data (unused here,
        kept for the common task signature).
    :return: ``(check_accepted, (details,), debug)`` tuple.
    """
    api_url = current_app.config.get('CROSSREF_API_URL')
    api_response = requests.get(api_url % get_first_doi(record))
    if api_response.status_code != 200:
        # Not on crossref: nothing to compare against, accept the article.
        return True, ('Article is not on crossref.', ), 'Api response: %s' % api_response.text

    details_message = ""
    api_message = api_response.json()['message']

    if 'publication_info' in record and \
            record['publication_info'][0]['journal_title'] == 'Progress of Theoretical and Experimental Physics':
        # PTEP: use the "published online" date rather than the crossref
        # record creation timestamp.
        parts = api_message['published-online']['date-parts'][0]

        # if we don't have month or day substitute it with 1
        if len(parts) < 3:
            parts.extend([1] * (3 - len(parts)))
            details_message += 'Month and/or day is missing, substitute it with "1".'

        # only contains day of publication, check for end of day
        api_time = datetime(*parts, hour=23, minute=59, second=59)
        time_source = '"published online" field'
    else:
        api_time = parse_date(api_message['created']['date-time'], ignoretz=True)
        time_source = 'crossref'

    received_time = parse_date(record['record_creation_date'])
    delta = received_time - api_time
    check_accepted = delta <= timedelta(hours=24)

    # FIX: message typo corrected ("later then" -> "later than").
    details_message += 'Arrived %d hours later than creation date on crossref.org.' % (delta.total_seconds() / 3600)
    debug = 'Time from %s: %s, Received time: %s' % (time_source, api_time, received_time)

    return check_accepted, (details_message, ), debug
def _received_in_time(record, extra_data):
    """Check if publication is not older than 24h """
    crossref_url = current_app.config.get('CROSSREF_API_URL')
    response = requests.get(crossref_url % get_first_doi(record))
    if response.status_code != 200:
        return True, ('Article is not on crossref.', ), 'Api response: %s' % response.text

    message = response.json()['message']

    # PTEP articles carry their publication time in the "published online"
    # field; for everything else use the crossref creation timestamp.
    is_ptep = (
        'publication_info' in record
        and record['publication_info'][0]['journal_title'] == 'Progress of Theoretical and Experimental Physics'
    )
    if is_ptep:
        date_parts = message['published-online']['date-parts'][0]
        # only contains day of publication, check for end of day
        crossref_time = datetime(*date_parts, hour=23, minute=59, second=59)
        source = '"published online" field'
    else:
        crossref_time = parse_date(message['created']['date-time'], ignoretz=True)
        source = 'crossref'

    creation_time = parse_date(record['record_creation_date'])
    difference = creation_time - crossref_time
    accepted = difference <= timedelta(hours=24)

    details_message = 'Arrived %d hours later then creation date on crossref.org.' % (
        difference.total_seconds() / 3600)
    debug = 'Time from %s: %s, Received time: %s' % (source, crossref_time, creation_time)

    return accepted, (details_message, ), debug
def check_compliance(obj, *args):
    """Run every compliance task for the record referenced by ``obj`` and persist the results.

    Notifies operations by email when the record failed and its final
    compliance result changed, unless disabled via config.

    :param obj: workflow object; ``obj.data`` must contain 'control_number'.
    :raises ValueError: if 'control_number' is missing from ``obj.data``.
    """
    if 'control_number' not in obj.data:
        raise ValueError(
            "Object should have a 'control_number' key in 'data' dict to be consistent with article upload."
        )

    pid = PersistentIdentifier.get('recid', obj.data['control_number'])
    record = Record.get_record(pid.object_uuid)

    # Temporary data shared by all compliance tasks.
    extra_data = {'extracted_text': __extract_article_text(record)}

    checks = {}
    all_checks_accepted = True
    for task_name, task in COMPLIANCE_TASKS:
        accepted, details, debug = task(record, extra_data)
        if not accepted:
            all_checks_accepted = False
        checks[task_name] = {
            'check': accepted,
            'details': details,
            'debug': debug
        }

    compliance = Compliance.get_or_create(pid.object_uuid)
    results = {
        'checks': checks,
        'accepted': all_checks_accepted,
        'data': {
            'doi': get_first_doi(record),
            'publisher': get_abbreviated_publisher(record),
            'journal': get_abbreviated_journal(record),
            'arxiv': get_first_arxiv(record)
        }
    }
    compliance.add_results(results)
    compliance.id_record = pid.object_uuid
    db.session.add(compliance)
    db.session.commit()

    # send notification about failed checks
    need_email = current_app.config.get('COMPLIANCE_SEND_FAILED_EMAILS', True)
    if need_email and not all_checks_accepted and compliance.has_final_result_changed():
        msg = TemplatedMessage(
            template_html='scoap3_compliance/admin/failed_email.html',
            subject='SCOAP3 - Compliance check',
            sender=current_app.config.get('MAIL_DEFAULT_SENDER'),
            recipients=current_app.config.get('OPERATIONS_EMAILS'),
            ctx={
                'results': results,
                'id': '%s,%s' % (compliance.id, record.id),
            })
        current_app.extensions['mail'].send(msg)
def proc(record):
    """Classify a record's first DOI against the old dataset.

    Side effects on enclosing-scope state: a DOI found in ``old_data`` is
    removed from it and appended to ``result['in_both']``; otherwise the
    DOI is appended to ``result['only_in_new']``. Records without json
    payload are skipped.
    """
    payload = record.json
    if not payload:
        return

    doi = get_first_doi(payload)
    if doi not in old_data:
        result['only_in_new'].append(doi)
    else:
        old_data.pop(doi)
        result['in_both'].append(doi)
def check_compliance(obj, *args):
    """Run all compliance tasks on the record referenced by ``obj``.

    Results are stored on a ``Compliance`` entry keyed by the record's
    uuid, and an email is sent when at least one check failed.

    :param obj: workflow object; ``obj.data`` must contain 'control_number'.
    :raises ValueError: if 'control_number' is missing from ``obj.data``.
    """
    if 'control_number' not in obj.data:
        raise ValueError("Object should have a 'control_number' key in 'data' dict to be consistent with article upload.")

    recid = obj.data['control_number']
    pid = PersistentIdentifier.get('recid', recid)
    record = Record.get_record(pid.object_uuid)
    checks = {}

    # Add temporary data to evaluation
    extra_data = {'extracted_text': __extract_article_text(record)}

    all_checks_accepted = True
    for name, func in COMPLIANCE_TASKS:
        check_accepted, details, debug = func(record, extra_data)
        # a single failed task fails the whole record
        all_checks_accepted = all_checks_accepted and check_accepted
        checks[name] = {
            'check': check_accepted,
            'details': details,
            'debug': debug
        }

    c = Compliance.get_or_create(pid.object_uuid)
    results = {
        'checks': checks,
        'accepted': all_checks_accepted,
        'data': {
            'doi': get_first_doi(record),
            'publisher': get_abbreviated_publisher(record),
            'journal': get_abbreviated_journal(record),
            'arxiv': get_first_arxiv(record)
        }
    }
    c.add_results(results)
    c.id_record = pid.object_uuid
    db.session.add(c)
    db.session.commit()

    # send notification about failed checks
    if not all_checks_accepted:
        msg = TemplatedMessage(
            template_html='scoap3_compliance/admin/failed_email.html',
            subject='SCOAP3 - Compliance check',
            sender=current_app.config.get('MAIL_DEFAULT_SENDER'),
            recipients=current_app.config.get('COMPLIANCE_EMAILS'),
            ctx={'results': results}
        )
        current_app.extensions['mail'].send(msg)
def affiliations_export(country=None, year=None):
    """
    Creates affiliation data filtered by country and year.

    :param country: only affiliations for this country will be included. If None, all countries are included.
    :param year: only articles *published* in this year will be included. If None, all articles are included.
    :return: dict with 'header' (column names) and 'data' (one row per
        (article, affiliation) pair, with the count of authors having it).
    """
    size = current_app.config.get('TOOL_ELASTICSEARCH_PAGE_SIZE', 100)
    search_index = current_app.config.get('SEARCH_UI_SEARCH_INDEX')
    source_fields = [
        'publication_info.year',
        'publication_info.journal_title',
        'arxiv_eprints',
        'dois',
        'authors',
        'control_number',
    ]
    result_headers = [
        'year', 'journal', 'doi', 'arxiv number', 'primary arxiv category',
        'country', 'affiliation', 'authors with affiliation',
        'total number of authors'
    ]
    result_data = []
    index = 0

    # query ElasticSearch for result (and get total hits)
    query = get_query_string(country=country, year=year)
    search_results = current_search_client.search(q=query, index=search_index,
                                                  _source=source_fields, size=size, from_=index)
    total_hits = search_results['hits']['total']['value']
    logger.info('Searching for affiliations of country: {} and year: {}'.format(
        country if country else 'ALL', year if year else 'ALL'))
    logger.info('Total results from query: {}'.format(total_hits))

    if total_hits == 0:
        return {'header': result_headers, 'data': result_data}

    while index < total_hits:
        # query ElasticSearch for the next page
        # FIX: logger.warn is a deprecated alias of logger.warning
        logger.warning('INDEX NUMBER {}'.format(index))
        search_results = current_search_client.search(q=query, index=search_index,
                                                      _source=source_fields, size=size, from_=index)
        hits = search_results['hits']['hits']
        if not hits:
            # FIX: guard against an infinite loop if ES returns fewer hits
            # than the reported total (e.g. index changed while paginating).
            break
        index += len(hits)

        # extract and add data to result list
        for hit in hits:
            record = hit['_source']

            # FIX: renamed from 'year' so the filter parameter is not shadowed
            pub_year = record['publication_info'][0]['year']
            journal = get_first_journal(record)
            doi = get_first_doi(record)
            arxiv = get_clean_arXiv_id(record)
            arxiv_category = get_arxiv_primary_category(record)
            authors = record.get('authors', ())
            total_authors = len(authors)

            missing_author_affiliations = 0
            extracted_affiliations = Counter()
            for author in authors:
                # if there are no affiliations, we cannot add this author
                # (this also means the record is not valid according to the schema)
                if 'affiliations' not in author:
                    missing_author_affiliations += 1
                    continue

                # aggregate affiliations
                for aff in author['affiliations']:
                    aff_country = aff.get('country', 'UNKNOWN')
                    if country in (None, '') or aff_country == country:
                        value = ((aff['value'], aff_country), )
                        extracted_affiliations.update(value)

            if not extracted_affiliations:
                logger.warning('Article with DOI: {} had no extracted affiliations'.format(doi))

            if missing_author_affiliations:
                logger.warning('Article with DOI: {} had missing affiliations in {} / {} authors'.format(
                    doi, missing_author_affiliations, total_authors))

            # add extracted information to result list
            for meta, count in extracted_affiliations.items():
                aff_value, aff_country = meta
                result_data.append([pub_year, journal, doi, arxiv, arxiv_category,
                                    aff_country, aff_value, count, total_authors])

    return {'header': result_headers, 'data': result_data}
def authors_export(country=None, year=None):
    """
    Creates author and affiliation data filtered by country and year.

    :param country: only affiliations for this country will be included. If None, all countries are included.
    :param year: only articles *published* in this year will be included. If None, all articles are included.
    :return: dict with 'header' (column names) and 'data' (one row per
        (article, author, affiliation) triple).
    """
    size = current_app.config.get('TOOL_ELASTICSEARCH_PAGE_SIZE', 100)
    search_index = current_app.config.get('SEARCH_UI_SEARCH_INDEX')
    source_fields = [
        'publication_info.year',
        'publication_info.journal_title',
        'arxiv_eprints',
        'dois',
        'authors',
        'control_number',
    ]
    query = get_query_string(country=country, year=year)
    result_data = []
    index = 0
    total_hits = None

    while total_hits is None or index < total_hits:
        # query ElasticSearch for result
        search_results = current_search_client.search(q=query, index=search_index,
                                                      _source=source_fields, size=size, from_=index)

        # FIX: ES7 reports total as {'value': N, ...} while older versions
        # return a plain int; comparing index < dict raises TypeError.
        # (Keeps this function consistent with affiliations_export.)
        total = search_results['hits']['total']
        total_hits = total['value'] if isinstance(total, dict) else total

        hits = search_results['hits']['hits']
        if not hits:
            # FIX: guard against an infinite loop when ES returns fewer hits
            # than the reported total.
            break
        index += len(hits)

        # extract and add data to result list
        for hit in hits:
            record = hit['_source']

            # FIX: renamed from 'year' so the filter parameter is not shadowed
            pub_year = record['publication_info'][0]['year']
            journal = get_first_journal(record)
            doi = get_first_doi(record)
            arxiv = get_clean_arXiv_id(record)
            arxiv_category = get_arxiv_primary_category(record)
            authors = record.get('authors', ())
            total_authors = len(authors)

            for author in authors:
                # if there are no affiliations, we cannot add this author
                # (this also means the record is not valid according to the schema)
                if 'affiliations' not in author:
                    # FIX: logger.warn is a deprecated alias of logger.warning
                    logger.warning('No affiliations for author. doi=%s' % doi)
                    continue

                author_name = author.get('full_name', 'UNKNOWN')

                # add extracted information to result list
                for affiliation in author['affiliations']:
                    aff_country = affiliation.get('country', 'UNKNOWN')
                    aff_value = affiliation['value']
                    result_data.append([pub_year, journal, doi, arxiv, arxiv_category,
                                        author_name, aff_country, aff_value, total_authors])

    return {
        'header': [
            'year', 'journal', 'doi', 'arxiv number', 'primary arxiv category',
            'author', 'country', 'affiliation', 'total number of authors'
        ],
        'data': result_data
    }