def test_non_software_doi(self):
    """A non-software DOI yields empty raw metadata and an empty parsed dict."""
    target_doi = "10.1016/j.dsr2.2008.10.030"  # Not software
    empty_raw = ''
    empty_parsed = {}
    # Enable HTTPretty so that it will monkey patch the socket module
    httpretty.enable()
    httpretty.register_uri(httpretty.GET,
                           self.app.conf['DOI_URL'] + target_doi,
                           body=empty_raw)
    fetched_raw = doi.fetch_metadata(self.app.conf['DOI_URL'],
                                     self.app.conf['DATACITE_URL'],
                                     target_doi)
    self.assertEqual(fetched_raw, empty_raw)
    self.assertEqual(doi.parse_metadata(fetched_raw), empty_parsed)
    httpretty.disable()
    httpretty.reset()  # clean up registered urls and request history
def test_software_doi(self):
    """A software DOI is fetched and parsed into the expected mock metadata."""
    target_doi = "10.5281/zenodo.11020"  # software
    mock_entry = self.mock_data[target_doi]
    # Enable HTTPretty so that it will monkey patch the socket module
    httpretty.enable()
    httpretty.register_uri(httpretty.GET,
                           self.app.conf['DOI_URL'] + target_doi,
                           body=mock_entry['raw'])
    fetched_raw = doi.fetch_metadata(self.app.conf['DOI_URL'],
                                     self.app.conf['DATACITE_URL'],
                                     target_doi)
    self.assertEqual(fetched_raw, mock_entry['raw'])
    self.assertEqual(doi.parse_metadata(fetched_raw), mock_entry['parsed'])
    httpretty.disable()
    httpretty.reset()  # clean up registered urls and request history
def task_process_new_citation(citation_change, force=False):
    """
    Process a new citation event.

    - Skip the event if the citing bibcode is not yet known to the system.
    - For DOI content: fetch metadata from doi.org/datacite, parse it, and
      mark the target as REGISTERED when it is a software record with a
      valid bibcode.
    - For PID/ASCL and URL content: only verify the link is alive.
    - Store the citation target (if new) and finally the citation itself,
      emitting broker events and forwarding results to master for
      registered software records.

    :param citation_change: protobuf CitationChange message describing the event
    :param force: unused here; kept for backward-compatible task signature
    """
    canonical_citing_bibcode = api.get_canonical_bibcode(app, citation_change.citing)
    if canonical_citing_bibcode is None:
        logger.error("The citing bibcode '%s' is not in the system yet, it will be skipped in this ingestion", citation_change.citing)
        return
    content_type = None
    is_link_alive = False
    status = u"DISCARDED"
    # Check if we already have the citation target in the DB
    metadata = db.get_citation_target_metadata(app, citation_change.content)
    citation_target_in_db = bool(metadata)  # False if dict is empty
    raw_metadata = metadata.get('raw', None)
    parsed_metadata = metadata.get('parsed', {})
    if citation_target_in_db:
        # "REGISTERED" if it is a software record
        status = metadata.get('status', u'DISCARDED')
    if citation_change.content_type == adsmsg.CitationChangeContentType.doi \
            and citation_change.content not in ["", None]:
        content_type = u"DOI"
        # Metadata is always (re-)fetched, even when the target is already in
        # the DB, so the record stays up to date.
        # Fetch DOI metadata (if HTTP request fails, an exception is raised
        # and the task will be re-queued (see app.py and adsputils))
        raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], citation_change.content)
        if raw_metadata:
            parsed_metadata = doi.parse_metadata(raw_metadata)
            is_software = parsed_metadata.get('doctype', u'').lower() == "software"
            if parsed_metadata.get('bibcode') not in (None, "") and is_software:
                status = u"REGISTERED"
    elif citation_change.content_type == adsmsg.CitationChangeContentType.pid \
            and citation_change.content not in ["", None]:
        content_type = u"PID"
        status = None
        is_link_alive = url.is_alive(app.conf['ASCL_URL'] + citation_change.content)
        parsed_metadata = {'link_alive': is_link_alive}
    elif citation_change.content_type == adsmsg.CitationChangeContentType.url \
            and citation_change.content not in ["", None]:
        content_type = u"URL"
        status = None
        is_link_alive = url.is_alive(citation_change.content)
        parsed_metadata = {'link_alive': is_link_alive}
    else:
        # Bugfix: logging uses %-style lazy interpolation, so the previous
        # '{}' placeholder was printed literally and the message never
        # included the citation change.
        logger.error("Citation change should have doi, pid or url informed: %s", citation_change)
        status = None
    if status is not None:
        if not citation_target_in_db:
            # Create citation target in the DB
            db.store_citation_target(app, citation_change, content_type, raw_metadata, parsed_metadata, status)
        if status == u"REGISTERED":
            if citation_change.content_type == adsmsg.CitationChangeContentType.doi:
                if canonical_citing_bibcode != citation_change.citing:
                    # These two bibcodes are identical and we can signal the broker
                    event_data = webhook.identical_bibcodes_event_data(citation_change.citing, canonical_citing_bibcode)
                    if event_data:
                        dump_prefix = citation_change.timestamp.ToDatetime().strftime("%Y%m%d_%H%M%S")
                        logger.debug("Calling 'task_emit_event' for '%s' IsIdenticalTo '%s'", citation_change.citing, canonical_citing_bibcode)
                        task_emit_event.delay(event_data, dump_prefix)
                citation_target_bibcode = parsed_metadata.get('bibcode')
                # The new bibcode and the DOI are identical
                event_data = webhook.identical_bibcode_and_doi_event_data(citation_target_bibcode, citation_change.content)
                if event_data:
                    dump_prefix = citation_change.timestamp.ToDatetime().strftime("%Y%m%d_%H%M%S")
                    logger.debug("Calling 'task_emit_event' for '%s' IsIdenticalTo '%s'", citation_target_bibcode, citation_change.content)
                    task_emit_event.delay(event_data, dump_prefix)
                # Get citations from the database and transform the stored bibcodes
                # into their canonical ones as registered in Solr.
                original_citations = db.get_citations_by_bibcode(app, citation_target_bibcode)
                citations = api.get_canonical_bibcodes(app, original_citations)
                # Add canonical bibcode of current detected citation
                if canonical_citing_bibcode and canonical_citing_bibcode not in citations:
                    citations.append(canonical_citing_bibcode)
                logger.debug("Calling 'task_output_results' with '%s'", citation_change)
                task_output_results.delay(citation_change, parsed_metadata, citations)
        logger.debug("Calling '_emit_citation_change' with '%s'", citation_change)
        _emit_citation_change(citation_change, parsed_metadata)
        # Store the citation at the very end, so that if an exception is raised before,
        # this task can be re-run in the future without key collisions in the database
        db.store_citation(app, citation_change, content_type, raw_metadata, parsed_metadata, status)
def task_maintenance_metadata(dois, bibcodes):
    """
    Maintenance operation:
    - Get all the registered citation targets (or only a subset of them if
      DOIs and/or bibcodes are specified)
    - For each, retrieve metadata and if it is different to what we have in
      our database:
        - Get the citations bibcodes and transform them to their canonical form
        - Send to master an update with the new metadata and the current list
          of citations canonical bibcodes

    :param dois: list of DOIs to restrict the operation to (may be empty)
    :param bibcodes: list of bibcodes to restrict the operation to (may be empty)
    """
    n_requested = len(dois) + len(bibcodes)
    if n_requested == 0:
        registered_records = db.get_citation_targets(app, only_registered=True)
    else:
        registered_records = db.get_citation_targets_by_bibcode(app, bibcodes, only_registered=True)
        registered_records += db.get_citation_targets_by_doi(app, dois, only_registered=True)
        registered_records = _remove_duplicated_dict_in_list(registered_records)
    for registered_record in registered_records:
        updated = False
        bibcode_replaced = {}
        # Fetch DOI metadata (if HTTP request fails, an exception is raised
        # and the task will be re-queued (see app.py and adsputils))
        raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], registered_record['content'])
        if raw_metadata:
            parsed_metadata = doi.parse_metadata(raw_metadata)
            is_software = parsed_metadata.get('doctype', u'').lower() == "software"
            if not is_software:
                logger.error("The new metadata for '%s' has changed its 'doctype' and it is not 'software' anymore", registered_record['bibcode'])
            elif parsed_metadata.get('bibcode') in (None, ""):
                logger.error("The new metadata for '%s' affected the metadata parser and it did not correctly compute a bibcode", registered_record['bibcode'])
            else:
                # Detect concept DOIs: they have one or more versions of the software
                # and they are not a version of something else
                concept_doi = len(parsed_metadata.get('version_of', [])) == 0 and len(parsed_metadata.get('versions', [])) >= 1
                different_bibcodes = registered_record['bibcode'] != parsed_metadata['bibcode']
                if concept_doi and different_bibcodes:
                    # Concept DOI publication date changes with newer software version
                    # and authors can also change (i.e., first author last name initial)
                    # but we want to respect the year in the bibcode, which corresponds
                    # to the year of the latest release when it was first ingested
                    # by ADS
                    parsed_metadata['bibcode'] = registered_record['bibcode']
                    # Temporary bugfix (some bibcodes have non-capital letter at the end):
                    parsed_metadata['bibcode'] = parsed_metadata['bibcode'][:-1] + parsed_metadata['bibcode'][-1].upper()
                    # Re-verify if bibcodes are still different (they could be if
                    # name parsing has changed):
                    different_bibcodes = registered_record['bibcode'] != parsed_metadata['bibcode']
                if different_bibcodes:
                    # These two bibcodes are identical and we can signal the broker
                    event_data = webhook.identical_bibcodes_event_data(registered_record['bibcode'], parsed_metadata['bibcode'])
                    if event_data:
                        # Bugfix: 'citation_change' is not defined at this point (it is
                        # only built further down, after a successful DB update), so the
                        # previous 'citation_change.timestamp.ToDatetime()' raised a
                        # NameError; use the current time for the dump prefix instead.
                        dump_prefix = datetime.now().strftime("%Y%m%d")  # "%Y%m%d_%H%M%S"
                        logger.debug("Calling 'task_emit_event' for '%s' IsIdenticalTo '%s'", registered_record['bibcode'], parsed_metadata['bibcode'])
                        task_emit_event.delay(event_data, dump_prefix)
                    logger.warn("Parsing the new metadata for citation target '%s' produced a different bibcode: '%s'. The former will be moved to the 'alternate_bibcode' list, and the new one will be used as the main one.", registered_record['bibcode'], parsed_metadata.get('bibcode', None))
                    alternate_bibcode = parsed_metadata.get('alternate_bibcode', [])
                    alternate_bibcode += registered_record.get('alternate_bibcode', [])
                    if registered_record['bibcode'] not in alternate_bibcode:
                        alternate_bibcode.append(registered_record['bibcode'])
                    parsed_metadata['alternate_bibcode'] = alternate_bibcode
                    bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode']}
                updated = db.update_citation_target_metadata(app, registered_record['bibcode'], raw_metadata, parsed_metadata)
        if updated:
            citation_change = adsmsg.CitationChange(content=registered_record['content'],
                                                    content_type=getattr(adsmsg.CitationChangeContentType, registered_record['content_type'].lower()),
                                                    status=adsmsg.Status.updated,
                                                    timestamp=datetime.now())
            if citation_change.content_type == adsmsg.CitationChangeContentType.doi:
                # Get citations from the database and transform the stored bibcodes
                # into their canonical ones as registered in Solr.
                original_citations = db.get_citations_by_bibcode(app, registered_record['bibcode'])
                citations = api.get_canonical_bibcodes(app, original_citations)
                logger.debug("Calling 'task_output_results' with '%s'", citation_change)
                task_output_results.delay(citation_change, parsed_metadata, citations, bibcode_replaced=bibcode_replaced)
def task_maintenance_reevaluate(dois, bibcodes):
    """
    Maintenance operation:
    - Get all the DISCARDED citation targets (or only a subset of them if
      DOIs and/or bibcodes are specified)
    - For each, retrieve metadata again and, if it now parses as a software
      record with a valid bibcode:
        - Promote the target (and its discarded citations) to REGISTERED
        - Get the citations bibcodes and transform them to their canonical form
        - Send to master the new metadata and the current list of citations
          canonical bibcodes
    """
    n_requested = len(dois) + len(bibcodes)
    if n_requested == 0:
        # No filter requested: re-evaluate every discarded target
        discarded_records = db.get_citation_targets(app, only_status='DISCARDED')
    else:
        discarded_records = db.get_citation_targets_by_bibcode(app, bibcodes, only_status='DISCARDED')
        discarded_records += db.get_citation_targets_by_doi(app, dois, only_status='DISCARDED')
        # The bibcode and DOI lookups can return the same target twice
        discarded_records = _remove_duplicated_dict_in_list(discarded_records)
    for previously_discarded_record in discarded_records:
        updated = False
        bibcode_replaced = {}  # always empty here; kept for task_output_results signature parity
        # Fetch DOI metadata (if HTTP request fails, an exception is raised
        # and the task will be re-queued (see app.py and adsputils))
        raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'], app.conf['DATACITE_URL'], previously_discarded_record['content'])
        if raw_metadata:
            parsed_metadata = doi.parse_metadata(raw_metadata)
            is_software = parsed_metadata.get('doctype', u'').lower() == "software"
            if not is_software:
                # Still not software: remains discarded
                logger.error("Discarded '%s', it is not 'software'", previously_discarded_record['content'])
            elif parsed_metadata.get('bibcode') in (None, ""):
                logger.error("The metadata for '%s' could not be parsed correctly and it did not correctly compute a bibcode", previously_discarded_record['content'])
            else:
                # Create citation target in the DB
                updated = db.update_citation_target_metadata(app, previously_discarded_record['content'], raw_metadata, parsed_metadata, status='REGISTERED')
                if updated:
                    db.mark_all_discarded_citations_as_registered(app, previously_discarded_record['content'])
        if updated:
            # Build a 'new' citation change event for the freshly registered target
            citation_change = adsmsg.CitationChange(content=previously_discarded_record['content'],
                                                    content_type=getattr(adsmsg.CitationChangeContentType, previously_discarded_record['content_type'].lower()),
                                                    status=adsmsg.Status.new,
                                                    timestamp=datetime.now())
            if citation_change.content_type == adsmsg.CitationChangeContentType.doi:
                # Get citations from the database and transform the stored bibcodes into their canonical ones as registered in Solr.
                original_citations = db.get_citations_by_bibcode(app, parsed_metadata['bibcode'])
                citations = api.get_canonical_bibcodes(app, original_citations)
                logger.debug("Calling 'task_output_results' with '%s'", citation_change)
                task_output_results.delay(citation_change, parsed_metadata, citations, bibcode_replaced=bibcode_replaced)