def _print_record(bibcode): with app.session_scope() as session: print 'stored by us:', bibcode r = session.query(Records).filter_by(bibcode=bibcode).first() if r: print json.dumps(r.toJSON(), indent=2, default=str, sort_keys=True) else: print 'None' print '-' * 80 print 'as seen by SOLR' solr_doc = solr_updater.transform_json_record(r.toJSON()) print json.dumps(solr_doc, indent=2, default=str, sort_keys=True) print '=' * 80
def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True, update_links=True, commit=False, ignore_checksums=False, solr_targets=None, update_timestamps=True):
    """Receives the bibcode of a document that was updated.
    (note: we could have sent the full record however we don't
    do it because the messages might be delayed and we can have
    multiple workers updating the same record; so we want to
    look into the database and get the most recent version)

    Receives bibcodes and checks the database if we have all the
    necessary pieces to push to solr. If not, then postpone and
    push later.

    We consider a record to be 'ready' if those pieces were updated
    (and were updated later than the last 'processed' timestamp):

        - bib_data
        - nonbib_data
        - orcid_claims

    'fulltext' is not considered essential; but updates to fulltext will
    trigger a solr_update (so it might happen that a document will get
    indexed twice; first with only metadata and later on incl fulltext)

    :param bibcodes: a single bibcode or a list of bibcodes to (re)index
    :param force: index as long as bib_data exists, even if not 'complete'
    :param ignore_checksums: push even when the stored checksum matches
    :param solr_targets: passed through to update_remote_targets as solr_urls

    NOTE(review): a second function with the same name is defined later in
    this file and will shadow this one at import time — confirm which
    version is intended to be live.
    """
    if isinstance(bibcodes, basestring):
        bibcodes = [bibcodes]

    if not (update_solr or update_metrics or update_links):
        raise Exception('Hmmm, I dont think I let you do NOTHING, sorry!')

    logger.debug('Running index-records for: %s', bibcodes)
    # accumulators for the three remote targets
    batch = []          # solr payloads
    batch_insert = []   # metrics rows for records never processed before
    batch_update = []   # metrics rows for previously processed records
    links_data = []
    links_url = app.conf.get('LINKS_RESOLVER_UPDATE_URL')

    #check if we have complete record
    for bibcode in bibcodes:
        r = app.get_record(bibcode)

        if r is None:
            logger.error('The bibcode %s doesn\'t exist!', bibcode)
            continue

        augments_updated = r.get('augments_updated', None)
        bib_data_updated = r.get('bib_data_updated', None)
        fulltext_updated = r.get('fulltext_updated', None)
        metrics_updated = r.get('metrics_updated', None)
        nonbib_data_updated = r.get('nonbib_data_updated', None)
        orcid_claims_updated = r.get('orcid_claims_updated', None)

        # records never processed compare against an epoch well before any
        # real update timestamp, so they always look stale (i.e. ready)
        year_zero = '1972'
        processed = r.get('processed', adsputils.get_date(year_zero))
        if processed is None:
            processed = adsputils.get_date(year_zero)

        is_complete = all([bib_data_updated, orcid_claims_updated, nonbib_data_updated])

        if is_complete or (force is True and bib_data_updated):
            # skip when every piece predates the last processing run
            # (unless forced)
            if force is False and all([
                    augments_updated and augments_updated < processed,
                    bib_data_updated and bib_data_updated < processed,
                    nonbib_data_updated and nonbib_data_updated < processed,
                    orcid_claims_updated and orcid_claims_updated < processed
                    ]):
                logger.debug('Nothing to do for %s, it was already indexed/processed', bibcode)
                continue

            if force:
                logger.debug('Forced indexing of: %s (metadata=%s, orcid=%s, nonbib=%s, fulltext=%s, metrics=%s, augments=%s)' % \
                    (bibcode, bib_data_updated, orcid_claims_updated, nonbib_data_updated, fulltext_updated, \
                     metrics_updated, augments_updated))

            # build the solr record
            if update_solr:
                d = solr_updater.transform_json_record(r)
                logger.debug('Built SOLR: %s', d)
                # checksum guard avoids re-pushing identical documents
                if ignore_checksums or r.get('solr_checksum', None) != app.checksum(d):
                    batch.append(d)
                else:
                    logger.debug('Checksum identical, skipping solr update for: %s', bibcode)

            # get data for metrics
            if update_metrics:
                m = r.get('metrics', None)
                if (m and ignore_checksums) or (m and r.get('metrics_checksum', None) != app.checksum(m)):
                    m['bibcode'] = bibcode
                    logger.debug('Got metrics: %s', m)
                    # 'processed' present -> row already exists in metrics db
                    if r.get('processed'):
                        batch_update.append(m)
                    else:
                        batch_insert.append(m)
                else:
                    logger.debug('Checksum identical, skipping metrics update for: %s', bibcode)

            if update_links and links_url:
                links = app.generate_links_for_resolver(r)
                if links:
                    checksum = app.checksum(links)
                    if ignore_checksums or r.get('datalinks_checksum', None) != checksum:
                        links_data.append(links)
        else:
            # if forced and we have at least the bib data, index it
            if force is True:
                logger.warn('%s is missing bib data, even with force=True, this cannot proceed', bibcode)
            else:
                logger.debug('%s not ready for indexing yet (metadata=%s, orcid=%s, nonbib=%s, fulltext=%s, metrics=%s, augments=%s)' % \
                    (bibcode, bib_data_updated, orcid_claims_updated, nonbib_data_updated, fulltext_updated, \
                     metrics_updated, augments_updated))

    # single fan-out call handles solr, metrics and link-resolver updates
    if batch or batch_insert or batch_update or links_data:
        app.update_remote_targets(solr=batch, metrics=(batch_insert, batch_update), links=links_data, commit_solr=commit, solr_urls=solr_targets, update_timestamps=update_timestamps)
def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True, update_links=True, commit=False, ignore_checksums=False, solr_targets=None, update_processed=True, priority=0):
    """Receives bibcodes that need production store updated
    Receives bibcodes and checks the database if we have all the
    necessary pieces to push to production store. If not, then postpone and
    send later.

    we consider a record to be ready for solr if these pieces were updated
    (and were updated later than the last 'processed' timestamp):

        - bib_data
        - nonbib_data
        - orcid_claims

    if the force flag is true only bib_data is needed

    for solr, 'fulltext' is not considered essential; but updates to fulltext will
    trigger a solr_update (so it might happen that a document will get
    indexed twice; first with only metadata and later on incl fulltext)

    :param update_processed: forwarded to the downstream indexing tasks
    :param priority: NOTE(review) currently unused in this body — confirm
        whether it should be passed to apply_async
    """
    if isinstance(bibcodes, basestring):
        bibcodes = [bibcodes]

    if not (update_solr or update_metrics or update_links):
        raise Exception('Hmmm, I dont think I let you do NOTHING, sorry!')

    logger.debug('Running index-records for: %s', bibcodes)
    # per-target payload lists; checksums travel in parallel lists so the
    # downstream tasks can persist them after a successful push
    solr_records = []
    metrics_records = []
    links_data_records = []
    solr_records_checksum = []
    metrics_records_checksum = []
    links_data_records_checksum = []
    links_url = app.conf.get('LINKS_RESOLVER_UPDATE_URL')

    if update_solr:
        fields = None  # Load all the fields since solr records grab data from almost everywhere
    else:
        # Optimization: load only fields that will be used
        fields = ['bibcode', 'augments_updated', 'bib_data_updated', 'fulltext_updated', 'metrics_updated', 'nonbib_data_updated', 'orcid_claims_updated', 'processed']
        if update_metrics:
            fields += ['metrics', 'metrics_checksum']
        if update_links:
            fields += ['nonbib_data', 'bib_data', 'datalinks_checksum']

    # check if we have complete record
    for bibcode in bibcodes:
        r = app.get_record(bibcode, load_only=fields)

        if r is None:
            logger.error('The bibcode %s doesn\'t exist!', bibcode)
            continue

        augments_updated = r.get('augments_updated', None)
        bib_data_updated = r.get('bib_data_updated', None)
        fulltext_updated = r.get('fulltext_updated', None)
        metrics_updated = r.get('metrics_updated', None)
        nonbib_data_updated = r.get('nonbib_data_updated', None)
        orcid_claims_updated = r.get('orcid_claims_updated', None)

        # unprocessed records compare against a 1972 epoch so every piece
        # looks newer than 'processed' (i.e. the record is indexable)
        year_zero = '1972'
        processed = r.get('processed', adsputils.get_date(year_zero))
        if processed is None:
            processed = adsputils.get_date(year_zero)

        is_complete = all([bib_data_updated, orcid_claims_updated, nonbib_data_updated])

        if is_complete or (force is True and bib_data_updated):
            # nothing changed since the last processing run -> skip
            # (unless forced)
            if force is False and all([
                    augments_updated and augments_updated < processed,
                    bib_data_updated and bib_data_updated < processed,
                    nonbib_data_updated and nonbib_data_updated < processed,
                    orcid_claims_updated and orcid_claims_updated < processed
                    ]):
                logger.debug('Nothing to do for %s, it was already indexed/processed', bibcode)
                continue

            if force:
                logger.debug('Forced indexing of: %s (metadata=%s, orcid=%s, nonbib=%s, fulltext=%s, metrics=%s, augments=%s)' % (bibcode, bib_data_updated, orcid_claims_updated, nonbib_data_updated, fulltext_updated, metrics_updated, augments_updated))

            # build the solr record
            if update_solr:
                solr_payload = solr_updater.transform_json_record(r)
                logger.debug('Built SOLR: %s', solr_payload)
                solr_checksum = app.checksum(solr_payload)
                if ignore_checksums or r.get('solr_checksum', None) != solr_checksum:
                    solr_records.append(solr_payload)
                    solr_records_checksum.append(solr_checksum)
                else:
                    logger.debug('Checksum identical, skipping solr update for: %s', bibcode)

            # get data for metrics
            if update_metrics:
                metrics_payload = r.get('metrics', None)
                # checksum of '' when no metrics exist keeps the call safe
                metrics_checksum = app.checksum(metrics_payload or '')
                if (metrics_payload and ignore_checksums) or (metrics_payload and r.get('metrics_checksum', None) != metrics_checksum):
                    metrics_payload['bibcode'] = bibcode
                    logger.debug('Got metrics: %s', metrics_payload)
                    metrics_records.append(metrics_payload)
                    metrics_records_checksum.append(metrics_checksum)
                else:
                    logger.debug('Checksum identical or no metrics data available, skipping metrics update for: %s', bibcode)

            if update_links and links_url:
                datalinks_payload = app.generate_links_for_resolver(r)
                if datalinks_payload:
                    datalinks_checksum = app.checksum(datalinks_payload)
                    if ignore_checksums or r.get('datalinks_checksum', None) != datalinks_checksum:
                        links_data_records.append(datalinks_payload)
                        links_data_records_checksum.append(datalinks_checksum)
        else:
            # if forced and we have at least the bib data, index it
            if force is True:
                logger.warn('%s is missing bib data, even with force=True, this cannot proceed', bibcode)
            else:
                logger.debug('%s not ready for indexing yet (metadata=%s, orcid=%s, nonbib=%s, fulltext=%s, metrics=%s, augments=%s)' % (bibcode, bib_data_updated, orcid_claims_updated, nonbib_data_updated, fulltext_updated, metrics_updated, augments_updated))

    # fan out to the three asynchronous indexing tasks; each receives its
    # payloads plus the matching checksums to store on success
    if solr_records:
        task_index_solr.apply_async(args=(solr_records, solr_records_checksum,), kwargs={'commit': commit, 'solr_targets': solr_targets, 'update_processed': update_processed})
    if metrics_records:
        task_index_metrics.apply_async(args=(metrics_records, metrics_records_checksum,), kwargs={'update_processed': update_processed})
    if links_data_records:
        task_index_data_links_resolver.apply_async(args=(links_data_records, links_data_records_checksum,), kwargs={'update_processed': update_processed})
def task_index_records(bibcodes, force=False, update_solr=True, update_metrics=True, commit=False):
    """
    This task is (normally) called by the cronjob task
    (that one, quite obviously, is in turn started by cron)

    Receives the bibcode of a document that was updated.
    (note: we could have sent the full record however we don't
    do it because the messages might be delayed and we can have
    multiple workers updating the same record; so we want to
    look into the database and get the most recent version)

    Receives bibcodes and checks the database if we have all the
    necessary pieces to push to solr. If not, then postpone and
    push later.

    We consider a record to be 'ready' if those pieces were updated
    (and were updated later than the last 'processed' timestamp):

        - bib_data
        - nonbib_data
        - orcid_claims

    'fulltext' is not considered essential; but updates to fulltext will
    trigger a solr_update (so it might happen that a document will get
    indexed twice; first with only metadata and later on incl fulltext)
    """
    if not (update_solr or update_metrics):
        raise Exception('Hmmm, I dont think I let you do NOTHING, sorry!')

    logger.debug('Running index-records for: %s', bibcodes)
    batch = []          # solr documents to push
    batch_insert = []   # metrics rows for never-processed records
    batch_update = []   # metrics rows for already-processed records

    #check if we have complete record
    for bibcode in bibcodes:
        r = app.get_record(bibcode)

        if r is None:
            logger.error('The bibcode %s doesn\'t exist!', bibcode)
            continue

        bib_data_updated = r.get('bib_data_updated', None)
        orcid_claims_updated = r.get('orcid_claims_updated', None)
        nonbib_data_updated = r.get('nonbib_data_updated', None)
        fulltext_updated = r.get('fulltext_updated', None)
        metrics_updated = r.get('metrics_updated', None)

        # 1972 epoch guarantees any real update timestamp looks newer
        year_zero = '1972'
        processed = r.get('processed', adsputils.get_date(year_zero))
        if processed is None:
            # It was never sent to Solr
            processed = adsputils.get_date(year_zero)

        is_complete = all([bib_data_updated, orcid_claims_updated, nonbib_data_updated])

        if is_complete or (force is True and bib_data_updated):
            # every essential piece predates the last run -> nothing new
            if force is False and all([
                    bib_data_updated and bib_data_updated < processed,
                    orcid_claims_updated and orcid_claims_updated < processed,
                    nonbib_data_updated and nonbib_data_updated < processed
                    ]):
                logger.debug('Nothing to do for %s, it was already indexed/processed', bibcode)
                continue

            if force:
                logger.warn('Forced indexing of: %s (metadata=%s, orcid=%s, nonbib=%s, fulltext=%s, metrics=%s)' % \
                    (bibcode, bib_data_updated, orcid_claims_updated, nonbib_data_updated, fulltext_updated, \
                     metrics_updated))

            # build the solr record
            if update_solr:
                d = solr_updater.transform_json_record(r)
                logger.debug('Built SOLR: %s', d)
                batch.append(d)

            # get data for metrics
            if update_metrics:
                m = r.get('metrics', None)
                if m:
                    m['bibcode'] = bibcode
                    logger.debug('Got metrics: %s', m)
                    # 'processed' present -> metrics row already exists
                    if r.get('processed'):
                        batch_update.append(m)
                    else:
                        batch_insert.append(m)
        else:
            # if forced and we have at least the bib data, index it
            if force is True:
                logger.warn('%s is missing bib data, even with force=True, this cannot proceed', bibcode)
            else:
                logger.debug('%s not ready for indexing yet', bibcode)

    failed_bibcodes = None
    if len(batch):
        failed_bibcodes = app.reindex(batch, app.conf.get('SOLR_URLS'), commit=commit)

    if failed_bibcodes and len(failed_bibcodes):
        logger.warn('Some bibcodes failed: %s', failed_bibcodes)
        failed_bibcodes = set(failed_bibcodes)

        # when solr_urls > 1, some of the servers may have successfully indexed
        # but here we are refusing to pass data to metrics db; this seems the
        # right choice because there is only one metrics db (but if we had many,
        # then we could differentiate)
        # NOTE(review): filter() returns a list only on Python 2; under
        # Python 3 the len() checks below would fail — confirm target runtime
        batch_insert = filter(lambda x: x['bibcode'] not in failed_bibcodes, batch_insert)
        batch_update = filter(lambda x: x['bibcode'] not in failed_bibcodes, batch_update)

    if len(batch_insert) or len(batch_update):
        app.update_metrics_db(batch_insert, batch_update)
def test_solr_transformer(self):
    """Makes sure we can write recs into the storage."""
    # --- seed the storage with every pipeline payload for one record ---
    self.app.update_storage('bibcode', 'metadata', {u'abstract': u'abstract text',
        u'aff': [u'-', u'-', u'-', u'-'],
        u'alternate_bibcode': [u'2003adass..12..283B'],
        u'author': [u'Blecksmith, E.', u'Paltani, S.', u'Rots, A.', u'Winkelman, S.'],
        u'author_count': 4,
        u'author_facet': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        u'author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.', u'0/Paltani, S', u'1/Paltani, S/Paltani, S.', u'0/Rots, A', u'1/Rots, A/Rots, A.', u'0/Winkelman, S', u'1/Winkelman, S/Winkelman, S.'],
        u'author_norm': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        u'bibcode': u'2003ASPC..295..283B',
        u'bibgroup': [u'CXC', u'CfA'],
        u'bibgroup_facet': [u'CXC', u'CfA'],
        u'bibstem': [u'ASPC', u'ASPC..295'],
        u'bibstem_facet': u'ASPC',
        u'database': [u'astronomy'],
        u'date': u'2003-01-01T00:00:00.000000Z',
        u'doctype': u'inproceedings',
        u'doctype_facet_hier': [u'0/Article', u'1/Article/Proceedings Article'],
        u'email': [u'-', u'-', u'-', u'-'],
        u'first_author': u'Blecksmith, E.',
        u'first_author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.'],
        u'first_author_norm': u'Blecksmith, E',
        u'id': u'1401492',
        u'identifier': [u'2003adass..12..283B'],
        u'links_data': u'',  ### TODO(rca): superconfusing string, but fortunately we are getting ridd of it
        u'orcid_pub': [u'-', u'-', u'-', u'-'],
        u'page': [u'283'],
        u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'],
        u'pub': u'Astronomical Data Analysis Software and Systems XII',
        u'pub_raw': u'Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. Hook, eds., p.283',
        u'pubdate': u'2003-00-00',
        u'title': [u'Chandra Data Archive Download and Usage Database'],
        u'volume': u'295',
        u'year': u'2003'})
    self.app.update_storage('bibcode', 'fulltext', 'fulltext')
    # note: metrics payload deliberately carries a different bibcode value
    self.app.update_storage('bibcode', 'metrics', {"downloads": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 1, 2],
        "bibcode": "2003ASPC..295..361M",
        "reads": [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 2, 5, 1, 0, 0, 1, 0, 0, 2, 4, 5],
        "author_num": 2})
    self.app.update_storage('bibcode', 'orcid_claims', {'authors': ['Blecksmith, E.', 'Paltani, S.', 'Rots, A.', 'Winkelman, S.'],
        'bibcode': '2003ASPC..295..283B',
        'unverified': ['-', '-', '0000-0003-2377-2356', '-']})
    self.app.update_storage('bibcode', 'nonbib_data', {u'authors': [u'Zaus, E', u'Tedde, S', u'Fuerst, J', u'Henseler, D', u'Doehler, G'],
        u'bibcode': u'2007JAP...101d4501Z',
        u'boost': 0.1899999976158142,
        u'citation_count': 6,
        u'citations': [u'2007ApPhL..91g1118P', u'2010ApPhA..99..805K', u'2011TSF...520..610L', u'2012NatCo...3E1175B', u'2014IPTL...26..305A', u'2016ITED...63..197G'],
        u'downloads': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        u'id': 7862455,
        u'norm_cites': 4225,
        u'reads': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 6, 2, 1, 0, 0, 1, 0, 1, 0, 0],
        u'refereed': True,
        u'reference': [u'1977JAP....48.4729M', u'1981psd..book.....S', u'1981wi...book.....S', u'1986PhRvB..33.5545M', u'1987ApPhL..51..913T', u'1992Sci...258.1474S', u'1994IJMPB...8..237S', u'1995Natur.376..498H', u'1995Sci...270.1789Y', u'1998TSF...331...76O', u'1999Natur.397..121F', u'2000JaJAP..39...94P', u'2002ApPhL..81.3885S', u'2004ApPhL..85.3890C', u'2004TSF...451..105S', u'2005PhRvB..72s5208M', u'2006ApPhL..89l3505L']})

    # --- the SOLR transform should merge all pieces into one document ---
    rec = self.app.get_record('bibcode')
    self.assertDictContainsSubset({u'abstract': u'abstract text',
        u'aff': [u'-', u'-', u'-', u'-'],
        u'alternate_bibcode': [u'2003adass..12..283B'],
        u'author': [u'Blecksmith, E.', u'Paltani, S.', u'Rots, A.', u'Winkelman, S.'],
        u'author_count': 4,
        u'author_facet': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        u'author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.', u'0/Paltani, S', u'1/Paltani, S/Paltani, S.', u'0/Rots, A', u'1/Rots, A/Rots, A.', u'0/Winkelman, S', u'1/Winkelman, S/Winkelman, S.'],
        u'author_norm': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        'bibcode': u'2003ASPC..295..283B',
        u'bibgroup': [u'CXC', u'CfA'],
        u'bibgroup_facet': [u'CXC', u'CfA'],
        u'bibstem': [u'ASPC', u'ASPC..295'],
        u'bibstem_facet': u'ASPC',
        'body': u'fulltext',
        'citation': [u'2007ApPhL..91g1118P', u'2010ApPhA..99..805K', u'2011TSF...520..610L', u'2012NatCo...3E1175B', u'2014IPTL...26..305A', u'2016ITED...63..197G'],
        'citation_count': 6,
        'cite_read_boost': 0.1899999976158142,
        u'database': [u'astronomy'],
        u'date': u'2003-01-01T00:00:00.000000Z',
        u'doctype': u'inproceedings',
        u'doctype_facet_hier': [u'0/Article', u'1/Article/Proceedings Article'],
        u'email': [u'-', u'-', u'-', u'-'],
        u'first_author': u'Blecksmith, E.',
        u'first_author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.'],
        u'first_author_norm': u'Blecksmith, E',
        'id': u'1401492',
        u'identifier': [u'2003adass..12..283B'],
        u'links_data': u'',
        'orcid_other' : [u'-', u'-', u'0000-0003-2377-2356', u'-'],
        u'orcid_pub': [u'-', u'-', u'-', u'-'],
        u'page': [u'283'],
        u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'],
        u'pub': u'Astronomical Data Analysis Software and Systems XII',
        u'pub_raw': u'Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. Hook, eds., p.283',
        u'pubdate': u'2003-00-00',
        'reference': [u'1977JAP....48.4729M', u'1981psd..book.....S', u'1981wi...book.....S', u'1986PhRvB..33.5545M', u'1987ApPhL..51..913T', u'1992Sci...258.1474S', u'1994IJMPB...8..237S', u'1995Natur.376..498H', u'1995Sci...270.1789Y', u'1998TSF...331...76O', u'1999Natur.397..121F', u'2000JaJAP..39...94P', u'2002ApPhL..81.3885S', u'2004ApPhL..85.3890C', u'2004TSF...451..105S', u'2005PhRvB..72s5208M', u'2006ApPhL..89l3505L'],
        u'title': [u'Chandra Data Archive Download and Usage Database'],
        u'volume': u'295',
        u'year': u'2003'},
        solr_updater.transform_json_record(rec))

    # --- all mtime fields should reflect the per-pipeline timestamps ---
    for x in Records._date_fields:
        if x in rec:
            rec[x] = get_date('2017-09-19T21:17:12.026474+00:00')
    x = solr_updater.transform_json_record(rec)
    for f in ('metadata_mtime', 'fulltext_mtime', 'orcid_mtime', 'nonbib_mtime', 'metrics_mtime', 'update_timestamp'):
        self.assertEquals(x[f], '2017-09-19T21:17:12.026474Z')

    # a newer orcid claim should bump only orcid_mtime and update_timestamp
    rec['orcid_claims_updated'] = get_date('2017-09-20T21:17:12.026474+00:00')
    x = solr_updater.transform_json_record(rec)
    for f in ('metadata_mtime', 'fulltext_mtime', 'orcid_mtime', 'nonbib_mtime', 'metrics_mtime', 'update_timestamp'):
        if f == 'update_timestamp' or f == 'orcid_mtime':
            self.assertEquals(x[f], '2017-09-20T21:17:12.026474Z')
        else:
            self.assertEquals(x[f], '2017-09-19T21:17:12.026474Z')
def test_solr_transformer(self):
    """Makes sure we can write recs into the storage."""
    # --- seed the storage with every pipeline payload for one record ---
    self.app.update_storage('bibcode', 'metadata', {u'abstract': u'abstract text',
        u'aff': [u'-', u'-', u'-', u'-'],
        u'alternate_bibcode': [u'2003adass..12..283B'],
        u'author': [u'Blecksmith, E.', u'Paltani, S.', u'Rots, A.', u'Winkelman, S.'],
        u'author_count': 4,
        u'author_facet': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        u'author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.', u'0/Paltani, S', u'1/Paltani, S/Paltani, S.', u'0/Rots, A', u'1/Rots, A/Rots, A.', u'0/Winkelman, S', u'1/Winkelman, S/Winkelman, S.'],
        u'author_norm': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        u'bibcode': u'2003ASPC..295..283B',
        u'bibgroup': [u'bibCXC', u'CfA'],
        u'bibgroup_facet': [u'bibCXC', u'CfA'],
        u'bibstem': [u'ASPC', u'ASPC..295'],
        u'bibstem_facet': u'ASPC',
        u'database': [u'astronomy'],
        u'date': u'2003-01-01T00:00:00.000000Z',
        u'doctype': u'inproceedings',
        u'doctype_facet_hier': [u'0/Article', u'1/Article/Proceedings Article'],
        u'editor': [u'Testeditor, Z.'],
        u'email': [u'-', u'-', u'-', u'-'],
        u'first_author': u'Blecksmith, E.',
        u'first_author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.'],
        u'first_author_norm': u'Blecksmith, E',
        u'id': u'1401492',
        u'identifier': [u'2003adass..12..283B'],
        u'links_data': u'',  ### TODO(rca): superconfusing string, but fortunately we are getting ridd of it
        u'orcid_pub': [u'-', u'-', u'-', u'-'],
        u'page': [u'283'],
        #u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'],
        u'pub': u'Astronomical Data Analysis Software and Systems XII',
        u'pub_raw': u'Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. Hook, eds., p.283',
        u'pubdate': u'2003-00-00',
        u'title': [u'Chandra Data Archive Download and Usage Database'],
        u'volume': u'295',
        u'year': u'2003'})
    self.app.update_storage('bibcode', 'fulltext', {'body': 'texttext', 'acknowledgements': 'aaa', 'dataset': ['a', 'b', 'c'], 'facility': ['fac1', 'fac2', 'fac3']})
    self.app.update_storage('bibcode', 'metrics', {"downloads": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 1, 2],
        "bibcode": "2003ASPC..295..361M",
        "reads": [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 2, 5, 1, 0, 0, 1, 0, 0, 2, 4, 5],
        "author_num": 2})
    self.app.update_storage('bibcode', 'orcid_claims', {'authors': ['Blecksmith, E.', 'Paltani, S.', 'Rots, A.', 'Winkelman, S.'],
        'bibcode': '2003ASPC..295..283B',
        'unverified': ['-', '-', '0000-0003-2377-2356', '-']})
    # second metrics write replaces the first payload for this record
    self.app.update_storage('bibcode', 'metrics', {u'citation_num': 6,
        u'citations': [u'2007ApPhL..91g1118P', u'2010ApPhA..99..805K', u'2011TSF...520..610L', u'2012NatCo...3E1175B', u'2014IPTL...26..305A', u'2016ITED...63..197G']})
    self.app.update_storage('bibcode', 'nonbib_data', {u'authors': [u'Zaus, E', u'Tedde, S', u'Fuerst, J', u'Henseler, D', u'Doehler, G'],
        u'bibcode': u'2007JAP...101d4501Z',
        u'bibgroup': [u'CXC', u'CfA'],
        u'bibgroup_facet': [u'CXC', u'CfA'],
        u'boost': 0.1899999976158142,
        u'data': [u'MAST:3', u'SIMBAD:1'],
        u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'],
        u'downloads': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        u'id': 7862455,
        u'norm_cites': 4225,
        u'reads': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 6, 2, 1, 0, 0, 1, 0, 1, 0, 0],
        u'refereed': True,
        u'reference': [u'1977JAP....48.4729M', u'1981psd..book.....S', u'1981wi...book.....S', u'1986PhRvB..33.5545M', u'1987ApPhL..51..913T', u'1992Sci...258.1474S', u'1994IJMPB...8..237S', u'1995Natur.376..498H', u'1995Sci...270.1789Y', u'1998TSF...331...76O', u'1999Natur.397..121F', u'2000JaJAP..39...94P', u'2002ApPhL..81.3885S', u'2004ApPhL..85.3890C', u'2004TSF...451..105S', u'2005PhRvB..72s5208M', u'2006ApPhL..89l3505L'],
        u'simbad_objects': [u'2419335 sim', u'3111723 sim*'],
        u'ned_objects': [u'2419335 HII', u'3111723 ned*'],
        u'grants': [u'2419335 g', u'3111723 g*'],
        u'citation_count': 6,
        u'citation_count_norm': .2,
        })

    # --- without augment data, aff should come straight from bib data ---
    rec = self.app.get_record('bibcode')
    x = solr_updater.transform_json_record(rec)
    # self.assertFalse('aff' in x, 'virtual field should not be in solr output')
    self.assertTrue(x['aff'] == rec['bib_data']['aff'], 'solr record should include aff from bib data when augment is not available')
    self.assertFalse('aff_abbrev' in x, 'augment field should not be in solr record when augment is not available')

    self.app.update_storage('bibcode', 'augment',
        {u'aff': [u'augment pipeline aff', u'-', u'-', u'-'],
         u'aff_abbrev': [u'-', u'-', u'-', u'-'],
         u'aff_canonical': [u'-', u'-', u'-', u'-'],
         u'aff_facet': [u'-', u'-', u'-', u'-'],
         u'aff_facet_hier': [u'-', u'-', u'-', u'-'],
         u'aff_id': [u'-', u'-', u'-', u'-'],
         u'institution': [u'-', u'-', u'-', u'-']})

    # --- the merged document should prefer augment/nonbib over bib data ---
    rec = self.app.get_record('bibcode')
    self.assertDictContainsSubset({u'abstract': u'abstract text',
        u'ack': u'aaa',
        u'aff_abbrev': [u'-', u'-', u'-', u'-'],
        u'aff_canonical': [u'-', u'-', u'-', u'-'],
        u'aff_facet': [u'-', u'-', u'-', u'-'],
        u'aff_facet_hier': [u'-', u'-', u'-', u'-'],
        u'aff_id': [u'-', u'-', u'-', u'-'],
        u'institution': [u'-', u'-', u'-', u'-'],
        u'alternate_bibcode': [u'2003adass..12..283B'],
        u'author': [u'Blecksmith, E.', u'Paltani, S.', u'Rots, A.', u'Winkelman, S.'],
        u'author_count': 4,
        u'author_facet': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        u'author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.', u'0/Paltani, S', u'1/Paltani, S/Paltani, S.', u'0/Rots, A', u'1/Rots, A/Rots, A.', u'0/Winkelman, S', u'1/Winkelman, S/Winkelman, S.'],
        u'author_norm': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        'bibcode': u'2003ASPC..295..283B',
        u'bibgroup': [u'CXC', u'CfA'],
        u'bibgroup_facet': [u'CXC', u'CfA'],
        u'bibstem': [u'ASPC', u'ASPC..295'],
        u'bibstem_facet': u'ASPC',
        'body': u'texttext',
        'citation': [u'2007ApPhL..91g1118P', u'2010ApPhA..99..805K', u'2011TSF...520..610L', u'2012NatCo...3E1175B', u'2014IPTL...26..305A', u'2016ITED...63..197G'],
        'citation_count': 6,
        'citation_count_norm': .2,
        'cite_read_boost': 0.1899999976158142,
        u'data': [u'MAST:3', u'SIMBAD:1'],
        u'data_facet': [u'MAST', u'SIMBAD'],
        u'database': [u'astronomy'],
        #u'dataset': ['a', 'b', 'c'],
        u'date': u'2003-01-01T00:00:00.000000Z',
        u'doctype': u'inproceedings',
        u'doctype_facet_hier': [u'0/Article', u'1/Article/Proceedings Article'],
        u'editor': [u'Testeditor, Z.'],
        u'email': [u'-', u'-', u'-', u'-'],
        u'facility': ['fac1', 'fac2', 'fac3'],
        u'first_author': u'Blecksmith, E.',
        u'first_author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.'],
        u'first_author_norm': u'Blecksmith, E',
        u'id': 1,  # from id in master database records table
        u'identifier': [u'2003adass..12..283B'],
        u'links_data': u'',
        'orcid_other' : [u'-', u'-', u'0000-0003-2377-2356', u'-'],
        u'orcid_pub': [u'-', u'-', u'-', u'-'],
        u'nedid': [u'2419335', u'3111723'],
        u'nedtype': [u'HII Region', u'Other'],
        u'ned_object_facet_hier': [u'0/HII Region', u'1/HII Region/2419335', u'0/Other', u'1/Other/3111723'],
        u'page': [u'283'],
        u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'],
        u'pub': u'Astronomical Data Analysis Software and Systems XII',
        u'pub_raw': u'Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. Hook, eds., p.283',
        u'pubdate': u'2003-00-00',
        u'read_count': 0,
        'reference': [u'1977JAP....48.4729M', u'1981psd..book.....S', u'1981wi...book.....S', u'1986PhRvB..33.5545M', u'1987ApPhL..51..913T', u'1992Sci...258.1474S', u'1994IJMPB...8..237S', u'1995Natur.376..498H', u'1995Sci...270.1789Y', u'1998TSF...331...76O', u'1999Natur.397..121F', u'2000JaJAP..39...94P', u'2002ApPhL..81.3885S', u'2004ApPhL..85.3890C', u'2004TSF...451..105S', u'2005PhRvB..72s5208M', u'2006ApPhL..89l3505L'],
        u'simbid': ['2419335', '3111723'],
        u'simbtype': [u'Other', u'Star'],
        u'simbad_object_facet_hier': [u'0/Other', u'1/Other/2419335', u'0/Star', u'1/Star/3111723'],
        u'title': [u'Chandra Data Archive Download and Usage Database'],
        u'volume': u'295',
        u'year': u'2003'},
        solr_updater.transform_json_record(rec))

    # --- all mtime fields should reflect the per-pipeline timestamps ---
    for x in Records._date_fields:
        if x in rec:
            rec[x] = get_date('2017-09-19T21:17:12.026474+00:00')
    x = solr_updater.transform_json_record(rec)
    for f in ('metadata_mtime', 'fulltext_mtime', 'orcid_mtime', 'nonbib_mtime', 'metrics_mtime', 'update_timestamp'):
        self.assertEqual(x[f], '2017-09-19T21:17:12.026474Z')

    # a newer orcid claim should bump only orcid_mtime and update_timestamp
    rec['orcid_claims_updated'] = get_date('2017-09-20T21:17:12.026474+00:00')
    x = solr_updater.transform_json_record(rec)
    for f in ('metadata_mtime', 'fulltext_mtime', 'orcid_mtime', 'nonbib_mtime', 'metrics_mtime', 'update_timestamp'):
        if f == 'update_timestamp' or f == 'orcid_mtime':
            self.assertEqual(x[f], '2017-09-20T21:17:12.026474Z')
        else:
            self.assertEqual(x[f], '2017-09-19T21:17:12.026474Z')

    # --- augment and nonbib values must win over their bib counterparts ---
    rec = self.app.get_record('bibcode')
    x = solr_updater.transform_json_record(rec)
    self.assertTrue('aff' in x)  # aff is no longer a virtual field
    self.assertEqual(x['aff'], rec['augments']['aff'])  # solr record should prioritize aff data from augment
    self.assertEqual(x['aff_abbrev'], rec['augments']['aff_abbrev'])  # solr record should include augment data
    self.assertEqual(x['bibgroup'], rec['nonbib_data']['bibgroup'])
    self.assertEqual(x['bibgroup_facet'], rec['nonbib_data']['bibgroup_facet'])
def test_links_data_merge(self):
    """Verify how transform_json_record picks links_data between bib and nonbib."""
    transform = solr_updater.transform_json_record

    # only bib data present -> its links_data is carried through
    record = {'bibcode': 'foo',
              'bib_data': {'links_data': ['{"url": "http://asdf"}']},
              'bib_data_updated': datetime.now()}
    out = transform(record)
    self.assertEqual(record['bib_data']['links_data'], out['links_data'])

    # same scenario again (fresh timestamp) -> same outcome
    record = {'bibcode': 'foo',
              'bib_data': {'links_data': ['{"url": "http://asdf"}']},
              'bib_data_updated': datetime.now()}
    out = transform(record)
    self.assertEqual(record['bib_data']['links_data'], out['links_data'])

    # only nonbib data present -> its links_data is carried through
    record = {'bibcode': 'foo',
              'nonbib_data': {'links_data': 'asdf'},
              'nonbib_data_updated': datetime.now()}
    out = transform(record)
    self.assertEqual(record['nonbib_data']['links_data'], out['links_data'])

    # both present, nonbib older than bib -> nonbib still supplies links_data
    record = {'bibcode': 'foo',
              'bib_data': {'links_data': 'asdf'},
              'bib_data_updated': datetime.now(),
              'nonbib_data': {'links_data': 'jkl'},
              'nonbib_data_updated': datetime.now() - timedelta(1)}
    out = transform(record)
    self.assertEqual(record['nonbib_data']['links_data'], out['links_data'])

    # both present, nonbib newer than bib -> nonbib supplies links_data
    record = {'bibcode': 'foo',
              'bib_data': {'links_data': 'asdf'},
              'bib_data_updated': datetime.now() - timedelta(1),
              'nonbib_data': {'links_data': 'jkl'},
              'nonbib_data_updated': datetime.now()}
    out = transform(record)
    self.assertEqual(record['nonbib_data']['links_data'], out['links_data'])

    # an 'open' link should populate the access/esources derived fields
    record = {'bibcode': 'foo',
              'bib_data': {'links_data': ['{"url": "http://foo", "access": "open"}']},
              'bib_data_updated': datetime.now()}
    out = transform(record)
    self.assertTrue('ESOURCE' in out['property'])
    # verify all values are populated
    self.assertTrue('ARTICLE' in out['property'])
    self.assertTrue('NOT REFEREED' in out['property'])
    self.assertTrue('EPRINT_OPENACCESS' in out['property'])
    self.assertTrue('OPENACCESS' in out['property'])
    self.assertTrue('EPRINT_HTML' in out['esources'])
    self.assertTrue('EPRINT_PDF' in out['esources'])

    # a 'closed' link must not be flagged as an ESOURCE
    record = {'bibcode': 'foo',
              'bib_data': {'links_data': ['{"url": "http://foo", "access": "closed"}']},
              'bib_data_updated': datetime.now()}
    out = transform(record)
    self.assertTrue('ESOURCE' not in out['property'])

    # no links at all -> no property field in the solr document
    record = {'bibcode': 'foo',
              'bib_data': {},
              'bib_data_updated': datetime.now()}
    out = transform(record)
    self.assertTrue('property' not in out)