def refetch_orcidids(since=None, orcid_ids=None, **kwargs): """ Gets all orcidids that were updated since time X. :param: since - RFC889 formatted string :type: str :return: no return """ worker = RabbitMQWorker(params={ 'publish': 'ads.orcid.fresh-claims', 'exchange': app.config.get('EXCHANGE', 'ads-orcid') }) worker.connect(app.config.get('RABBITMQ_URL')) if orcid_ids: for oid in orcid_ids.split(','): worker.publish({'orcidid': oid, 'force': False}) if not since: print 'Done (just the supplied orcidids)' return logging.captureWarnings(True) if not since or isinstance(since, basestring) and since.strip() == "": with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.refetch').first() if kv is not None: since = kv.value else: since = '1974-11-09T22:56:52.518001Z' from_date = get_date(since) logger.info('Re-fetching orcidids updated since: {0}'.format(from_date.isoformat())) # then get all new/old orcidids from orcid-service orcidids = set(updater.get_all_touched_profiles(from_date.isoformat())) from_date = get_date() for orcidid in orcidids: try: worker.publish({'orcidid': orcidid, 'force': False}) except: # potential backpressure (we are too fast) time.sleep(2) print 'Conn problem, retrying...', orcidid worker.publish({'orcidid': orcidid, 'force': False}) with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.refetch').first() if kv is None: kv = KeyValue(key='last.refetch', value=from_date.isoformat()) session.add(kv) else: kv.value = from_date.isoformat() session.commit() print 'Done' logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
def repush_claims(since=None, **kwargs): """ Re-pushes all recs that were added since date 'X' to the output (i.e. forwards them onto the Solr queue) :param: since - RFC889 formatted string :type: str :return: no return """ logging.captureWarnings(True) if not since or isinstance(since, basestring) and since.strip() == "": with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.repush').first() if kv is not None: since = kv.value else: since = '1974-11-09T22:56:52.518001Z' from_date = get_date(since) orcidids = set() logger.info('Re-pushing records since: {0}'.format(from_date.isoformat())) worker = RabbitMQWorker(params={ 'publish': 'ads.orcid.output', 'exchange': app.config.get('EXCHANGE', 'ads-orcid') }) worker.connect(app.config.get('RABBITMQ_URL')) num_bibcodes = 0 with app.session_scope() as session: for rec in session.query(Records) \ .filter(Records.updated >= from_date) \ .order_by(Records.updated.asc()) \ .all(): data = rec.toJSON() try: worker.publish({'bibcode': data['bibcode'], 'authors': data['authors'], 'claims': data['claims']}) except: # potential backpressure (we are too fast) time.sleep(2) print 'Conn problem, retrying ', data['bibcode'] worker.publish({'bibcode': data['bibcode'], 'authors': data['authors'], 'claims': data['claims']}) num_bibcodes += 1 with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.repush').first() if kv is None: kv = KeyValue(key='last.repush', value=get_date()) session.add(kv) else: kv.value = get_date() session.commit() logger.info('Done processing {0} orcid ids.'.format(num_bibcodes))
def test_get_date(self):
    """Check we always work with UTC dates"""
    # no argument: 'now', already in UTC
    d = utils.get_date()
    self.assertEqual(d.tzname(), 'UTC')

    # an explicitly UTC ('Z'-suffixed) timestamp stays UTC
    d1 = utils.get_date('2009-09-04T01:56:35.450686Z')
    self.assertEqual(d1.tzname(), 'UTC')
    self.assertEqual(d1.isoformat(), '2009-09-04T01:56:35.450686+00:00')

    # an offset timestamp is converted into UTC
    d2 = utils.get_date('2009-09-03T20:56:35.450686-05:00')
    self.assertEqual(d2.tzname(), 'UTC')
    self.assertEqual(d2.isoformat(), '2009-09-04T01:56:35.450686+00:00')

    # a naive timestamp is assumed to already be UTC (no shifting)
    d3 = utils.get_date('2009-09-03T20:56:35.450686')
    self.assertEqual(d3.tzname(), 'UTC')
    self.assertEqual(d3.isoformat(), '2009-09-03T20:56:35.450686+00:00')
def test_dates(self):
    '''We want to use only UTC dates'''
    # Assigning a naive datetime (datetime.now()) to Records.updated is
    # expected to raise -- presumably the model/session rejects non-UTC
    # (tz-naive) values on commit; NOTE(review): the regexp 'ValueError'
    # is matched against the raised exception's message, confirm the
    # model actually surfaces it that way.
    with self.assertRaisesRegexp(Exception, 'ValueError'):
        with app.session_scope() as session:
            rec = Records(bibcode='foo', created='2009-09-03T20:56:35.450686Z')
            session.add(rec)
            rec.updated = datetime.now()
            session.commit()
    # The same record commits fine when the timestamp comes from
    # get_date(), which produces a tz-aware UTC datetime.
    with app.session_scope() as session:
        rec = Records(bibcode='foo', created='2009-09-03T20:56:35.450686Z')
        session.add(rec)
        rec.updated = get_date()
        session.commit()
def reindex_claims(since=None, **kwargs): """ Re-runs all claims, both from the pipeline and from the orcid-service storage. :param: since - RFC889 formatted string :type: str :return: no return """ logging.captureWarnings(True) if not since or isinstance(since, basestring) and since.strip() == "": with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.reindex').first() if kv is not None: since = kv.value else: since = '1974-11-09T22:56:52.518001Z' from_date = get_date(since) orcidids = set() # trigger re-indexing worker = RabbitMQWorker(params={ 'publish': 'ads.orcid.fresh-claims', 'exchange': app.config.get('EXCHANGE', 'ads-orcid') }) worker.connect(app.config.get('RABBITMQ_URL')) logger.info('Loading records since: {0}'.format(from_date.isoformat())) # first re-check our own database (replay the logs) with app.session_scope() as session: for author in session.query(AuthorInfo.orcidid.distinct().label('orcidid')).all(): orcidid = author.orcidid if orcidid and orcidid.strip() != "": try: changed = updater.reindex_all_claims(orcidid, since=from_date.isoformat(), ignore_errors=True) if len(changed): orcidids.add(orcidid) worker.publish({'orcidid': orcidid, 'force': True}) except: print 'Error processing: {0}'.format(orcidid) traceback.print_exc() continue if len(orcidids) % 100 == 0: print 'Done replaying {0} profiles'.format(len(orcidids)) print 'Now harvesting orcid profiles...' 
# then get all new/old orcidids from orcid-service all_orcids = set(updater.get_all_touched_profiles(from_date.isoformat())) orcidids = all_orcids.difference(orcidids) from_date = get_date() for orcidid in orcidids: try: worker.publish({'orcidid': orcidid, 'force': True}) except: # potential backpressure (we are too fast) time.sleep(2) print 'Conn problem, retrying...', orcidid worker.publish({'orcidid': orcidid, 'force': True}) with app.session_scope() as session: kv = session.query(KeyValue).filter_by(key='last.reindex').first() if kv is None: kv = KeyValue(key='last.reindex', value=from_date.isoformat()) session.add(kv) else: kv.value = from_date.isoformat() session.commit() print 'Done' logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))