def refetch_orcidids(since=None, orcid_ids=None, **kwargs):
    """
    Gets all orcidids that were updated since time X.

    :param since: ISO 8601 (RFC 3339) formatted string
    :type since: str
    :return: no return
    """
    worker = RabbitMQWorker(params={
        'publish': 'ads.orcid.fresh-claims',
        'exchange': app.config.get('EXCHANGE', 'ads-orcid')
    })
    worker.connect(app.config.get('RABBITMQ_URL'))

    if orcid_ids:
        for oid in orcid_ids.split(','):
            worker.publish({'orcidid': oid, 'force': False})
        if not since:
            print 'Done (just the supplied orcidids)'
            return

    logging.captureWarnings(True)
    if not since or (isinstance(since, basestring) and since.strip() == ""):
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.refetch').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z'

    from_date = get_date(since)
    logger.info('Re-fetching orcidids updated since: {0}'.format(from_date.isoformat()))

    # get all new/updated orcidids from the orcid-service
    orcidids = set(updater.get_all_touched_profiles(from_date.isoformat()))
    from_date = get_date()

    for orcidid in orcidids:
        try:
            worker.publish({'orcidid': orcidid, 'force': False})
        except Exception:
            # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', orcidid
            worker.publish({'orcidid': orcidid, 'force': False})

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.refetch').first()
        if kv is None:
            kv = KeyValue(key='last.refetch', value=from_date.isoformat())
            session.add(kv)
        else:
            kv.value = from_date.isoformat()
        session.commit()

    print 'Done'
    logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))

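# The publish-sleep-retry pattern above (and repeated in the commands below)
# could be factored into a small helper. A minimal sketch, reusing the
# module's existing `time` import; `publish_with_retry` is a hypothetical
# name, not part of the pipeline:
def publish_with_retry(publish, payload, retries=3, wait=2):
    """Call publish(payload), sleeping and retrying on failure.

    Hypothetical helper: mirrors the inline retry logic above, but gives
    up after `retries` attempts instead of retrying exactly once.
    Usage: publish_with_retry(worker.publish, {'orcidid': oid, 'force': False})
    """
    for attempt in range(retries):
        try:
            return publish(payload)
        except Exception:
            if attempt == retries - 1:
                raise  # out of retries; surface the error to the caller
            time.sleep(wait)  # potential backpressure; back off briefly
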
def refetch_orcidids(since=None, orcid_ids=None, **kwargs):
    """
    Gets all orcidids that were updated since time X.

    :param since: ISO 8601 (RFC 3339) formatted string
    :type since: str
    :return: no return
    """
    if orcid_ids:
        for oid in orcid_ids:
            tasks.task_index_orcid_profile.delay({'orcidid': oid, 'force': False})
        if not since:
            print 'Done (just the supplied orcidids)'
            return

    logging.captureWarnings(True)
    if not since or (isinstance(since, basestring) and since.strip() == ""):
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.refetch').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z'

    from_date = get_date(since)
    logger.info('Re-fetching orcidids updated since: {0}'.format(from_date.isoformat()))

    # get all new/updated orcidids from the orcid-service
    orcidids = set(updater.get_all_touched_profiles(app, from_date.isoformat()))
    from_date = get_date()

    for orcidid in orcidids:
        try:
            tasks.task_index_orcid_profile.delay({'orcidid': orcidid, 'force': False})
        except Exception:
            # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', orcidid
            tasks.task_index_orcid_profile.delay({'orcidid': orcidid, 'force': False})

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.refetch').first()
        if kv is None:
            kv = KeyValue(key='last.refetch', value=from_date.isoformat())
            session.add(kv)
        else:
            kv.value = from_date.isoformat()
        session.commit()

    print 'Done'
    logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))

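# Both variants of refetch_orcidids lean on get_date to normalize the
# stored checkpoint. A quick illustration of the expected behaviour,
# assuming get_date is the usual adsputils helper that parses ISO 8601 /
# RFC 3339 strings and returns timezone-aware UTC datetimes (an
# assumption, not verified here). Hypothetical demo, safe to delete:
def _demo_get_date():
    # parse the sentinel checkpoint used throughout this module
    from_date = get_date('1974-11-09T22:56:52.518001Z')
    # with no argument, get the current UTC time
    now = get_date()
    return from_date.isoformat(), now.isoformat()
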
def repush_claims(since=None, **kwargs):
    """
    Re-pushes all recs that were added since date 'X' to the output
    (i.e. forwards them onto the Solr queue)

    :param since: ISO 8601 (RFC 3339) formatted string
    :type since: str
    :return: no return
    """
    logging.captureWarnings(True)
    if not since or (isinstance(since, basestring) and since.strip() == ""):
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.repush').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z'

    from_date = get_date(since)
    logger.info('Re-pushing records since: {0}'.format(from_date.isoformat()))

    worker = RabbitMQWorker(params={
        'publish': 'ads.orcid.output',
        'exchange': app.config.get('EXCHANGE', 'ads-orcid')
    })
    worker.connect(app.config.get('RABBITMQ_URL'))

    num_bibcodes = 0
    with app.session_scope() as session:
        for rec in session.query(Records) \
                .filter(Records.updated >= from_date) \
                .order_by(Records.updated.asc()) \
                .all():
            data = rec.toJSON()
            try:
                worker.publish({'bibcode': data['bibcode'],
                                'authors': data['authors'],
                                'claims': data['claims']})
            except Exception:
                # potential backpressure (we are too fast)
                time.sleep(2)
                print 'Conn problem, retrying...', data['bibcode']
                worker.publish({'bibcode': data['bibcode'],
                                'authors': data['authors'],
                                'claims': data['claims']})
            num_bibcodes += 1

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.repush').first()
        if kv is None:
            kv = KeyValue(key='last.repush', value=get_date().isoformat())
            session.add(kv)
        else:
            kv.value = get_date().isoformat()
        session.commit()

    logger.info('Done processing {0} bibcodes.'.format(num_bibcodes))

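# All of the 'last.*' checkpoints ('last.refetch', 'last.repush',
# 'last.reindex', 'last.check') go through the same KeyValue table. For
# reference, a minimal sketch of what such a model could look like; the
# real definition lives in the pipeline's models module, and the table
# name and column types here are assumptions:
#
#     class KeyValue(Base):
#         __tablename__ = 'storage'
#         key = Column(String(255), primary_key=True)
#         value = Column(Text)
#
# Because `value` is a plain string column, datetimes are always stored
# via .isoformat() and parsed back with get_date().
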
def reindex_claims(since=None, orcid_ids=None, **kwargs):
    """
    Re-runs all claims, both from the pipeline and from the orcid-service
    storage.

    :param since: ISO 8601 (RFC 3339) formatted string
    :type since: str
    :return: no return
    """
    if orcid_ids:
        for oid in orcid_ids:
            tasks.task_index_orcid_profile.delay({'orcidid': oid, 'force': True})
        if not since:
            print 'Done (just the supplied orcidids)'
            return

    logging.captureWarnings(True)
    if not since or (isinstance(since, basestring) and since.strip() == ""):
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.reindex').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z'

    from_date = get_date(since)
    orcidids = set()

    logger.info('Loading records since: {0}'.format(from_date.isoformat()))

    # first re-check our own database (replay the logs)
    with app.session_scope() as session:
        for author in session.query(AuthorInfo.orcidid.distinct().label('orcidid')).all():
            orcidid = author.orcidid
            if orcidid and orcidid.strip() != "":
                try:
                    changed = updater.reindex_all_claims(app, orcidid,
                                                         since=from_date.isoformat(),
                                                         ignore_errors=True)
                    if len(changed):
                        orcidids.add(orcidid)
                        tasks.task_index_orcid_profile.delay({'orcidid': orcidid, 'force': True})
                except Exception:
                    print 'Error processing: {0}'.format(orcidid)
                    traceback.print_exc()
                    continue
                if len(orcidids) % 100 == 0:
                    print 'Done replaying {0} profiles'.format(len(orcidids))

    print 'Now harvesting orcid profiles...'

    # then get all new/old orcidids from orcid-service
    all_orcids = set(updater.get_all_touched_profiles(app, from_date.isoformat()))
    orcidids = all_orcids.difference(orcidids)
    from_date = get_date()

    for orcidid in orcidids:
        try:
            tasks.task_index_orcid_profile.delay({'orcidid': orcidid, 'force': True})
        except Exception:
            # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', orcidid
            tasks.task_index_orcid_profile.delay({'orcidid': orcidid, 'force': True})

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.reindex').first()
        if kv is None:
            kv = KeyValue(key='last.reindex', value=from_date.isoformat())
            session.add(kv)
        else:
            kv.value = from_date.isoformat()
        session.commit()

    print 'Done'
    logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))

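# The set arithmetic at the end of reindex_claims is what keeps a profile
# from being queued twice: anything already re-queued during the replay
# phase is subtracted from the harvested set before the second loop. A toy
# illustration with hypothetical orcidids:
def _demo_reindex_diff():
    replayed = {'0000-0002-0000-0001', '0000-0002-0000-0002'}  # changed locally
    touched = {'0000-0002-0000-0002', '0000-0002-0000-0003'}   # touched upstream
    to_queue = touched.difference(replayed)
    assert to_queue == {'0000-0002-0000-0003'}  # only the not-yet-queued profile
    return to_queue
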
def repush_claims(since=None, orcid_ids=None, **kwargs):
    """
    Re-pushes all recs that were added since date 'X' to the output
    (i.e. forwards them onto the Solr queue)

    :param since: ISO 8601 (RFC 3339) formatted string
    :type since: str
    :return: no return
    """
    if orcid_ids:
        for oid in orcid_ids:
            tasks.task_index_orcid_profile.delay({'orcidid': oid, 'force': False})
        if not since:
            print 'Done (just the supplied orcidids)'
            return

    logging.captureWarnings(True)
    if not since or (isinstance(since, basestring) and since.strip() == ""):
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.repush').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z'

    from_date = get_date(since)
    logger.info('Re-pushing records since: {0}'.format(from_date.isoformat()))

    num_bibcodes = 0
    with app.session_scope() as session:
        for rec in session.query(Records) \
                .filter(Records.updated >= from_date) \
                .order_by(Records.updated.asc()) \
                .all():
            data = rec.toJSON()
            try:
                tasks.task_output_results.delay({'bibcode': data['bibcode'],
                                                 'authors': data['authors'],
                                                 'claims': data['claims']})
            except Exception:
                # potential backpressure (we are too fast)
                time.sleep(2)
                print 'Conn problem, retrying...', data['bibcode']
                tasks.task_output_results.delay({'bibcode': data['bibcode'],
                                                 'authors': data['authors'],
                                                 'claims': data['claims']})
            num_bibcodes += 1

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.repush').first()
        if kv is None:
            kv = KeyValue(key='last.repush', value=get_date().isoformat())
            session.add(kv)
        else:
            kv.value = get_date().isoformat()
        session.commit()

    logger.info('Done processing {0} bibcodes.'.format(num_bibcodes))

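# repush_claims materializes every matching Records row with .all() before
# the loop starts. For a very large backlog, SQLAlchemy's yield_per (a
# standard Query method) streams rows in batches instead; a sketch using
# the same names as the function above:
#
#     for rec in session.query(Records) \
#             .filter(Records.updated >= from_date) \
#             .order_by(Records.updated.asc()) \
#             .yield_per(100):  # fetch in batches of 100 rows
#         data = rec.toJSON()
#         tasks.task_output_results.delay({'bibcode': data['bibcode'],
#                                          'authors': data['authors'],
#                                          'claims': data['claims']})
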
def task_check_orcid_updates(msg):
    """Check the orcid microservice for updated orcid profiles.

    This function is somewhat complex: we are trying to defend against
    multiple executions (assuming that there are many workers and each of
    them can receive its own signal to start processing). Basically, we
    only want to check for updated profiles once.

    The synchronization is done via the database: the worker updates the
    'last.check' timestamp immediately (and we 'optimistically' hope that
    it will be enough to prevent clashes; well - even if that is not a
    strong guarantee, it wouldn't be a tragedy if a profile is checked
    twice...)

    An additional difficulty is time synchronization: the worker can be
    executed as often as you like, but it will refuse to do any work
    unless the time window between the checks is large enough.
    """
    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.check').first()
        if kv is None:
            kv = KeyValue(key='last.check', value='1974-11-09T22:56:52.518001Z')  # force update

        latest_point = adsputils.get_date(kv.value)  # RFC 3339 format
        now = adsputils.get_date()

        total_wait = app.conf.get('ORCID_CHECK_FOR_CHANGES', 60 * 5)  # default is 5 min
        delta = now - latest_point

        if delta.total_seconds() < total_wait:
            # too early; register our own execution in the future
            task_check_orcid_updates.apply_async(
                args=(msg,),
                countdown=(total_wait - delta.total_seconds()) + 1)
        else:
            logger.info('Checking for orcid updates')

            # increase the timestamp by one microsec and get new updates
            latest_point = latest_point + datetime.timedelta(microseconds=1)
            r = app.client.get(
                app.conf.get('API_ORCID_UPDATES_ENDPOINT') % latest_point.isoformat(),
                params={'fields': ['orcid_id', 'updated', 'created']},
                headers={'Authorization': 'Bearer {0}'.format(app.conf.get('API_TOKEN'))})

            if r.status_code != 200:
                logger.error('Failed getting {0}\n{1}'.format(
                    app.conf.get('API_ORCID_UPDATES_ENDPOINT') % kv.value,
                    r.text))
                msg['errcount'] = msg.get('errcount', 0) + 1
                # schedule a future execution offset by the number of errors
                # (rca: do exponential backoff?)
                task_check_orcid_updates.apply_async(
                    args=(msg,),
                    countdown=total_wait + total_wait * msg['errcount'])
                return

            if r.text.strip() == "":
                return task_check_orcid_updates.apply_async(args=(msg,),
                                                            countdown=total_wait)

            data = r.json()
            if len(data) == 0:
                return task_check_orcid_updates.apply_async(args=(msg,),
                                                            countdown=total_wait)

            msg['errcount'] = 0  # success, we got data from the api; reset the counter

            # we received the data, immediately update the database (so that
            # other processes don't ask for the same starting date); data
            # should be ordered by date updated (but to be sure, let's check
            # it) - we'll save it as the latest 'check point'
            dates = [adsputils.get_date(x['updated']) for x in data]
            dates = sorted(dates, reverse=True)

            kv.value = dates[0].isoformat()
            session.merge(kv)
            session.commit()

            for rec in data:
                payload = {'orcidid': rec['orcid_id'],
                           'start': latest_point.isoformat()}
                task_index_orcid_profile.delay(payload)

            # schedule the next check
            task_check_orcid_updates.apply_async(args=(msg,), countdown=total_wait)

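# The rescheduling arithmetic in task_check_orcid_updates is easier to see
# with concrete numbers; the values below are hypothetical, using the
# 300-second default of ORCID_CHECK_FOR_CHANGES. Demo only, safe to delete:
def _demo_check_window():
    total_wait = 300  # ORCID_CHECK_FOR_CHANGES, in seconds
    elapsed = 120     # delta.total_seconds(): time since the last check

    # too early: re-queue to wake up just after the window opens
    countdown = (total_wait - elapsed) + 1
    assert countdown == 181

    # on API errors the wait grows linearly with the error count
    errcount = 2
    retry_in = total_wait + total_wait * errcount
    assert retry_in == 900
    return countdown, retry_in
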
def reindex_claims(since=None, **kwargs):
    """
    Re-runs all claims, both from the pipeline and from the orcid-service
    storage.

    :param since: ISO 8601 (RFC 3339) formatted string
    :type since: str
    :return: no return
    """
    logging.captureWarnings(True)
    if not since or (isinstance(since, basestring) and since.strip() == ""):
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.reindex').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z'

    from_date = get_date(since)
    orcidids = set()

    # trigger re-indexing
    worker = RabbitMQWorker(params={
        'publish': 'ads.orcid.fresh-claims',
        'exchange': app.config.get('EXCHANGE', 'ads-orcid')
    })
    worker.connect(app.config.get('RABBITMQ_URL'))

    logger.info('Loading records since: {0}'.format(from_date.isoformat()))

    # first re-check our own database (replay the logs)
    with app.session_scope() as session:
        for author in session.query(AuthorInfo.orcidid.distinct().label('orcidid')).all():
            orcidid = author.orcidid
            if orcidid and orcidid.strip() != "":
                try:
                    changed = updater.reindex_all_claims(orcidid,
                                                         since=from_date.isoformat(),
                                                         ignore_errors=True)
                    if len(changed):
                        orcidids.add(orcidid)
                        worker.publish({'orcidid': orcidid, 'force': True})
                except Exception:
                    print 'Error processing: {0}'.format(orcidid)
                    traceback.print_exc()
                    continue
                if len(orcidids) % 100 == 0:
                    print 'Done replaying {0} profiles'.format(len(orcidids))

    print 'Now harvesting orcid profiles...'

    # then get all new/old orcidids from orcid-service
    all_orcids = set(updater.get_all_touched_profiles(from_date.isoformat()))
    orcidids = all_orcids.difference(orcidids)
    from_date = get_date()

    for orcidid in orcidids:
        try:
            worker.publish({'orcidid': orcidid, 'force': True})
        except Exception:
            # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', orcidid
            worker.publish({'orcidid': orcidid, 'force': True})

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.reindex').first()
        if kv is None:
            kv = KeyValue(key='last.reindex', value=from_date.isoformat())
            session.add(kv)
        else:
            kv.value = from_date.isoformat()
        session.commit()

    print 'Done'
    logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))

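# These maintenance entry points are typically driven from a small
# command-line wrapper. A hypothetical sketch; the flag names below are
# illustrative, not the pipeline's actual CLI:
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='ORCID pipeline maintenance')
    parser.add_argument('-r', '--reindex', action='store_true',
                        help='re-run all claims since the last checkpoint')
    parser.add_argument('-p', '--repush', action='store_true',
                        help='re-push recent records to the output/Solr queue')
    parser.add_argument('-f', '--refetch', action='store_true',
                        help='re-fetch profiles updated since the last checkpoint')
    parser.add_argument('-s', '--since', default=None,
                        help='ISO 8601 date to start from (overrides the stored checkpoint)')
    args = parser.parse_args()

    if args.reindex:
        reindex_claims(since=args.since)
    if args.repush:
        repush_claims(since=args.since)
    if args.refetch:
        refetch_orcidids(since=args.since)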