Example #1
File: run.py Project: adsabs/ADSOrcid
def refetch_orcidids(since=None, orcid_ids=None, **kwargs):
    """
    Gets all orcidids that were updated since time X.
    
    :param since: RFC 3339 formatted string
    :type since: str

    :return: None
    """
    worker = RabbitMQWorker(params={
        'publish': 'ads.orcid.fresh-claims',
        'exchange': app.config.get('EXCHANGE', 'ads-orcid')
    })
    worker.connect(app.config.get('RABBITMQ_URL'))
    if orcid_ids:
        for oid in orcid_ids.split(','):
            worker.publish({'orcidid': oid, 'force': False})
        if not since:
            print 'Done (just the supplied orcidids)'
            return

    logging.captureWarnings(True)
    if not since or (isinstance(since, basestring) and since.strip() == ""):
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.refetch').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z' 
    
    from_date = get_date(since)
    
    logger.info('Re-fetching orcidids updated since: {0}'.format(from_date.isoformat()))

    # then get all new/old orcidids from orcid-service
    orcidids = set(updater.get_all_touched_profiles(from_date.isoformat()))
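    # advance the checkpoint to 'now' (captured right after the fetch) so
    # that updates arriving while we publish are picked up by the next run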
    from_date = get_date()

    for orcidid in orcidids:
        try:
            worker.publish({'orcidid': orcidid, 'force': False})
        except: # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', orcidid
            worker.publish({'orcidid': orcidid, 'force': False})
        
    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.refetch').first()
        if kv is None:
            kv = KeyValue(key='last.refetch', value=from_date.isoformat())
            session.add(kv)
        else:
            kv.value = from_date.isoformat()
        session.commit()

    print 'Done'
    logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
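
The same get-or-create dance around the KeyValue checkpoint repeats in every example on this page. A minimal sketch of factoring it into a helper, assuming the KeyValue model and app.session_scope() shown above (the upsert_checkpoint name is illustrative, not part of the project):

def upsert_checkpoint(app, key, value):
    """Create or update a KeyValue checkpoint row (e.g. 'last.refetch')."""
    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key=key).first()
        if kv is None:
            kv = KeyValue(key=key, value=value)
            session.add(kv)
        else:
            kv.value = value
        session.commit()

# e.g. upsert_checkpoint(app, 'last.refetch', from_date.isoformat())
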
Example #2
File: run.py Project: csgrant00/ADSOrcid
def refetch_orcidids(since=None, orcid_ids=None, **kwargs):
    """
    Gets all orcidids that were updated since time X.
    
    :param since: RFC 3339 formatted string
    :type since: str

    :return: None
    """
    if orcid_ids:
        for oid in orcid_ids:
            tasks.task_index_orcid_profile.delay({'orcidid': oid, 'force': False})
        if not since:
            print 'Done (just the supplied orcidids)'
            return

    logging.captureWarnings(True)
    if not since or (isinstance(since, basestring) and since.strip() == ""):
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.refetch').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z'

    from_date = get_date(since)
    logger.info('Re-fetching orcidids updated since: {0}'.format(
        from_date.isoformat()))

    # then get all new/old orcidids from orcid-service
    orcidids = set(updater.get_all_touched_profiles(app,
                                                    from_date.isoformat()))
    from_date = get_date()

    for orcidid in orcidids:
        try:
            tasks.task_index_orcid_profile.delay({
                'orcidid': orcidid,
                'force': False
            })
        except:  # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', orcidid
            tasks.task_index_orcid_profile.delay({
                'orcidid': orcidid,
                'force': False
            })

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.refetch').first()
        if kv is None:
            kv = KeyValue(key='last.refetch', value=from_date.isoformat())
            session.add(kv)
        else:
            kv.value = from_date.isoformat()
        session.commit()

    print 'Done'
    logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
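
This variant replaces the RabbitMQWorker of example #1 with Celery tasks. A self-contained sketch of the .delay pattern, assuming a local broker (the app name, broker URL, and task body are illustrative):

from celery import Celery

celery_app = Celery('sketch', broker='pyamqp://guest@localhost//')

@celery_app.task
def index_orcid_profile(message):
    # the worker process would do the actual indexing here
    print('indexing {0}'.format(message['orcidid']))

# .delay() serializes the argument, enqueues the task on the broker, and
# returns an AsyncResult immediately; calling the function directly
# (without .delay) would run it synchronously in this process instead.
index_orcid_profile.delay({'orcidid': '0000-0003-1234-5678', 'force': False})
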
Example #3
def repush_claims(since=None, **kwargs):
    """
    Re-pushes all records that were added since date 'X'
    to the output (i.e. forwards them onto the Solr queue).

    :param since: RFC 3339 formatted string
    :type since: str

    :return: None
    """
    logging.captureWarnings(True)
    if not since or (isinstance(since, basestring) and since.strip() == ""):
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.repush').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z' 
    
    from_date = get_date(since)
    orcidids = set()
    
    logger.info('Re-pushing records since: {0}'.format(from_date.isoformat()))
    
    worker = RabbitMQWorker(params={
        'publish': 'ads.orcid.output',
        'exchange': app.config.get('EXCHANGE', 'ads-orcid')
    })
    worker.connect(app.config.get('RABBITMQ_URL'))
    
    num_bibcodes = 0
    with app.session_scope() as session:
        for rec in session.query(Records) \
            .filter(Records.updated >= from_date) \
            .order_by(Records.updated.asc()) \
            .all():
            
            data = rec.toJSON()
            try:
                worker.publish({'bibcode': data['bibcode'], 'authors': data['authors'], 'claims': data['claims']})
            except: # potential backpressure (we are too fast)
                time.sleep(2)
                print 'Conn problem, retrying ', data['bibcode']
                worker.publish({'bibcode': data['bibcode'], 'authors': data['authors'], 'claims': data['claims']})
            num_bibcodes += 1
    
    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.repush').first()
        if kv is None:
            kv = KeyValue(key='last.repush', value=get_date().isoformat())
            session.add(kv)
        else:
            kv.value = get_date().isoformat()
        session.commit()
        
    logger.info('Done processing {0} orcid ids.'.format(num_bibcodes))
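
Each publish loop above retries a failed publish exactly once after a two-second pause. A sketch of that pattern as a reusable helper (the publish_with_retry name and the configurable retry count are illustrative; the examples themselves hard-code a single retry):

import time

def publish_with_retry(publish, payload, retries=1, wait=2):
    """Call publish(payload); on failure (potential backpressure when we
    enqueue too fast) sleep and retry, re-raising after the last attempt."""
    for attempt in range(retries + 1):
        try:
            return publish(payload)
        except Exception:
            if attempt == retries:
                raise
            time.sleep(wait)

# e.g. publish_with_retry(worker.publish,
#                         {'bibcode': data['bibcode'],
#                          'authors': data['authors'],
#                          'claims': data['claims']})
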
Example #4
File: run.py Project: csgrant00/ADSOrcid
def reindex_claims(since=None, orcid_ids=None, **kwargs):
    """
    Re-runs all claims, both from the pipeline and
    from the orcid-service storage.
    
    :param since: RFC 3339 formatted string
    :type since: str

    :return: None
    """
    if orcid_ids:
        for oid in orcid_ids:
            tasks.task_index_orcid_profile.delay({
                'orcidid': oid,
                'force': True
            })
        if not since:
            print 'Done (just the supplied orcidids)'
            return

    logging.captureWarnings(True)
    if not since or (isinstance(since, basestring) and since.strip() == ""):
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.reindex').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z'

    from_date = get_date(since)
    orcidids = set()

    logger.info('Loading records since: {0}'.format(from_date.isoformat()))

    # first re-check our own database (replay the logs)
    with app.session_scope() as session:
        for author in session.query(
                AuthorInfo.orcidid.distinct().label('orcidid')).all():
            orcidid = author.orcidid
            if orcidid and orcidid.strip() != "":
                try:
                    changed = updater.reindex_all_claims(
                        app,
                        orcidid,
                        since=from_date.isoformat(),
                        ignore_errors=True)
                    if len(changed):
                        orcidids.add(orcidid)
                    tasks.task_index_orcid_profile.delay({
                        'orcidid': orcidid,
                        'force': True
                    })
                except:
                    print 'Error processing: {0}'.format(orcidid)
                    traceback.print_exc()
                    continue
                if len(orcidids) % 100 == 0:
                    print 'Done replaying {0} profiles'.format(len(orcidids))

    print 'Now harvesting orcid profiles...'

    # then get all new/old orcidids from orcid-service
    all_orcids = set(
        updater.get_all_touched_profiles(app, from_date.isoformat()))
    orcidids = all_orcids.difference(orcidids)
    from_date = get_date()

    for orcidid in orcidids:
        try:
            tasks.task_index_orcid_profile.delay({
                'orcidid': orcidid,
                'force': True
            })
        except:  # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', orcidid
            tasks.task_index_orcid_profile.delay({
                'orcidid': orcidid,
                'force': True
            })

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.reindex').first()
        if kv is None:
            kv = KeyValue(key='last.reindex', value=from_date.isoformat())
            session.add(kv)
        else:
            kv.value = from_date.isoformat()
        session.commit()

    print 'Done'
    logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
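
The replay loop relies on a column-level distinct() plus label(), so each result row exposes an .orcidid attribute. A self-contained sketch of that query shape (the table definition and sample rows are stand-ins, not the project's actual AuthorInfo model):

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class AuthorInfo(Base):
    __tablename__ = 'author_info'
    id = Column(Integer, primary_key=True)
    orcidid = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

session.add_all([AuthorInfo(orcidid='0000-0001-0000-0001'),
                 AuthorInfo(orcidid='0000-0001-0000-0001'),
                 AuthorInfo(orcidid='0000-0002-0000-0002')])
session.commit()

# distinct() deduplicates at the SQL level; label() names the result
# column so each row exposes an .orcidid attribute, as in the loop above
for author in session.query(AuthorInfo.orcidid.distinct().label('orcidid')).all():
    print(author.orcidid)  # prints the two distinct ids
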
Example #5
File: run.py Project: csgrant00/ADSOrcid
def repush_claims(since=None, orcid_ids=None, **kwargs):
    """
    Re-pushes all records that were added since date 'X'
    to the output (i.e. forwards them onto the Solr queue).

    :param since: RFC 3339 formatted string
    :type since: str

    :return: None
    """
    if orcid_ids:
        for oid in orcid_ids:
            tasks.task_index_orcid_profile.delay({
                'orcidid': oid,
                'force': False
            })
        if not since:
            print 'Done (just the supplied orcidids)'
            return

    logging.captureWarnings(True)
    if not since or (isinstance(since, basestring) and since.strip() == ""):
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.repush').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z'

    from_date = get_date(since)
    orcidids = set()

    logger.info('Re-pushing records since: {0}'.format(from_date.isoformat()))

    num_bibcodes = 0
    with app.session_scope() as session:
        for rec in session.query(Records) \
            .filter(Records.updated >= from_date) \
            .order_by(Records.updated.asc()) \
            .all():

            data = rec.toJSON()
            try:
                tasks.task_output_results.delay({
                    'bibcode': data['bibcode'],
                    'authors': data['authors'],
                    'claims': data['claims']
                })
            except:  # potential backpressure (we are too fast)
                time.sleep(2)
                print 'Conn problem, retrying ', data['bibcode']
                tasks.task_output_results.delay({
                    'bibcode': data['bibcode'],
                    'authors': data['authors'],
                    'claims': data['claims']
                })
            num_bibcodes += 1

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.repush').first()
        if kv is None:
            kv = KeyValue(key='last.repush', value=get_date().isoformat())
            session.add(kv)
        else:
            kv.value = get_date().isoformat()
        session.commit()

    logger.info('Done processing {0} orcid ids.'.format(num_bibcodes))
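
The scan above loads every matching Records row into memory with .all() before publishing. For large tables, a sketch of the same scan streamed in batches instead; yield_per() is standard SQLAlchemy, the batch size is illustrative, and Records/session/tasks are the names from the example above:

# same filter and ordering as above, but fetch rows in batches of 1000
# rather than materializing the entire result set at once
query = (session.query(Records)
         .filter(Records.updated >= from_date)
         .order_by(Records.updated.asc())
         .yield_per(1000))
for rec in query:
    data = rec.toJSON()
    tasks.task_output_results.delay({'bibcode': data['bibcode'],
                                     'authors': data['authors'],
                                     'claims': data['claims']})
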
Example #6
def task_check_orcid_updates(msg):
    """Check the orcid microservice for updated orcid profiles.

    This function is somewhat complex: we are trying to defend
    against multiple executions (assuming that there are many workers
    and each of them can receive its own signal to start processing).

    Basically, we only want to check for updated profiles once.
    The synchronization is done via the database: the worker
    updates the 'last.check' timestamp immediately, and we
    'optimistically' hope that this is enough to prevent clashes.
    Even if that is not a strong guarantee, it would not be a
    tragedy if a profile is checked twice.

    An additional difficulty is time synchronization: the worker can
    be executed as often as you like, but it will refuse to do any
    work unless the time window between checks is large enough.
    """

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.check').first()
        if kv is None:
            kv = KeyValue(key='last.check',
                          value='1974-11-09T22:56:52.518001Z')  # force update

        latest_point = adsputils.get_date(kv.value)  # RFC 3339 format
        now = adsputils.get_date()

        total_wait = app.conf.get('ORCID_CHECK_FOR_CHANGES',
                                  60 * 5)  # default is 5 minutes
        delta = now - latest_point

        if delta.total_seconds() < total_wait:
            # register our own execution in the future
            task_check_orcid_updates.apply_async(
                args=(msg, ),
                countdown=(total_wait - delta.total_seconds()) + 1)
        else:
            logger.info("Checking for orcid updates")

            # increase the timestamp by one microsec and get new updates
            latest_point = latest_point + datetime.timedelta(microseconds=1)
            r = app.client.get(
                app.conf.get('API_ORCID_UPDATES_ENDPOINT') %
                latest_point.isoformat(),
                params={'fields': ['orcid_id', 'updated', 'created']},
                headers={
                    'Authorization':
                    'Bearer {0}'.format(app.conf.get('API_TOKEN'))
                })

            if r.status_code != 200:
                logger.error('Failed getting {0}\n{1}'.format(
                    app.conf.get('API_ORCID_UPDATES_ENDPOINT') % kv.value,
                    r.text))
                msg['errcount'] = msg.get('errcount', 0) + 1

                # schedule future execution offset by number of errors (rca: do exponential?)
                task_check_orcid_updates.apply_async(
                    args=(msg, ),
                    countdown=total_wait + total_wait * msg['errcount'])
                return

            if r.text.strip() == "":
                return task_check_orcid_updates.apply_async(
                    args=(msg, ), countdown=total_wait)

            data = r.json()

            if len(data) == 0:
                return task_check_orcid_updates.apply_async(
                    args=(msg, ), countdown=total_wait)

            msg['errcount'] = 0  # success, we got data from the api, reset the counter

            # we received the data; immediately update the database (so that
            # other processes don't ask for the same starting date)
            # data should be ordered by date updated (but to be sure, let's
            # check it); we'll save the newest one as the latest 'check point'
            dates = [adsputils.get_date(x['updated']) for x in data]
            dates = sorted(dates, reverse=True)

            kv.value = dates[0].isoformat()
            session.merge(kv)
            session.commit()

            for rec in data:
                payload = {
                    'orcidid': rec['orcid_id'],
                    'start': latest_point.isoformat()
                }
                task_index_orcid_profile.delay(payload)

            # recheck again
            task_check_orcid_updates.apply_async(args=(msg, ),
                                                 countdown=total_wait)
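
The gatekeeping at the top of the task, which refuses to run until the window has elapsed and otherwise reschedules itself, can be read in isolation. A minimal sketch with plain datetimes (the function name is illustrative; the real task compares timezone-aware dates from adsputils.get_date):

import datetime

def window_elapsed(latest_point, total_wait, now=None):
    """Return (True, 0) when at least total_wait seconds have passed
    since latest_point, else (False, countdown) so the caller can
    re-register itself via apply_async(countdown=...)."""
    now = now or datetime.datetime.utcnow()
    delta = (now - latest_point).total_seconds()
    if delta < total_wait:
        return False, (total_wait - delta) + 1
    return True, 0

# e.g. inside the task:
#   ok, countdown = window_elapsed(latest_point, total_wait)
#   if not ok:
#       task_check_orcid_updates.apply_async(args=(msg,), countdown=countdown)
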
Example #7
def reindex_claims(since=None, **kwargs):
    """
    Re-runs all claims, both from the pipeline and
    from the orcid-service storage.
    
    :param since: RFC 3339 formatted string
    :type since: str

    :return: None
    """
    logging.captureWarnings(True)
    if not since or (isinstance(since, basestring) and since.strip() == ""):
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.reindex').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z' 
    
    from_date = get_date(since)
    orcidids = set()
    
    # trigger re-indexing
    worker = RabbitMQWorker(params={
        'publish': 'ads.orcid.fresh-claims',
        'exchange': app.config.get('EXCHANGE', 'ads-orcid')
    })
    worker.connect(app.config.get('RABBITMQ_URL'))

    logger.info('Loading records since: {0}'.format(from_date.isoformat()))
    
    # first re-check our own database (replay the logs)
    with app.session_scope() as session:
        for author in session.query(AuthorInfo.orcidid.distinct().label('orcidid')).all():
            orcidid = author.orcidid
            if orcidid and orcidid.strip() != "":
                try:
                    changed = updater.reindex_all_claims(orcidid, since=from_date.isoformat(), ignore_errors=True)
                    if len(changed):
                        orcidids.add(orcidid)
                    worker.publish({'orcidid': orcidid, 'force': True})
                except:
                    print 'Error processing: {0}'.format(orcidid)
                    traceback.print_exc()
                    continue
                if len(orcidids) % 100 == 0:
                    print 'Done replaying {0} profiles'.format(len(orcidids))
    
    print 'Now harvesting orcid profiles...'
    
    # then get all new/old orcidids from orcid-service
    all_orcids = set(updater.get_all_touched_profiles(from_date.isoformat()))
    orcidids = all_orcids.difference(orcidids)
    from_date = get_date()

    for orcidid in orcidids:
        try:
            worker.publish({'orcidid': orcidid, 'force': True})
        except: # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', orcidid
            worker.publish({'orcidid': orcidid, 'force': True})
        
    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.reindex').first()
        if kv is None:
            kv = KeyValue(key='last.reindex', value=from_date.isoformat())
            session.add(kv)
        else:
            kv.value = from_date.isoformat()
        session.commit()

    print 'Done'
    logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
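
All of these maintenance entry points live in run.py and are typically driven from a shell. A hypothetical argparse wrapper showing how they might be invoked (the flag names and action choices are illustrative, not the project's actual command-line interface):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ADSOrcid maintenance tasks')
    parser.add_argument('action', choices=['refetch', 'reindex', 'repush'])
    parser.add_argument('--since', default=None,
                        help='RFC 3339 timestamp to start from')
    parser.add_argument('--orcid-ids', default=None,
                        help='comma-separated ORCID iDs to process')
    args = parser.parse_args()

    if args.action == 'refetch':
        refetch_orcidids(since=args.since, orcid_ids=args.orcid_ids)
    elif args.action == 'reindex':
        reindex_claims(since=args.since, orcid_ids=args.orcid_ids)
    else:
        repush_claims(since=args.since, orcid_ids=args.orcid_ids)
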