Example #1
File: run.py  Project: adsabs/ADSOrcid
def refetch_orcidids(since=None, orcid_ids=None, **kwargs):
    """
    Gets all orcidids that were updated since time X and queues
    them as fresh claims.
    
    :param since: ISO 8601 (RFC 3339) formatted string
    :type since: str
    :param orcid_ids: comma-separated list of orcidids to queue directly (optional)
    :type orcid_ids: str
    
    :return: no return
    """
    worker = RabbitMQWorker(params={
        'publish': 'ads.orcid.fresh-claims',
        'exchange': app.config.get('EXCHANGE', 'ads-orcid')
    })
    worker.connect(app.config.get('RABBITMQ_URL'))
    if orcid_ids:
        for oid in orcid_ids.split(','):
            worker.publish({'orcidid': oid, 'force': False})
        if not since:
            print 'Done (just the supplied orcidids)'
            return
    
    
    logging.captureWarnings(True)
    if not since or isinstance(since, basestring) and since.strip() == "":
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.refetch').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z' 
    
    from_date = get_date(since)
    
    logger.info('Re-fetching orcidids updated since: {0}'.format(from_date.isoformat()))
    
        
    # get all new/old orcidids from orcid-service
    orcidids = set(updater.get_all_touched_profiles(from_date.isoformat()))
    from_date = get_date()
    
      
    for orcidid in orcidids:
        try:
            worker.publish({'orcidid': orcidid, 'force': False})
        except: # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', orcidid
            worker.publish({'orcidid': orcidid, 'force': False})
        
    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.refetch').first()
        if kv is None:
            kv = KeyValue(key='last.refetch', value=from_date.isoformat())
            session.add(kv)
        else:
            kv.value = from_date.isoformat()
        session.commit()

    print 'Done'
    logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
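A note on the publish calls above: the try/except around worker.publish handles potential backpressure by sleeping briefly and retrying once, and the same pattern recurs in the commands below. A minimal standalone sketch of that pattern, assuming only an object with a publish(dict) method such as the RabbitMQWorker used here (the helper name publish_with_retry is hypothetical):

import time

def publish_with_retry(worker, payload, wait=2):
    """Publish a message, retrying once after a short pause on failure.

    Mirrors the inline try/except used in refetch_orcidids above; `worker`
    is assumed to expose a publish(dict) method, as RabbitMQWorker does.
    """
    try:
        worker.publish(payload)
    except Exception:  # potential backpressure (we are too fast)
        time.sleep(wait)
        worker.publish(payload)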
Example #2
def repush_claims(since=None, **kwargs):
    """
    Re-pushes all records that were updated since date 'X'
    to the output (i.e. forwards them onto the Solr queue).
    
    :param since: ISO 8601 (RFC 3339) formatted string
    :type since: str
    
    :return: no return
    """
    logging.captureWarnings(True)
    if not since or isinstance(since, basestring) and since.strip() == "":
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.repush').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z' 
    
    from_date = get_date(since)
    orcidids = set()
    
    logger.info('Re-pushing records since: {0}'.format(from_date.isoformat()))
    
    worker = RabbitMQWorker(params={
        'publish': 'ads.orcid.output',
        'exchange': app.config.get('EXCHANGE', 'ads-orcid')
    })
    worker.connect(app.config.get('RABBITMQ_URL'))
    
    num_bibcodes = 0
    with app.session_scope() as session:
        for rec in session.query(Records) \
            .filter(Records.updated >= from_date) \
            .order_by(Records.updated.asc()) \
            .all():
            
            data = rec.toJSON()
            try:
                worker.publish({'bibcode': data['bibcode'], 'authors': data['authors'], 'claims': data['claims']})
            except: # potential backpressure (we are too fast)
                time.sleep(2)
                print 'Conn problem, retrying ', data['bibcode']
                worker.publish({'bibcode': data['bibcode'], 'authors': data['authors'], 'claims': data['claims']})
            num_bibcodes += 1
    
    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.repush').first()
        if kv is None:
            kv = KeyValue(key='last.repush', value=get_date().isoformat())
            session.add(kv)
        else:
            kv.value = get_date().isoformat()
        session.commit()
        
    logger.info('Done processing {0} bibcodes.'.format(num_bibcodes))
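Examples #1, #2 and #5 all persist their progress in the KeyValue table using the same read-or-default / insert-or-update pattern. A minimal sketch of that pattern factored into helpers, assuming the KeyValue model and app.session_scope() shown above (the helper names are hypothetical):

def get_checkpoint(key, default='1974-11-09T22:56:52.518001Z'):
    """Return the stored value for `key`, or `default` if no row exists."""
    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key=key).first()
        return kv.value if kv is not None else default

def set_checkpoint(key, value):
    """Insert or update the KeyValue row for `key`."""
    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key=key).first()
        if kv is None:
            session.add(KeyValue(key=key, value=value))
        else:
            kv.value = value
        session.commit()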
Example #3
    def test_get_date(self):
        """Check we always work with UTC dates"""
        
        d = utils.get_date()
        self.assertTrue(d.tzname() == 'UTC')
        
        d1 = utils.get_date('2009-09-04T01:56:35.450686Z')
        self.assertTrue(d1.tzname() == 'UTC')
        self.assertEqual(d1.isoformat(), '2009-09-04T01:56:35.450686+00:00')
        
        d2 = utils.get_date('2009-09-03T20:56:35.450686-05:00')
        self.assertTrue(d2.tzname() == 'UTC')
        self.assertEqual(d2.isoformat(), '2009-09-04T01:56:35.450686+00:00')

        d3 = utils.get_date('2009-09-03T20:56:35.450686')
        self.assertTrue(d3.tzname() == 'UTC')
        self.assertEqual(d3.isoformat(), '2009-09-03T20:56:35.450686+00:00')
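The assertions above pin down the contract of utils.get_date: called with no argument it returns the current time as a UTC-aware datetime, offset-aware strings are converted to UTC, and naive strings are treated as already being in UTC. A sketch of an implementation that would satisfy these tests (not necessarily the project's actual one), using python-dateutil:

from datetime import datetime
from dateutil import parser, tz

def get_date(timestr=None):
    """Return a timezone-aware datetime in UTC.

    No argument: current UTC time. Otherwise parse `timestr`, treating
    naive timestamps as UTC and converting offset-aware ones to UTC.
    """
    if timestr is None:
        return datetime.utcnow().replace(tzinfo=tz.tzutc())
    d = parser.parse(timestr)
    if d.tzinfo is None:
        return d.replace(tzinfo=tz.tzutc())
    return d.astimezone(tz.tzutc())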
Example #4
    def test_dates(self):
        '''We want to use only UTC dates'''
        
        with self.assertRaisesRegexp(Exception, 'ValueError'):
            with app.session_scope() as session:
                rec = Records(bibcode='foo', created='2009-09-03T20:56:35.450686Z')
                session.add(rec)
                rec.updated = datetime.now()
                session.commit()

        with app.session_scope() as session:
            rec = Records(bibcode='foo', created='2009-09-03T20:56:35.450686Z')
            session.add(rec)
            rec.updated = get_date()
            session.commit()
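The first block expects the commit to fail because rec.updated is set to a naive datetime.now(); only the timezone-aware value from get_date() is accepted. One way a model could enforce that (the actual Records implementation may differ) is a SQLAlchemy TypeDecorator that rejects naive datetimes at bind time:

from sqlalchemy import types

class UTCDateTime(types.TypeDecorator):
    """DateTime column type that only accepts timezone-aware values.

    Hypothetical sketch of how a column like Records.updated could reject
    the naive datetime.now() used in the first block of test_dates above.
    """
    impl = types.DateTime(timezone=True)

    def process_bind_param(self, value, dialect):
        if value is not None and value.tzinfo is None:
            raise ValueError('naive datetime not allowed; use get_date()')
        return value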
Example #5
def reindex_claims(since=None, **kwargs):
    """
    Re-runs all claims, both from the pipeline and
    from the orcid-service storage.
    
    :param since: ISO 8601 (RFC 3339) formatted string
    :type since: str
    
    :return: no return
    """
    logging.captureWarnings(True)
    if not since or isinstance(since, basestring) and since.strip() == "":
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.reindex').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z' 
    
    from_date = get_date(since)
    orcidids = set()
    
    # trigger re-indexing
    worker = RabbitMQWorker(params={
        'publish': 'ads.orcid.fresh-claims',
        'exchange': app.config.get('EXCHANGE', 'ads-orcid')
    })
    worker.connect(app.config.get('RABBITMQ_URL'))
    
    
    logger.info('Loading records since: {0}'.format(from_date.isoformat()))
    
    # first re-check our own database (replay the logs)
    with app.session_scope() as session:
        for author in session.query(AuthorInfo.orcidid.distinct().label('orcidid')).all():
            orcidid = author.orcidid
            if orcidid and orcidid.strip() != "":
                try:
                    changed = updater.reindex_all_claims(orcidid, since=from_date.isoformat(), ignore_errors=True)
                    if len(changed):
                        orcidids.add(orcidid)
                    worker.publish({'orcidid': orcidid, 'force': True})
                except:
                    print 'Error processing: {0}'.format(orcidid)
                    traceback.print_exc()
                    continue
                if len(orcidids) % 100 == 0:
                    print 'Done replaying {0} profiles'.format(len(orcidids))
    
    print 'Now harvesting orcid profiles...'
    
    # then get all new/old orcidids from orcid-service
    all_orcids = set(updater.get_all_touched_profiles(from_date.isoformat()))
    orcidids = all_orcids.difference(orcidids)
    from_date = get_date()
    
      
    for orcidid in orcidids:
        try:
            worker.publish({'orcidid': orcidid, 'force': True})
        except: # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', orcidid
            worker.publish({'orcidid': orcidid, 'force': True})
        
    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.reindex').first()
        if kv is None:
            kv = KeyValue(key='last.reindex', value=from_date.isoformat())
            session.add(kv)
        else:
            kv.value = from_date.isoformat()
        session.commit()

    print 'Done'
    logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
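All three maintenance commands in this file (refetch_orcidids, repush_claims, reindex_claims) share the same calling convention: an optional `since` timestamp that falls back to the checkpoint stored in KeyValue. A purely hypothetical driver snippet, assuming the functions are importable from run.py and using placeholder argument values:

# hypothetical invocations; module path and argument values are assumptions
from run import refetch_orcidids, repush_claims, reindex_claims

# replay everything touched since a given instant (ISO 8601, UTC)
reindex_claims(since='2015-11-05T00:00:00Z')

# let the command fall back to the stored 'last.repush' checkpoint
repush_claims()

# queue specific profiles only, skipping the date-based sweep
refetch_orcidids(orcid_ids='0000-0003-1234-5678,0000-0002-8765-4321')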