Example #1
 def update_processed_timestamp(self, bibcode):
     with self.session_scope() as session:
         r = session.query(Records).filter_by(bibcode=bibcode).first()
         if r is None:
             raise Exception('Cannot find bibcode {0} to update timestamp'.format(bibcode))
         r.processed = get_date()
         session.commit()
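
These examples rely on a `session_scope()` helper. For context, a minimal sketch of what such a SQLAlchemy context manager usually looks like (the `_session_factory` attribute name is an assumption, not necessarily the library's real attribute):

    from contextlib import contextmanager

    @contextmanager
    def session_scope(self):
        """Provide a transactional scope around a series of operations (sketch)."""
        session = self._session_factory()  # assumed: a sessionmaker() bound to the engine
        try:
            yield session
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()
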
Example #2
 def toJSON(self):
     return {'id': self.id, 
             'key': self.key,
             'created': get_date(self.created).isoformat() if self.created else None,
             'newvalue': self.newvalue,
             'oldvalue': self.oldvalue
             }
Example #3
 def toJSON(self, for_solr=False, load_only=None):
     if for_solr:
         return self
     else:
         load_only = set(load_only) if load_only else set()
         doc = {}
         
         for f in Records._text_fields:
             if load_only and f not in load_only:
                 continue
             doc[f] = getattr(self, f, None)
         for f in Records._date_fields:
             if load_only and f not in load_only:
                 continue
             if hasattr(self, f) and getattr(self, f):
                 doc[f] = get_date(getattr(self, f))
             else:
                 doc[f] = None
         for f in Records._json_fields: # json
             if load_only and f not in load_only:
                 continue
             v = getattr(self, f, None)
             if v:
                 v = json.loads(v)
             doc[f] = v
             
         return doc
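
For illustration only, a hedged usage sketch of the `load_only` filter; the field names are hypothetical and depend on the actual `Records` schema:

    # Serialize only a subset of fields; fields not listed are skipped entirely.
    doc = record.toJSON(load_only=['bibcode', 'bib_data', 'bib_data_updated'])

    # With for_solr=True the ORM object is returned as-is, bypassing serialization.
    raw = record.toJSON(for_solr=True)
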
Example #4
    def update_storage(self, bibcode, **kwargs):
        """Update database record; you can pass in the kwargs
        the payload; only 'data' and fingerprint are considered
        payload. The record will be created if the bibcode is
        seen the first time.
        @param bibcode: bibcode
        @keyword kwargs: dictionary with payload, keys correspond
            to the `Records` attribute
        @return: JSON representation of the record
        """
        with self.session_scope() as session:
            r = session.query(Records).filter_by(bibcode=bibcode).first()
            updated = False
            if r is None:
                r = Records(bibcode=bibcode)
                session.add(r)
            now = get_date()
        
            for k, v in kwargs.items():
                if k == 'fingerprint':
                    r.__setattr__(k, v)
                elif '_data' in k and hasattr(r, k):
                    colname = k.rsplit('_', 1)[0]
                    r.__setattr__(k, v)
                    if getattr(r, '{}_created'.format(colname)) is None:
                        r.__setattr__('{}_created'.format(colname), now)
                    r.__setattr__('{}_updated'.format(colname), now)
                    updated = True
                elif k == 'origin':
                    r.origin = v
                    if v == 'direct':
                        # if the bibcode was already deleted it cannot be re-added by direct
                        d = session.query(ChangeLog) \
                                   .filter_by(oldvalue=bibcode) \
                                   .filter_by(key='deleted') \
                                   .first()
                        if d:
                            # just abort, do not update storage
                            self.logger.warning('direct tried to overwrite deleted bibcode %s' % bibcode)
                            session.rollback()
                            return None
                        if r.direct_created is None:
                            r.direct_created = now
                        r.direct_updated = now
                    updated = True
                    
            if updated:
                r.updated = now
                session.commit()

            return r.toJSON()
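
A hedged usage sketch of `update_storage`; the keyword names follow the `<column>_data` convention the method checks for, but the exact column names are assumptions about the `Records` model:

    # Create or update a record with a metadata payload; the matching
    # <column>_created/<column>_updated timestamps are set inside update_storage.
    doc = app.update_storage('2003ASPC..295..283B',
                             bib_data=json.dumps({'title': ['...']}),
                             fingerprint='abc123')

    # A payload arriving with origin='direct' is refused (returns None)
    # if the bibcode was previously recorded as deleted in the ChangeLog.
    doc = app.update_storage('2003ASPC..295..283B', origin='direct')
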
Example #5
 def process_bind_param(self, value, engine):
     if isinstance(value, basestring):
         return get_date(value).astimezone(tzutc())
     elif value is not None:
         return value.astimezone(
              tzutc())  # will raise an error if value is not a datetime
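
`process_bind_param` belongs to a SQLAlchemy `TypeDecorator`; a minimal sketch of the enclosing type, under the assumption that values are normalized to UTC on the way in and re-attached to UTC on the way out (`get_date` is assumed to come from adsputils):

    from dateutil.tz import tzutc
    from sqlalchemy import types

    class UTCDateTime(types.TypeDecorator):
        """Sketch: store timezone-aware datetimes normalized to UTC."""
        impl = types.TIMESTAMP

        def process_bind_param(self, value, engine):
            if isinstance(value, basestring):  # str on Python 3
                return get_date(value).astimezone(tzutc())
            elif value is not None:
                return value.astimezone(tzutc())  # raises if value is not a datetime

        def process_result_value(self, value, engine):
            # assumption: the database hands back naive datetimes already in UTC
            if value is not None:
                return value.replace(tzinfo=tzutc())
            return value
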
Example #6
    def test_task_process_myads(self):
        msg = {'frequency': 'daily'}

        # can't process without a user ID
        with patch.object(tasks.logger, 'error', return_value=None) as logger:
            tasks.task_process_myads(msg)
            logger.assert_called_with(
                u"No user ID received for {0}".format(msg))

        msg = {'userid': 123}
        self._httpretty_mock_myads_setup(msg)

        # can't process without a frequency
        with patch.object(tasks.logger, 'error', return_value=None) as logger:
            tasks.task_process_myads(msg)
            logger.assert_called_with(
                u"No frequency received for {0}".format(msg))

        # process a user (the user should get created during the task)
        msg = {'userid': 123, 'frequency': 'daily'}

        uri = self.app.conf[
            'API_SOLR_QUERY_ENDPOINT'] + '?q={query}&sort={sort}&fl={fields}&rows={rows}'.format(
                query=quote_plus(
                    'title:"gravity waves" entdate:[2019-08-03 TO 2019-08-04] bibstem:"arxiv"'
                ),
                sort=quote_plus('score desc, bibcode desc'),
                fields='bibcode,title,author_norm,identifier,year,bibstem',
                rows=2000)
        httpretty.register_uri(
            httpretty.GET,
            uri,
            content_type='application/json',
            status=200,
            body=json.dumps({
                'response': {
                    'docs': [{
                        'bibcode':
                        '2019arXiv190800829P',
                        'title': [
                            'Gravitational wave signatures from an extended ' +
                            'inert doublet dark matter model'
                        ],
                        'author_norm':
                        ['Paul, A', 'Banerjee, B', 'Majumdar, D'],
                        "identifier":
                        ["2019arXiv190800829P", "arXiv:1908.00829"],
                        "year":
                        "2019",
                        "bibstem": ["arXiv"]
                    }, {
                        'bibcode':
                        '2019arXiv190800678L',
                        'title': [
                            'Prospects for Gravitational Wave Measurement ' +
                            'of ZTFJ1539+5027'
                        ],
                        'author_norm': ['Littenberg, T', 'Cornish, N'],
                        "identifier":
                        ["2019arXiv190800678L", "arXiv:1908.00678"],
                        "year":
                        "2019",
                        "bibstem": ["arXiv"]
                    }],
                    'numFound':
                    2,
                    'start':
                    0
                },
                'responseHeader': {
                    'QTime': 5,
                    'params': {
                        'fl':
                        'bibcode,title,author_norm,identifier,year,bibstem',
                        'q':
                        'title:"gravity waves" ' +
                        'entdate:[2019-08-03 TO 2019-08-04] bibstem:"arxiv"',
                        'rows':
                        '2000',
                        'start':
                        '0',
                        'wt':
                        'json',
                        'x-amzn-trace-id':
                        'Root=1-5d3b6518-3b417bec5eee25783a4147f4'
                    },
                    'status': 0
                }
            }))

        uri = self.app.conf[
            'API_SOLR_QUERY_ENDPOINT'] + '?q={query}&sort={sort}&fl={fields}&rows={rows}'.format(
                query=quote_plus(
                    'bibstem:arxiv (arxiv_class:(astro-ph.*) (star)) entdate:["2020-01-01Z00:00" TO "2020-01-01Z23:59"] pubdate:[2019-00 TO *]'
                ),
                sort=quote_plus('score desc, bibcode desc'),
                fields='bibcode,title,author_norm,identifier,year,bibstem',
                rows=2000)
        httpretty.register_uri(httpretty.GET,
                               uri,
                               content_type='application/json',
                               status=401)

        with patch.object(self.app, 'get_recent_results') as get_recent_results, \
            patch.object(utils, 'get_user_email') as get_user_email, \
            patch.object(utils, 'payload_to_plain') as payload_to_plain, \
            patch.object(utils, 'payload_to_html') as payload_to_html, \
            patch.object(utils, 'send_email') as send_email, \
            patch.object(tasks.task_process_myads, 'apply_async') as rerun_task:

            get_recent_results.return_value = [
                '2019arXiv190800829P', '2019arXiv190800678L'
            ]
            get_user_email.return_value = '*****@*****.**'
            payload_to_plain.return_value = 'plain payload'
            payload_to_html.return_value = '<em>html payload</em>'
            send_email.return_value = 'this should be a MIMEMultipart object'

            # non-200 from solr, so task should get tried again
            tasks.task_process_myads(msg)
            self.assertTrue(rerun_task.called)

            # Reset httpretty, otherwise there will be two identical registered
            # URIs except that one returns 401 and the other 200
            httpretty.reset()
            self._httpretty_mock_myads_setup(msg)
            uri = self.app.conf[
                'API_SOLR_QUERY_ENDPOINT'] + '?q={query}&sort={sort}&fl={fields}&rows={rows}'.format(
                    query=quote_plus(
                        'bibstem:arxiv (arxiv_class:(astro-ph.*) (star)) entdate:["2020-01-01Z00:00" TO "2020-01-01Z23:59"] pubdate:[2019-00 TO *]'
                    ),
                    sort=quote_plus('score desc, bibcode desc'),
                    fields='bibcode,title,author_norm,identifier,year,bibstem',
                    rows=2000)
            httpretty.register_uri(
                httpretty.GET,
                uri,
                content_type='application/json',
                status=200,
                body=json.dumps({
                    "responseHeader": {
                        "status": 0,
                        "QTime": 23,
                        "params": {
                            "q":
                            'bibstem:arxiv (arxiv_class:(astro-ph.*) (star)) entdate:["2020-01-01Z00:00" TO "2020-01-01Z23:59"] pubdate:[2019-00 TO *]',
                            "x-amzn-trace-id":
                            "Root=1-5d769c6c-f96bfa49d348f03d8ecb7464",
                            "fl": "bibcode,title,author_norm",
                            "start": "0",
                            "sort": "score desc, bibcode desc",
                            "rows": "2000",
                            "wt": "json"
                        }
                    },
                    "response": {
                        "numFound":
                        2712,
                        "start":
                        0,
                        "docs": [{
                            "bibcode":
                            "1971JVST....8..324K",
                            "title": ["High-Capacity Lead Tin Barrel Dome..."],
                            "author_norm": ["Kurtz, J"],
                            "identifier": ["1971JVST....8..324K"],
                            "year":
                            "1971",
                            "bibstem": ["JVST"]
                        }, {
                            "bibcode":
                            "1972ApJ...178..701K",
                            "title": [
                                "Search for Coronal Line Emission from the Cygnus Loop"
                            ],
                            "author_norm":
                            ["Kurtz, D", "Vanden Bout, P", "Angel, J"],
                            "identifier": ["1972ApJ...178..701K"],
                            "year":
                            "1972",
                            "bibstem": ["ApJ"]
                        }, {
                            "bibcode":
                            "1973ApOpt..12..891K",
                            "title":
                            ["Author's Reply to Comments on: Experimental..."],
                            "author_norm": ["Kurtz, R"],
                            "identifier": ["1973ApOpt..12..891K"],
                            "year":
                            "1973",
                            "bibstem": ["ApOpt"]
                        }, {
                            "bibcode":
                            "1973SSASJ..37..725W",
                            "title":
                            ["Priming Effect of 15N-Labeled Fertilizers..."],
                            "author_norm": ["Westerman, R", "Kurtz, L"],
                            "identifier": ["1973SSASJ..37..725W"],
                            "year":
                            "1973",
                            "bibstem": ["SSASJ"]
                        }, {
                            "bibcode":
                            "1965JSpRo...2..818K",
                            "title": [
                                "Orbital tracking and decay analysis of the saturn..."
                            ],
                            "author_norm":
                            ["Kurtz, H", "McNair, A", "Naumcheff, M"],
                            "identifier": ["1965JSpRo...2..818K"],
                            "year":
                            "1965",
                            "bibstem": ["JSpRo"]
                        }]
                    }
                }))
            uri = self.app.conf[
                'API_SOLR_QUERY_ENDPOINT'] + '?q={query}&sort={sort}&fl={fields}&rows={rows}'.format(
                    query=quote_plus(
                        'bibstem:arxiv (arxiv_class:(astro-ph.*) NOT (star)) entdate:["2020-01-01Z00:00" TO "2020-01-01Z23:59"] pubdate:[2019-00 TO *]'
                    ),
                    sort=quote_plus('score desc, bibcode desc'),
                    fields='bibcode,title,author_norm',
                    rows=2000)
            httpretty.register_uri(
                httpretty.GET,
                uri,
                content_type='application/json',
                status=200,
                body=json.dumps({
                    "responseHeader": {
                        "status": 0,
                        "QTime": 23,
                        "params": {
                            "q":
                            'bibstem:arxiv (arxiv_class:(astro-ph.*) NOT (star)) entdate:["2020-01-01Z00:00" TO "2020-01-01Z23:59"] pubdate:[2019-00 TO *]',
                            "x-amzn-trace-id":
                            "Root=1-5d769c6c-f96bfa49d348f03d8ecb7464",
                            "fl": "bibcode,title,author_norm",
                            "start": "0",
                            "sort": "score desc, bibcode desc",
                            "rows": "2000",
                            "wt": "json"
                        }
                    },
                    "response": {
                        "numFound":
                        2712,
                        "start":
                        0,
                        "docs": [{
                            "bibcode":
                            "1971JVST....8..324K",
                            "title": ["High-Capacity Lead Tin Barrel Dome..."],
                            "author_norm": ["Kurtz, J"],
                            "identifier": ["1971JVST....8..324K"],
                            "year":
                            "1971",
                            "bibstem": ["JVST"]
                        }, {
                            "bibcode":
                            "1972ApJ...178..701K",
                            "title": [
                                "Search for Coronal Line Emission from the Cygnus Loop"
                            ],
                            "author_norm":
                            ["Kurtz, D", "Vanden Bout, P", "Angel, J"],
                            "identifier": ["1972ApJ...178..701K"],
                            "year":
                            "1972",
                            "bibstem": ["ApJ"]
                        }, {
                            "bibcode":
                            "1973ApOpt..12..891K",
                            "title":
                            ["Author's Reply to Comments on: Experimental..."],
                            "author_norm": ["Kurtz, R"],
                            "identifier": ["1973ApOpt..12..891K"],
                            "year":
                            "1973",
                            "bibstem": ["ApOpt"]
                        }, {
                            "bibcode":
                            "1973SSASJ..37..725W",
                            "title":
                            ["Priming Effect of 15N-Labeled Fertilizers..."],
                            "author_norm": ["Westerman, R", "Kurtz, L"],
                            "identifier": ["1973SSASJ..37..725W"],
                            "year":
                            "1973",
                            "bibstem": ["SSASJ"]
                        }, {
                            "bibcode":
                            "1965JSpRo...2..818K",
                            "title": [
                                "Orbital tracking and decay analysis of the saturn..."
                            ],
                            "author_norm":
                            ["Kurtz, H", "McNair, A", "Naumcheff, M"],
                            "identifier": ["1965JSpRo...2..818K"],
                            "year":
                            "1965",
                            "bibstem": ["JSpRo"]
                        }]
                    }
                }))

            tasks.task_process_myads(msg)
            with self.app.session_scope() as session:
                user = session.query(AuthorInfo).filter_by(id=123).first()
                self.assertEqual(adsputils.get_date().date(),
                                 user.last_sent_daily.date())

        msg = {'userid': 123, 'frequency': 'daily', 'force': False}

        uri = self.app.conf[
            'API_SOLR_QUERY_ENDPOINT'] + '?q={query}&sort={sort}&fl={fields}&rows={rows}'.format(
                query=quote_plus(
                    'author:Kurtz entdate:["2020-01-01Z00:00" TO "2020-01-01Z23:59"] pubdate:[2019-00 TO *]'
                ),
                sort=quote_plus('score desc, bibcode desc'),
                fields='bibcode,title,author_norm',
                rows=5)
        httpretty.register_uri(
            httpretty.GET,
            uri,
            content_type='application/json',
            status=200,
            body=json.dumps({
                "responseHeader": {
                    "status": 0,
                    "QTime": 23,
                    "params": {
                        "q":
                        'author:Kurtz entdate:["2020-01-01Z00:00" TO "2020-01-01Z23:59"] pubdate:[2019-00 TO *]',
                        "x-amzn-trace-id":
                        "Root=1-5d769c6c-f96bfa49d348f03d8ecb7464",
                        "fl": "bibcode,title,author_norm",
                        "start": "0",
                        "sort": "score desc, bibcode desc",
                        "rows": "5",
                        "wt": "json"
                    }
                },
                "response": {
                    "numFound":
                    2712,
                    "start":
                    0,
                    "docs": [{
                        "bibcode": "1971JVST....8..324K",
                        "title": ["High-Capacity Lead Tin Barrel Dome..."],
                        "author_norm": ["Kurtz, J"],
                        "identifier": ["1971JVST....8..324K"],
                        "year": "1971",
                        "bibstem": ["JVST"]
                    }, {
                        "bibcode":
                        "1972ApJ...178..701K",
                        "title": [
                            "Search for Coronal Line Emission from the Cygnus Loop"
                        ],
                        "author_norm":
                        ["Kurtz, D", "Vanden Bout, P", "Angel, J"],
                        "identifier": ["1972ApJ...178..701K"],
                        "year":
                        "1972",
                        "bibstem": ["ApJ"]
                    }, {
                        "bibcode":
                        "1973ApOpt..12..891K",
                        "title":
                        ["Author's Reply to Comments on: Experimental..."],
                        "author_norm": ["Kurtz, R"],
                        "identifier": ["1973ApOpt..12..891K"],
                        "year":
                        "1973",
                        "bibstem": ["ApOpt"]
                    }, {
                        "bibcode":
                        "1973SSASJ..37..725W",
                        "title":
                        ["Priming Effect of 15N-Labeled Fertilizers..."],
                        "author_norm": ["Westerman, R", "Kurtz, L"],
                        "identifier": ["1973SSASJ..37..725W"],
                        "year":
                        "1973",
                        "bibstem": ["SSASJ"]
                    }, {
                        "bibcode":
                        "1965JSpRo...2..818K",
                        "title": [
                            "Orbital tracking and decay analysis of the saturn..."
                        ],
                        "author_norm":
                        ["Kurtz, H", "McNair, A", "Naumcheff, M"],
                        "identifier": ["1965JSpRo...2..818K"],
                        "year":
                        "1965",
                        "bibstem": ["JSpRo"]
                    }]
                }
            }))

        with patch.object(self.app, 'get_recent_results') as get_recent_results, \
            patch.object(utils, 'get_user_email') as get_user_email, \
            patch.object(utils, 'payload_to_plain') as payload_to_plain, \
            patch.object(utils, 'payload_to_html') as payload_to_html, \
            patch.object(utils, 'send_email') as send_email:

            get_recent_results.return_value = [
                '2019arXiv190800829P', '2019arXiv190800678L'
            ]
            get_user_email.return_value = '*****@*****.**'
            payload_to_plain.return_value = 'plain payload'
            payload_to_html.return_value = '<em>html payload</em>'
            send_email.return_value = 'this should be a MIMEMultipart object'

            # already ran today, tried to run again without force=True
            with patch.object(tasks.logger, 'warning',
                              return_value=None) as logger:
                tasks.task_process_myads(msg)
                logger.assert_called_with(
                    u"Email for user {0} already sent today".format(
                        msg['userid']))

            msg = {'userid': 123, 'frequency': 'weekly'}

            # reset user
            with self.app.session_scope() as session:
                user = session.query(AuthorInfo).filter_by(id=123).first()
                user.last_sent_daily = None
                user.last_sent_weekly = None
                session.add(user)
                session.commit()

            with self.app.session_scope() as session:
                user = session.query(AuthorInfo).filter_by(id=123).first()
                self.assertIsNone(user.last_sent_daily)
                self.assertIsNone(user.last_sent_weekly)

            tasks.task_process_myads(msg)

            with self.app.session_scope() as session:
                user = session.query(AuthorInfo).filter_by(id=123).first()
                self.assertEqual(adsputils.get_date().date(),
                                 user.last_sent_weekly.date())
                self.assertIsNone(user.last_sent_daily)

            # backdate the weekly last_sent date
            with self.app.session_scope() as session:
                user = session.query(AuthorInfo).filter_by(id=123).first()
                user.last_sent_weekly = user.last_sent_weekly - datetime.timedelta(
                    days=30)
                last_sent_weekly = user.last_sent_weekly
                session.add(user)
                session.commit()

            start_date = last_sent_weekly + datetime.timedelta(days=1)
            uri = self.app.conf['API_VAULT_MYADS_SETUP_DATE'] % (msg['userid'],
                                                                 start_date)
            httpretty.register_uri(
                httpretty.GET,
                uri,
                content_type='application/json',
                status=200,
                body=json.dumps([{
                    'id':
                    1,
                    'name':
                    'Query 1',
                    'qid':
                    '1234567890abcdefghijklmnopqrstu1',
                    'active':
                    True,
                    'stateful':
                    True,
                    'frequency':
                    'daily',
                    'type':
                    'query',
                    'template':
                    None,
                    'query': [{
                        'q': 'title:"gravity waves" ' +
                        'entdate:[2019-08-03 TO 2019-08-04] bibstem:"arxiv"',
                        'sort': 'score desc, bibcode desc'
                    }]
                }, {
                    'id':
                    2,
                    'name':
                    'Query 2',
                    'qid':
                    None,
                    'active':
                    True,
                    'stateful':
                    False,
                    'frequency':
                    'weekly',
                    'type':
                    'template',
                    'template':
                    'authors',
                    'data': {
                        'data': 'author:Kurtz'
                    },
                    'query': [{
                        'q':
                        'author:Kurtz entdate:["2020-01-01Z00:00" TO "2020-01-01Z23:59"] pubdate:[2019-00 TO *]',
                        'sort': 'score desc, bibcode desc'
                    }]
                }, {
                    'id':
                    3,
                    'name':
                    'Query 3',
                    'qid':
                    None,
                    'active':
                    True,
                    'stateful':
                    True,
                    'frequency':
                    'daily',
                    'type':
                    'template',
                    'template':
                    'arxiv',
                    'data':
                    'star',
                    'classes': ['astro-ph'],
                    'query': [{
                        'q':
                        'bibstem:arxiv (arxiv_class:(astro-ph.*) (star)) '
                        'entdate:["2020-01-01Z00:00" TO "2020-01-01Z23:59"] pubdate:[2019-00 TO *]',
                        'sort':
                        'score desc, bibcode desc'
                    }, {
                        'q':
                        'bibstem:arxiv (arxiv_class:(astro-ph.*) NOT (star)) '
                        'entdate:["2020-01-01Z00:00" TO "2020-01-01Z23:59"] pubdate:[2019-00 TO *]',
                        'sort':
                        'bibcode desc'
                    }]
                }]))

            tasks.task_process_myads(msg)

            with self.app.session_scope() as session:
                user = session.query(AuthorInfo).filter_by(id=123).first()
                self.assertEqual(adsputils.get_date().date(),
                                 user.last_sent_weekly.date())
                self.assertIsNone(user.last_sent_daily)

            # check that email with no results isn't sent
            uri = self.app.conf[
                'API_SOLR_QUERY_ENDPOINT'] + '?q={query}&sort={sort}&fl={fields}&rows={rows}'.format(
                    query=quote_plus(
                        'author:Kurtz entdate:["2020-01-01Z00:00" TO "2020-01-01Z23:59"] pubdate:[2019-00 TO *]'
                    ),
                    sort=quote_plus('score desc, bibcode desc'),
                    fields='bibcode,title,author_norm',
                    rows=5)
            httpretty.register_uri(
                httpretty.GET,
                uri,
                content_type='application/json',
                status=200,
                body=json.dumps({
                    "responseHeader": {
                        "status": 0,
                        "QTime": 23,
                        "params": {
                            "q":
                            'author:Kurtz entdate:["2020-01-01Z00:00" TO "2020-01-01Z23:59"] pubdate:[2019-00 TO *]',
                            "x-amzn-trace-id":
                            "Root=1-5d769c6c-f96bfa49d348f03d8ecb7464",
                            "fl": "bibcode,title,author_norm",
                            "start": "0",
                            "sort": "score desc, bibcode desc",
                            "rows": "5",
                            "wt": "json"
                        }
                    },
                    "response": {
                        "numFound": 2712,
                        "start": 0,
                        "docs": []
                    }
                }))

            msg = {'userid': 123, 'frequency': 'weekly'}

            # reset user
            with self.app.session_scope() as session:
                user = session.query(AuthorInfo).filter_by(id=123).first()
                user.last_sent_daily = None
                user.last_sent_weekly = None
                session.add(user)
                session.commit()

            with self.app.session_scope() as session:
                user = session.query(AuthorInfo).filter_by(id=123).first()
                self.assertIsNone(user.last_sent_daily)
                self.assertIsNone(user.last_sent_weekly)

            with patch.object(tasks.logger, 'info',
                              return_value=None) as logger:
                tasks.task_process_myads(msg)
                logger.assert_called_with(
                    u"No payload for user {0} for the {1} email. No email was sent."
                    .format(msg['userid'], msg['frequency']))
Example #7
    def test_task_index_orcid_profile(self):

        with patch.object(self.app, 'retrieve_orcid') as retrieve_orcid, \
            patch.object(tasks.requests, 'get') as get, \
            patch.object(self.app, 'get_claims') as get_claims, \
            patch.object(self.app, 'insert_claims') as insert_claims, \
            patch.object(tasks.task_index_orcid_profile, 'apply_async') as task_index_orcid_profile, \
            patch.object(tasks.task_match_claim, 'delay') as next_task:

            r = PropertyMock()
            data = {
                'bibcode': {
                    'status': 'some status',
                    'title': 'some title'
                }
            }
            r.text = str(data)
            r.json = lambda: data
            r.status_code = 200
            get.return_value = r

            get_claims.return_value = (
                {
                    'bibcode1':
                    ('Bibcode1', utils.get_date('2017-01-01'), 'provenance',
                     ['id1', 'id2'], ['Stern, D K', 'author two']),
                    'bibcode2':
                    ('Bibcode2', utils.get_date('2017-01-01'), 'provenance',
                     ['id1', 'id2'], ['author one', 'Stern, D K']),
                    'bibcode3':
                    ('Bibcode3', utils.get_date('2017-01-01'), 'provenance',
                     ['id1', 'id2'], ['Stern, D K', 'author two']),
                },
                {
                    'bibcode1': ('Bibcode1', utils.get_date('2017-01-01')),
                    'bibcode4': ('Bibcode4', utils.get_date('2017-01-01')
                                 ),  # we have it, but the ORCID profile no longer does
                },
                {
                    'bibcode2': ('Bibcode2', utils.get_date('2017-01-01')),
                })
            insert_claims.return_value = [
                {
                    'status': u'#full-import',
                    'bibcode': u'',
                    'created': '2017-05-26T21:29:22.726506+00:00',
                    'provenance': u'OrcidImporter',
                    'orcidid': '0000-0003-3041-2092',
                    'id': None
                },
                {
                    'status': u'claimed',
                    'bibcode': 'Bibcode2',
                    'created': '2017-01-01T00:00:00+00:00',
                    'provenance': u'provenance',
                    'orcidid': '0000-0003-3041-2092',
                    'id': None
                },
                {
                    'status': u'claimed',
                    'bibcode': 'Bibcode3',
                    'created': '2017-01-01T00:00:00+00:00',
                    'provenance': u'provenance',
                    'orcidid': '0000-0003-3041-2092',
                    'id': None
                },
                {
                    'status': u'removed',
                    'bibcode': 'Bibcode4',
                    'created': '2017-05-26T21:29:22.728368+00:00',
                    'provenance': u'OrcidImporter',
                    'orcidid': '0000-0003-3041-2092',
                    'id': None
                },
                {
                    'status': u'unchanged',
                    'bibcode': 'Bibcode1',
                    'created': '2017-01-01T00:00:00+00:00',
                    'provenance': u'OrcidImporter',
                    'orcidid': '0000-0003-3041-2092',
                    'id': None
                },
            ]

            self.assertFalse(next_task.called)

            # check authors can be skipped
            retrieve_orcid.return_value = {
                'status': 'blacklisted',
                'name': u'Stern, D K',
                'facts': {
                    u'author': [u'Stern, D', u'Stern, D K', u'Stern, Daniel'],
                    u'orcid_name': [u'Stern, Daniel'],
                    u'author_norm': [u'Stern, D'],
                    u'name': u'Stern, D K'
                },
                'orcidid': u'0000-0003-2686-9241',
                'id': 1,
                'account_id': None,
                'updated': utils.get_date('2017-01-01')
            }

            tasks.task_index_orcid_profile({'orcidid': '0000-0003-3041-2092'})

            self.assertFalse(next_task.called)

            retrieve_orcid.return_value = {
                'status': None,
                'name': u'Stern, D K',
                'facts': {
                    u'author': [u'Stern, D', u'Stern, D K', u'Stern, Daniel'],
                    u'orcid_name': [u'Stern, Daniel'],
                    u'author_norm': [u'Stern, D'],
                    u'name': u'Stern, D K'
                },
                'orcidid': u'0000-0003-2686-9241',
                'id': 1,
                'account_id': None,
                'updated': utils.get_date('2017-01-01')
            }

            tasks.task_index_orcid_profile({'orcidid': '0000-0003-3041-2092'})

            self.assertTrue(next_task.called)
            self.assertEqual(next_task.call_count, 4)

            self.assertEqual([(x.bibcode, x.status)
                              for x in insert_claims.call_args[0][0]],
                             [(u'', u'#full-import'), ('Bibcode2', u'claimed'),
                              ('Bibcode3', u'claimed'),
                              ('Bibcode4', u'removed'),
                              ('Bibcode1', u'unchanged')])

            self.assertEqual([(x[0][0]['bibcode'], x[0][0]['status'])
                              for x in next_task.call_args_list],
                             [('Bibcode2', u'claimed'),
                              ('Bibcode3', u'claimed'),
                              ('Bibcode4', u'removed'),
                              ('Bibcode1', u'unchanged')])

            self.assertEqual(
                (next_task.call_args_list[0][0][0]['bibcode'],
                 next_task.call_args_list[0][0][0]['author_list']),
                ('Bibcode2', ['author one', 'Stern, D K']))

            self.assertEqual(
                (next_task.call_args_list[0][0][0]['bibcode'],
                 next_task.call_args_list[0][0][0]['identifiers']),
                ('Bibcode2', ['id1', 'id2']))
Example #8
    def test_solr_transformer(self):
        """Makes sure we can write recs into the storage."""

        self.app.update_storage(
            'bibcode',
            'metadata',
            {
                u'abstract':
                u'abstract text',
                u'aff': [u'-', u'-', u'-', u'-'],
                u'alternate_bibcode': [u'2003adass..12..283B'],
                u'author': [
                    u'Blecksmith, E.', u'Paltani, S.', u'Rots, A.',
                    u'Winkelman, S.'
                ],
                u'author_count':
                4,
                u'author_facet':
                [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
                u'author_facet_hier': [
                    u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.',
                    u'0/Paltani, S', u'1/Paltani, S/Paltani, S.', u'0/Rots, A',
                    u'1/Rots, A/Rots, A.', u'0/Winkelman, S',
                    u'1/Winkelman, S/Winkelman, S.'
                ],
                u'author_norm':
                [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
                u'bibcode':
                u'2003ASPC..295..283B',
                u'bibgroup': [u'CXC', u'CfA'],
                u'bibgroup_facet': [u'CXC', u'CfA'],
                u'bibstem': [u'ASPC', u'ASPC..295'],
                u'bibstem_facet':
                u'ASPC',
                u'database': [u'astronomy'],
                u'date':
                u'2003-01-01T00:00:00.000000Z',
                u'doctype':
                u'inproceedings',
                u'doctype_facet_hier':
                [u'0/Article', u'1/Article/Proceedings Article'],
                u'editor': [u'Testeditor, Z.'],
                u'email': [u'-', u'-', u'-', u'-'],
                u'first_author':
                u'Blecksmith, E.',
                u'first_author_facet_hier':
                [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.'],
                u'first_author_norm':
                u'Blecksmith, E',
                u'id':
                u'1401492',
                u'identifier': [u'2003adass..12..283B'],
                u'links_data':
                u'',  ### TODO(rca): super confusing string, but fortunately we are getting rid of it
                u'orcid_pub': [u'-', u'-', u'-', u'-'],
                u'page': [u'283'],
                #u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'],
                u'pub':
                u'Astronomical Data Analysis Software and Systems XII',
                u'pub_raw':
                u'Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. Hook, eds., p.283',
                u'pubdate':
                u'2003-00-00',
                u'title':
                [u'Chandra Data Archive Download and Usage Database'],
                u'volume':
                u'295',
                u'year':
                u'2003'
            })
        self.app.update_storage(
            'bibcode', 'fulltext', {
                'body': 'texttext',
                'acknowledgements': 'aaa',
                'dataset': ['a', 'b', 'c'],
                'facility': ['fac1', 'fac2', 'fac3']
            })
        self.app.update_storage(
            'bibcode', 'metrics', {
                "downloads": [
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0,
                    1, 2
                ],
                "bibcode":
                "2003ASPC..295..361M",
                "reads": [
                    0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 2, 5, 1, 0, 0, 1, 0, 0, 2,
                    4, 5
                ],
                "author_num":
                2
            })
        self.app.update_storage(
            'bibcode', 'orcid_claims', {
                'authors':
                ['Blecksmith, E.', 'Paltani, S.', 'Rots, A.', 'Winkelman, S.'],
                'bibcode':
                '2003ASPC..295..283B',
                'unverified': ['-', '-', '0000-0003-2377-2356', '-']
            })
        self.app.update_storage(
            'bibcode', 'metrics', {
                u'citation_num':
                6,
                u'citations': [
                    u'2007ApPhL..91g1118P', u'2010ApPhA..99..805K',
                    u'2011TSF...520..610L', u'2012NatCo...3E1175B',
                    u'2014IPTL...26..305A', u'2016ITED...63..197G'
                ]
            })
        self.app.update_storage(
            'bibcode', 'nonbib_data', {
                u'authors': [
                    u'Zaus, E', u'Tedde, S', u'Fuerst, J', u'Henseler, D',
                    u'Doehler, G'
                ],
                u'bibcode':
                u'2007JAP...101d4501Z',
                u'boost':
                0.1899999976158142,
                u'data': [u'MAST:3', u'SIMBAD:1'],
                u'property': [
                    u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE',
                    u'NOT REFEREED'
                ],
                u'downloads': [
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
                    0, 0
                ],
                u'id':
                7862455,
                u'norm_cites':
                4225,
                u'reads': [
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 6, 2, 1, 0, 0, 1, 0, 1,
                    0, 0
                ],
                u'refereed':
                True,
                u'reference': [
                    u'1977JAP....48.4729M', u'1981psd..book.....S',
                    u'1981wi...book.....S', u'1986PhRvB..33.5545M',
                    u'1987ApPhL..51..913T', u'1992Sci...258.1474S',
                    u'1994IJMPB...8..237S', u'1995Natur.376..498H',
                    u'1995Sci...270.1789Y', u'1998TSF...331...76O',
                    u'1999Natur.397..121F', u'2000JaJAP..39...94P',
                    u'2002ApPhL..81.3885S', u'2004ApPhL..85.3890C',
                    u'2004TSF...451..105S', u'2005PhRvB..72s5208M',
                    u'2006ApPhL..89l3505L'
                ],
                u'simbad_objects': [u'2419335 sim', u'3111723 sim*'],
                u'ned_objects': [u'2419335 HII', u'3111723 ned*'],
                u'grants': [u'2419335 g', u'3111723 g*'],
                u'citation_count':
                6,
                u'citation_count_norm':
                .2,
            })
        rec = self.app.get_record('bibcode')
        x = solr_updater.transform_json_record(rec)
        # self.assertFalse('aff' in x, 'virtual field should not be in solr output')
        self.assertTrue('aff'
                        in x)  # temporarily populating both aff and aff_raw
        self.assertTrue(
            x['aff_raw'] == rec['bib_data']['aff'],
            'solr record should include aff from bib data when augment is not available'
        )
        self.assertFalse(
            'aff_abbrev' in x,
            'augment field should not be in solr record when augment is not available'
        )

        self.app.update_storage(
            'bibcode', 'augment', {
                u'aff': [u'augment pipeline aff', u'-', u'-', u'-'],
                u'aff_abbrev': [u'-', u'-', u'-', u'-'],
                u'aff_canonical': [u'-', u'-', u'-', u'-'],
                u'aff_facet': [u'-', u'-', u'-', u'-'],
                u'aff_facet_hier': [u'-', u'-', u'-', u'-'],
                u'aff_id': [u'-', u'-', u'-', u'-'],
                u'aff_raw': [u'augment pipeline aff', u'-', u'-', u'-'],
                u'institution': [u'-', u'-', u'-', u'-']
            })

        rec = self.app.get_record('bibcode')
        self.assertDictContainsSubset(
            {
                u'abstract':
                u'abstract text',
                u'ack':
                u'aaa',
                u'aff_abbrev': [u'-', u'-', u'-', u'-'],
                u'aff_canonical': [u'-', u'-', u'-', u'-'],
                u'aff_facet': [u'-', u'-', u'-', u'-'],
                u'aff_facet_hier': [u'-', u'-', u'-', u'-'],
                u'aff_id': [u'-', u'-', u'-', u'-'],
                u'institution': [u'-', u'-', u'-', u'-'],
                u'alternate_bibcode': [u'2003adass..12..283B'],
                u'author': [
                    u'Blecksmith, E.', u'Paltani, S.', u'Rots, A.',
                    u'Winkelman, S.'
                ],
                u'author_count':
                4,
                u'author_facet':
                [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
                u'author_facet_hier': [
                    u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.',
                    u'0/Paltani, S', u'1/Paltani, S/Paltani, S.', u'0/Rots, A',
                    u'1/Rots, A/Rots, A.', u'0/Winkelman, S',
                    u'1/Winkelman, S/Winkelman, S.'
                ],
                u'author_norm':
                [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
                'bibcode':
                u'2003ASPC..295..283B',
                u'bibgroup': [u'CXC', u'CfA'],
                u'bibgroup_facet': [u'CXC', u'CfA'],
                u'bibstem': [u'ASPC', u'ASPC..295'],
                u'bibstem_facet':
                u'ASPC',
                'body':
                u'texttext',
                'citation': [
                    u'2007ApPhL..91g1118P', u'2010ApPhA..99..805K',
                    u'2011TSF...520..610L', u'2012NatCo...3E1175B',
                    u'2014IPTL...26..305A', u'2016ITED...63..197G'
                ],
                'citation_count':
                6,
                'citation_count_norm':
                .2,
                'cite_read_boost':
                0.1899999976158142,
                u'data': [u'MAST:3', u'SIMBAD:1'],
                u'data_facet': [u'MAST', u'SIMBAD'],
                u'database': [u'astronomy'],
                #u'dataset': ['a', 'b', 'c'],
                u'date':
                u'2003-01-01T00:00:00.000000Z',
                u'doctype':
                u'inproceedings',
                u'doctype_facet_hier':
                [u'0/Article', u'1/Article/Proceedings Article'],
                u'editor': [u'Testeditor, Z.'],
                u'email': [u'-', u'-', u'-', u'-'],
                u'facility': ['fac1', 'fac2', 'fac3'],
                u'first_author':
                u'Blecksmith, E.',
                u'first_author_facet_hier':
                [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.'],
                u'first_author_norm':
                u'Blecksmith, E',
                u'id':
                1,  # from id in master database records table
                u'identifier': [u'2003adass..12..283B'],
                u'links_data':
                u'',
                'orcid_other': [u'-', u'-', u'0000-0003-2377-2356', u'-'],
                u'orcid_pub': [u'-', u'-', u'-', u'-'],
                u'nedid': [u'2419335', u'3111723'],
                u'nedtype': [u'HII Region', u'Other'],
                u'ned_object_facet_hier': [
                    u'0/HII Region', u'1/HII Region/2419335', u'0/Other',
                    u'1/Other/3111723'
                ],
                u'page': [u'283'],
                u'property': [
                    u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE',
                    u'NOT REFEREED'
                ],
                u'pub':
                u'Astronomical Data Analysis Software and Systems XII',
                u'pub_raw':
                u'Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. Hook, eds., p.283',
                u'pubdate':
                u'2003-00-00',
                u'read_count':
                0,
                'reference': [
                    u'1977JAP....48.4729M', u'1981psd..book.....S',
                    u'1981wi...book.....S', u'1986PhRvB..33.5545M',
                    u'1987ApPhL..51..913T', u'1992Sci...258.1474S',
                    u'1994IJMPB...8..237S', u'1995Natur.376..498H',
                    u'1995Sci...270.1789Y', u'1998TSF...331...76O',
                    u'1999Natur.397..121F', u'2000JaJAP..39...94P',
                    u'2002ApPhL..81.3885S', u'2004ApPhL..85.3890C',
                    u'2004TSF...451..105S', u'2005PhRvB..72s5208M',
                    u'2006ApPhL..89l3505L'
                ],
                u'simbid': ['2419335', '3111723'],
                u'simbtype': [u'Other', u'Star'],
                u'simbad_object_facet_hier':
                [u'0/Other', u'1/Other/2419335', u'0/Star', u'1/Star/3111723'],
                u'title':
                [u'Chandra Data Archive Download and Usage Database'],
                u'volume':
                u'295',
                u'year':
                u'2003'
            },
            solr_updater.transform_json_record(rec))

        for x in Records._date_fields:
            if x in rec:
                rec[x] = get_date('2017-09-19T21:17:12.026474+00:00')

        x = solr_updater.transform_json_record(rec)
        for f in ('metadata_mtime', 'fulltext_mtime', 'orcid_mtime',
                  'nonbib_mtime', 'metrics_mtime', 'update_timestamp'):
            self.assertEqual(x[f], '2017-09-19T21:17:12.026474Z')

        rec['orcid_claims_updated'] = get_date(
            '2017-09-20T21:17:12.026474+00:00')
        x = solr_updater.transform_json_record(rec)
        for f in ('metadata_mtime', 'fulltext_mtime', 'orcid_mtime',
                  'nonbib_mtime', 'metrics_mtime', 'update_timestamp'):
            if f == 'update_timestamp' or f == 'orcid_mtime':
                self.assertEqual(x[f], '2017-09-20T21:17:12.026474Z')
            else:
                self.assertEqual(x[f], '2017-09-19T21:17:12.026474Z')

        rec = self.app.get_record('bibcode')
        x = solr_updater.transform_json_record(rec)
        # self.assertFalse('aff' in x)  #  virtual field should not be in solr output
        self.assertTrue('aff' in x)  #  aff is no longer a virtual field
        self.assertEqual(
            x['aff_raw'], rec['augments']
            ['aff'])  # solr record should prioritize aff data from augment
        self.assertEqual(
            x['aff_abbrev'], rec['augments']
            ['aff_abbrev'])  # solr record should include augment data
Example #9
def task_process_myads(message):
    """
    Process the myADS notifications for a given user

    :param message: message payload, with the following structure:
        {
         'userid': adsws user ID,
         'frequency': 'daily' or 'weekly',
         'force': Boolean (if present, we'll reprocess myADS notifications for the user,
            even if they were already processed today)
         'test_send_to': email address to send output to, if not that of the user (for testing)
         'retries': number of retries attempted
        }
    :return: no return
    """

    if 'userid' not in message:
        logger.error('No user ID received for {0}'.format(message))
        return
    if 'frequency' not in message:
        logger.error('No frequency received for {0}'.format(message))
        return

    userid = message['userid']
    with app.session_scope() as session:
        try:
            q = session.query(AuthorInfo).filter_by(id=userid).one()
            last_sent = q.last_sent
        except ormexc.NoResultFound:
            author = AuthorInfo(id=userid,
                                created=adsputils.get_date(),
                                last_sent=None)
            session.add(author)
            session.flush()
            last_sent = author.last_sent
            session.commit()
        if message['frequency'] == 'daily' and last_sent and last_sent.date(
        ) == adsputils.get_date().date():
            # already sent email today
            if not message.get('force'):
                logger.warning(
                    'Email for user {0} already sent today'.format(userid))
                return
            else:
                logger.info(
                    'Email for user {0} already sent today, but force mode is on'
                    .format(userid))

    # first fetch the myADS setup from /vault/get-myads
    r = app.client.get(app.conf.get('API_VAULT_MYADS_SETUP') % userid,
                       headers={
                           'Accept':
                           'application/json',
                           'Authorization':
                           'Bearer {0}'.format(app.conf.get('API_TOKEN'))
                       })

    if r.status_code != 200:
        if message.get('retries', None):
            retries = message['retries']
        else:
            retries = 0
        if retries < app.conf.get('TOTAL_RETRIES', 3):
            message['retries'] = retries + 1
            task_process_myads.apply_async(args=(message, ),
                                           countdown=app.conf.get(
                                               'MYADS_RESEND_WINDOW', 3600))
            logger.warning(
                'Failed getting myADS setup for {0}; will try again later. Retry {1}'
                .format(userid, retries))
            return
        else:
            logger.warning(
                'Maximum number of retries attempted for {0}. myADS processing failed.'
                .format(userid))
            return

    if message.get('test_bibcode', None):
        # check that the solr searcher we're getting is still ok by querying for the test bibcode
        q = app.client.get(
            '{0}?q=identifier:{1}&fl=bibcode,identifier,entry_date'.format(
                app.conf.get('API_SOLR_QUERY_ENDPOINT'),
                message.get('test_bibcode')),
            headers={'Authorization': 'Bearer ' + app.conf.get('API_TOKEN')})

        fail = True
        if q.status_code != 200:
            logger.warning(
                'Error retrieving the test bibcode {0} from solr while processing for user {1}. Retrying'
                .format(message.get('test_bibcode'), userid))
        elif q.json()['response']['numFound'] == 0:
            logger.warning(
                'Test bibcode {0} not found in solr while processing for user {1}. Retrying'
                .format(message.get('test_bibcode'), userid))
        else:
            fail = False

        if fail:
            if message.get('solr_retries', None):
                retries = message['solr_retries']
            else:
                retries = 0
            if retries < app.conf.get('TOTAL_RETRIES', 3):
                message['solr_retries'] = retries + 1
                task_process_myads.apply_async(args=(message, ),
                                               countdown=app.conf.get(
                                                   'MYADS_SOLR_RESEND_WINDOW',
                                                   3600))
                logger.warning(
                    'Solr error occurred while processing myADS email for user {0}; rerunning. Retry {1}'
                    .format(userid, retries))
                return
            else:
                logger.warning(
                    'Maximum number of retries attempted for {0}. myADS processing failed: '
                    'solr searchers were not updated.'.format(userid))
                return

    # then execute each qid /vault/execute-query/qid
    setup = r.json()
    payload = []
    for s in setup:
        if s['frequency'] == message['frequency']:
            # only return 5 results, unless it's the daily arXiv posting, then return max
            # TODO should all stateful queries return all results or will this be overwhelming for some? well-cited
            # users can get 40+ new cites in one weekly astro update
            if s['frequency'] == 'daily':
                s['rows'] = app.conf.get('MAX_NUM_ROWS_DAILY', 2000)
            else:
                s['rows'] = app.conf.get('MAX_NUM_ROWS_WEEKLY', 5)
            s['fields'] = 'bibcode,title,author_norm,identifier,year,bibstem'
            if s['type'] == 'query':
                qtype = 'general'
            elif s['type'] == 'template':
                qtype = s['template']
            else:
                logger.warning(
                    'Wrong query type passed for query {0}, user {1}'.format(
                        s, userid))
                continue

            try:
                raw_results = utils.get_template_query_results(s)
            except RuntimeError:
                if message.get('query_retries', None):
                    retries = message['query_retries']
                else:
                    retries = 0
                if retries < app.conf.get('TOTAL_RETRIES', 3):
                    message['query_retries'] = retries + 1
                    logger.warning(
                        'Error getting template query results for user {0}; retrying. '
                        'Retry {1}'.format(userid, retries))
                    task_process_myads.apply_async(args=(message, ),
                                                   countdown=app.conf.get(
                                                       'MYADS_RESEND_WINDOW',
                                                       3600))
                    return
                else:
                    logger.warning(
                        'Maximum number of query retries attempted for user {0}; myADS processing '
                        'failed due to query result retrieval failures.'.format(userid))
                    return

            for r in raw_results:
                # for stateful queries, remove previously seen results, store new results
                if s['stateful']:
                    docs = r['results']
                    bibcodes = [doc['bibcode'] for doc in docs]
                    if s.get('qid', None):
                        good_bibc = app.get_recent_results(
                            user_id=userid,
                            qid=s['qid'],
                            input_results=bibcodes,
                            ndays=app.conf.get('STATEFUL_RESULTS_DAYS', 7))
                    else:
                        good_bibc = app.get_recent_results(
                            user_id=userid,
                            setup_id=s['id'],
                            input_results=bibcodes,
                            ndays=app.conf.get('STATEFUL_RESULTS_DAYS', 7))
                    results = [
                        doc for doc in docs if doc['bibcode'] in good_bibc
                    ]
                else:
                    results = r['results']

                payload.append({
                    'name': r['name'],
                    'query_url': r['query_url'],
                    'results': results,
                    'query': r['query'],
                    'qtype': qtype,
                    'id': s['id']
                })
        else:
            # wrong frequency for this round of processing
            continue

    if len(payload) == 0:
        logger.info(
            'No payload for user {0} for the {1} email. No email was sent.'.
            format(userid, message['frequency']))
        return

    # if test email address provided, send there; otherwise fetch user email address
    if message.get('test_send_to', None):
        email = message.get('test_send_to')
    else:
        email = utils.get_user_email(userid=userid)

    if message['frequency'] == 'daily':
        subject = 'Daily myADS Notification'
    else:
        subject = 'Weekly myADS Notification'

    payload_plain = utils.payload_to_plain(payload)
    if len(payload) < app.conf.get('NUM_QUERIES_TWO_COL', 3):
        payload_html = utils.payload_to_html(payload,
                                             col=1,
                                             frequency=message['frequency'],
                                             email_address=email)
    else:
        payload_html = utils.payload_to_html(payload,
                                             col=2,
                                             frequency=message['frequency'],
                                             email_address=email)
    msg = utils.send_email(email_addr=email,
                           email_template=myADSTemplate,
                           payload_plain=payload_plain,
                           payload_html=payload_html,
                           subject=subject)

    if msg:
        # update author table w/ last sent datetime
        with app.session_scope() as session:
            q = session.query(AuthorInfo).filter_by(id=userid).one()
            # should we set "last_sent" for both weekly and daily queries?
            q.last_sent = adsputils.get_date()

            session.commit()

    else:
        if message.get('send_retries', None):
            retries = message['send_retries']
        else:
            retries = 0
        if retries < app.conf.get('TOTAL_RETRIES', 3):
            message['send_retries'] = retries + 1
            task_process_myads.apply_async(args=(message, ),
                                           countdown=app.conf.get(
                                               'MYADS_RESEND_WINDOW', 3600))
            logger.warning(
                'Error sending myADS email for user {0}, email {1}; rerunning. Retry {2}'
                .format(userid, email, retries))
            return
        else:
            logger.warning(
                'Maximum number of retries attempted for {0}. myADS processing failed at sending the email.'
                .format(userid))
            return
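
Each stage of the task above (fetching the myADS setup, checking the solr searcher, running the template queries, sending the email) repeats the same retry-with-countdown pattern: read a per-stage counter from the message, re-queue the task via `apply_async(countdown=...)`, and give up once `TOTAL_RETRIES` is exhausted. Below is a minimal sketch of that pattern as a standalone helper; the name `schedule_retry` and the default values are illustrative and not part of the original module.

def schedule_retry(task, message, retry_key, max_retries=3, countdown=3600):
    """Re-queue `task` with an incremented per-stage retry counter.

    Returns True if a retry was scheduled, False once the retry budget is spent.
    """
    # the counter lives in the message so it survives re-queueing
    retries = message.get(retry_key, 0)
    if retries >= max_retries:
        return False
    message[retry_key] = retries + 1
    task.apply_async(args=(message,), countdown=countdown)
    return True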
Exemplo n.º 10
0
    def test_get_claims(self):
        """Check the correct logic for discovering difference in the orcid profile."""

        orcidid = '0000-0003-3041-2092'
        httpretty.register_uri(
            httpretty.POST,
            self.app.conf['API_ORCID_UPDATE_BIB_STATUS'] % orcidid,
            content_type='application/json',
            status=200,
            body=json.dumps({'2020..............A': 'verified'}))

        def side_effect(x, search_identifiers=False):
            if len(x) == 19:
                return {'bibcode': x}
            else:
                return None
        with mock.patch.object(self.app, 'retrieve_orcid',
                return_value={'status': None, 'updated': None, 'name': None, 'created': '2009-09-03T20:56:35.450686+00:00',
                              'facts': {}, 'orcidid': orcidid, 'id': 1, 'account_id': None} ) as harvest_author_info, \
            mock.patch.object(self.app, '_get_ads_orcid_profile',
                return_value=json.loads(open(os.path.join(self.app.conf['TEST_DIR'], 'stub_data', orcidid + '.ads.json')).read())) as _, \
            mock.patch.object(self.app, 'retrieve_metadata', side_effect=side_effect) as retrieve_metadata:

            orcid_present, updated, removed = self.app.get_claims(
                orcidid,
                self.app.conf.get('API_TOKEN'),
                self.app.conf.get('API_ORCID_EXPORT_PROFILE') % orcidid,
                force=False,
                orcid_identifiers_order=self.app.conf.get(
                    'ORCID_IDENTIFIERS_ORDER', {
                        'bibcode': 9,
                        '*': -1
                    }))
            assert len(orcid_present) == 9 and len(updated) == 0 and len(
                removed) == 0

            # pretend that we have already run the import
            cdate = utils.get_date(
                '2017-07-18 14:46:09.879000+00:00'
            )  # this is the latest moddate from the orcid profile
            self.app.insert_claims([
                self.app.create_claim(bibcode='',
                                      orcidid=orcidid,
                                      provenance='OrcidImporter',
                                      status='#full-import',
                                      date=cdate)
            ])

            # it should ignore the next call
            orcid_present, updated, removed = self.app.get_claims(
                orcidid,
                self.app.conf.get('API_TOKEN'),
                self.app.conf.get('API_ORCID_EXPORT_PROFILE') % orcidid,
                force=False,
                orcid_identifiers_order=self.app.conf.get(
                    'ORCID_IDENTIFIERS_ORDER', {
                        'bibcode': 9,
                        '*': -1
                    }))
            assert len(orcid_present) == 0 and len(updated) == 0 and len(
                removed) == 0

            # but if we force it, it must not ignore us...
            orcid_present, updated, removed = self.app.get_claims(
                orcidid,
                self.app.conf.get('API_TOKEN'),
                self.app.conf.get('API_ORCID_EXPORT_PROFILE') % orcidid,
                force=True,
                orcid_identifiers_order=self.app.conf.get(
                    'ORCID_IDENTIFIERS_ORDER', {
                        'bibcode': 9,
                        '*': -1
                    }))
            #print len(orcid_present), len(updated), len(removed)
            assert len(orcid_present) == 9 and len(updated) == 0 and len(
                removed) == 0

        # test backwards compatibility in get_claims with old ORCID API
        with mock.patch.object(self.app, 'retrieve_orcid',
                return_value={'status': None, 'updated': None, 'name': None, 'created': '2009-09-03T20:56:35.450686+00:00',
                              'facts': {}, 'orcidid': orcidid, 'id': 1, 'account_id': None} ) as harvest_author_info, \
            mock.patch.object(self.app, '_get_ads_orcid_profile',
                return_value=json.loads(open(os.path.join(self.app.conf['TEST_DIR'], 'stub_data', orcidid + '.ads_1.2.json')).read())) as _, \
            mock.patch.object(self.app, 'retrieve_metadata', side_effect=side_effect) as retrieve_metadata:

            orcid_present, updated, removed = self.app.get_claims(
                orcidid,
                self.app.conf.get('API_TOKEN'),
                self.app.conf.get('API_ORCID_EXPORT_PROFILE') % orcidid,
                force=False,
                orcid_identifiers_order=self.app.conf.get(
                    'ORCID_IDENTIFIERS_ORDER', {
                        'bibcode': 9,
                        '*': -1
                    }))
            assert len(orcid_present) == 7 and len(updated) == 0 and len(
                removed) == 0

            # pretend that we have already run the import
            cdate = utils.get_date(
                '2015-11-05 16:37:33.381000+00:00'
            )  # this is the latest moddate from the orcid profile
            self.app.insert_claims([
                self.app.create_claim(bibcode='',
                                      orcidid=orcidid,
                                      provenance='OrcidImporter',
                                      status='#full-import',
                                      date=cdate)
            ])

            # it should ignore the next call
            orcid_present, updated, removed = self.app.get_claims(
                orcidid,
                self.app.conf.get('API_TOKEN'),
                self.app.conf.get('API_ORCID_EXPORT_PROFILE') % orcidid,
                force=False,
                orcid_identifiers_order=self.app.conf.get(
                    'ORCID_IDENTIFIERS_ORDER', {
                        'bibcode': 9,
                        '*': -1
                    }))
            assert len(orcid_present) == 0 and len(updated) == 0 and len(
                removed) == 0

            # but if we force it, it must not ignore us...
            orcid_present, updated, removed = self.app.get_claims(
                orcidid,
                self.app.conf.get('API_TOKEN'),
                self.app.conf.get('API_ORCID_EXPORT_PROFILE') % orcidid,
                force=True,
                orcid_identifiers_order=self.app.conf.get(
                    'ORCID_IDENTIFIERS_ORDER', {
                        'bibcode': 9,
                        '*': -1
                    }))
            # print len(orcid_present), len(updated), len(removed)
            assert len(orcid_present) == 7 and len(updated) == 0 and len(
                removed) == 0
Exemplo n.º 11
0
def reindex(since=None,
            batch_size=None,
            force_indexing=False,
            update_solr=True,
            update_metrics=True,
            update_links=True,
            force_processing=False,
            ignore_checksums=False):
    """
    Initiates routing of all records that were updated
    since a given point in time T.
    """
    if force_indexing:
        key = 'last.reindex.forced'
    else:
        key = 'last.reindex.normal'

    if update_solr and update_metrics:
        pass  # default
    elif update_solr:
        key = key + '.solr-only'
    else:
        key = key + '.metrics-only'

    previous_since = None
    now = get_date()
    if since is None:
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key=key).first()
            if kv is None:
                since = get_date('1972')
                kv = KeyValue(key=key, value=now.isoformat())
                session.add(kv)
            else:
                since = get_date(kv.value)
                previous_since = since
                kv.value = now.isoformat()
            session.commit()
    else:
        since = get_date(since)

    logger.info('Sending records changed since: %s', since.isoformat())
    sent = 0
    last_bibcode = None
    year_zero = adsputils.get_date('1972')

    try:
        # select everything that was updated since
        batch = []
        with app.session_scope() as session:
            for rec in session.query(Records) \
                .filter(Records.updated >= since) \
                .options(load_only(Records.bibcode, Records.updated, Records.processed)) \
                .yield_per(100):

                if rec.processed is None:
                    processed = year_zero
                else:
                    processed = get_date(rec.processed)
                updated = get_date(rec.updated)

                if not force_processing and processed > updated:
                    continue  # skip records that were already processed

                sent += 1
                if sent % 1000 == 0:
                    logger.debug('Sending %s records', sent)

                if not batch_size or batch_size < 0:
                    batch.append(rec.bibcode)
                elif batch_size > len(batch):
                    batch.append(rec.bibcode)
                else:
                    batch.append(rec.bibcode)
                    tasks.task_index_records.delay(
                        batch,
                        force=force_indexing,
                        update_solr=update_solr,
                        update_metrics=update_metrics,
                        update_links=update_links,
                        ignore_checksums=ignore_checksums)
                    batch = []
                    last_bibcode = rec.bibcode

        if len(batch) > 0:
            tasks.task_index_records.delay(batch,
                                           force=force_indexing,
                                           update_solr=update_solr,
                                           update_metrics=update_metrics,
                                           commit=force_indexing,
                                           ignore_checksums=ignore_checksums)
        elif force_indexing and last_bibcode:
            # issue one extra call with the commit
            tasks.task_index_records.delay([last_bibcode],
                                           force=force_indexing,
                                           update_solr=update_solr,
                                           update_metrics=update_metrics,
                                           commit=force_indexing,
                                           ignore_checksums=ignore_checksums)

        logger.info('Done processing %s records', sent)
    except Exception as e:
        if previous_since:
            logger.error(
                'Failed while submitting data to pipeline, resetting timestamp back to: %s',
                previous_since)
            with app.session_scope() as session:
                kv = session.query(KeyValue).filter_by(key=key).first()
                kv.value = previous_since.isoformat()
                session.commit()
        else:
            logger.error('Failed while submitting data to pipeline')
        raise e
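
For reference, a hedged usage sketch of `reindex`: called with `since=None` it resumes from the KeyValue checkpoint stored under `last.reindex.*`, while an explicit timestamp overrides that checkpoint. The invocation below is illustrative only; only the keyword arguments come from the signature above.

# illustrative invocation, e.g. from a maintenance script
reindex(since='2020-01-01T00:00:00Z',  # or None to resume from the stored checkpoint
        batch_size=1000,               # send bibcodes to task_index_records in chunks
        force_indexing=False,
        update_solr=True,
        update_metrics=True,
        update_links=True,
        ignore_checksums=False)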
Exemplo n.º 12
0
 def toJSON(self):
     return {'id': self.id, 'orcidid': self.orcidid,
             'bibcode': self.bibcode, 'status': self.status,
             'provenance': unicode(self.provenance), 'created': self.created and get_date(self.created).isoformat() or None
             }
Exemplo n.º 13
0
 def toJSON(self):
     return {'id': self.id, 'orcidid': self.orcidid,
             'name': self.name, 'facts': self.facts and json.loads(self.facts) or {},
             'status': self.status, 'account_id': self.account_id,
             'created': self.created and get_date(self.created).isoformat() or None, 'updated': self.updated and get_date(self.updated).isoformat() or None
             }
Exemplo n.º 14
0
def task_index_orcid_profile(message):
    """
    Fetch a fresh profile from the orcid-service, compare it
    against the state of the storage (diff), and re-index/update
    the affected records.

    :param message: contains the message inside the packet
        {
         'orcidid': '.....',
         'start': 'ISO 8601 formatted date (optional); indicates
             the moment we checked the orcid-service',
         'force': Boolean (if present, we will not skip an unchanged
             profile)
        }
    :return: no return
    """

    if 'orcidid' not in message:
        raise IgnorableException('Received garbage: {}'.format(message))

    message['start'] = adsputils.get_date()
    orcidid = message['orcidid']
    author = app.retrieve_orcid(orcidid)

    # update profile table in microservice
    r = requests.get(app.conf.get('API_ORCID_UPDATE_PROFILE') % orcidid,
                     headers={
                         'Accept':
                         'application/json',
                         'Authorization':
                         'Bearer {0}'.format(app.conf.get('API_TOKEN'))
                     })
    if r.status_code != 200:
        logger.warning('Profile for {0} not updated.'.format(orcidid))

    orcid_present, updated, removed = app.get_claims(
        orcidid,
        app.conf.get('API_TOKEN'),
        app.conf.get('API_ORCID_EXPORT_PROFILE') % orcidid,
        force=message.get('force', False),
        orcid_identifiers_order=app.conf.get('ORCID_IDENTIFIERS_ORDER', {
            'bibcode': 9,
            '*': -1
        }))

    to_claim = []

    #always insert a record that marks the beginning of a full-import
    #TODO: record orcid's last-modified-date
    to_claim.append(
        app.create_claim(bibcode='',
                         orcidid=orcidid,
                         provenance='OrcidImporter',
                         status='#full-import',
                         date=adsputils.get_date()))

    # find difference between what we have and what orcid has
    claims_we_have = set(updated.keys()).difference(set(removed.keys()))
    claims_orcid_has = set(orcid_present.keys())

    # those guys will be added (with ORCID date signature)
    for c in claims_orcid_has.difference(claims_we_have):
        claim = orcid_present[c]
        to_claim.append(
            app.create_claim(bibcode=claim[0],
                             orcidid=orcidid,
                             provenance=claim[2],
                             status='claimed',
                             date=claim[1]))

    # those guys will be removed (since orcid doesn't have them)
    for c in claims_we_have.difference(claims_orcid_has):
        claim = updated[c]
        to_claim.append(
            app.create_claim(bibcode=claim[0],
                             orcidid=orcidid,
                             provenance='OrcidImporter',
                             status='removed'))

    # and those guys will be updated if their creation date is significantly off
    for c in claims_orcid_has.intersection(claims_we_have):

        orcid_claim = orcid_present[c]
        ads_claim = updated[c]

        delta = orcid_claim[1] - ads_claim[1]
        if delta.total_seconds() > app.conf.get('ORCID_UPDATE_WINDOW', 60):
            to_claim.append(
                app.create_claim(bibcode=orcid_claim[0],
                                 orcidid=orcidid,
                                 provenance='OrcidImporter',
                                 status='updated',
                                 date=orcid_claim[1]))
        elif message.get('force', False):
            to_claim.append(
                app.create_claim(bibcode=orcid_claim[0],
                                 orcidid=orcidid,
                                 provenance='OrcidImporter',
                                 status='forced',
                                 date=orcid_claim[1]))
        else:
            to_claim.append(
                app.create_claim(bibcode=orcid_claim[0],
                                 orcidid=orcidid,
                                 provenance='OrcidImporter',
                                 status='unchanged',
                                 date=orcid_claim[1]))

    if len(to_claim):
        # create record in the database
        json_claims = app.insert_claims(to_claim)
        if author['status'] in ('blacklisted', 'postponed'):
            return
        # set to the queue for processing
        for claim in json_claims:
            if claim.get('bibcode'):
                claim['bibcode_verified'] = True
                claim['name'] = author['name']
                if author.get('facts', None):
                    for k, v in author['facts'].iteritems():
                        claim[k] = v

                claim['author_status'] = author['status']
                claim['account_id'] = author['account_id']
                claim['author_updated'] = author['updated']
                claim['author_id'] = author['id']

                if claim.get('status') != 'removed':
                    claim['identifiers'] = orcid_present[claim.get(
                        'bibcode').lower().strip()][3]
                    claim['author_list'] = orcid_present[claim.get(
                        'bibcode').lower().strip()][4]

                task_match_claim.delay(claim)
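
For reference, a hedged sketch of how the task above is typically queued, using only the message keys documented in its docstring (the ORCID iD shown is the test value used earlier in this document; 'start' is filled in by the task itself).

task_index_orcid_profile.delay({
    'orcidid': '0000-0003-3041-2092',
    'force': True,   # re-process even if the profile looks unchanged
})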
Exemplo n.º 15
0
def task_check_orcid_updates(msg):
    """Check the orcid microservice for updated orcid profiles.
    
    This function is somewhat complex because it has to defend
    against multiple executions (assuming that there are many
    workers and each of them can receive its own signal to start
    processing).

    Basically, we only want to check for updated profiles once.
    The synchronization is done via the database: the worker
    updates the 'last.check' timestamp immediately (and we
    'optimistically' hope that this is enough to prevent clashes;
    even if that is not a strong guarantee, it would not be
    a tragedy if a profile were checked twice...).

    An additional difficulty is time synchronization: the worker can
    be executed as often as you like, but it will refuse to do any
    work unless the time window between checks is large enough.
    """

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.check').first()
        if kv is None:
            kv = KeyValue(key='last.check',
                          value='1974-11-09T22:56:52.518001Z')  #force update

        latest_point = adsputils.get_date(kv.value)  # RFC 3339 format
        now = adsputils.get_date()

        total_wait = app.conf.get('ORCID_CHECK_FOR_CHANGES',
                                  60 * 5)  #default is 5min
        delta = now - latest_point

        if delta.total_seconds() < total_wait:
            # register our own execution in the future
            task_check_orcid_updates.apply_async(
                args=(msg, ),
                countdown=(total_wait - delta.total_seconds()) + 1)
        else:
            logger.info("Checking for orcid updates")

            # increase the timestamp by one microsec and get new updates
            latest_point = latest_point + datetime.timedelta(microseconds=1)
            r = requests.get(app.conf.get('API_ORCID_UPDATES_ENDPOINT') %
                             latest_point.isoformat(),
                             params={
                                 'fields': ['orcid_id', 'updated', 'created']
                             },
                             headers={
                                 'Authorization':
                                 'Bearer {0}'.format(app.conf.get('API_TOKEN'))
                             })

            if r.status_code != 200:
                logger.error('Failed getting {0}\n{1}'.format(
                    app.conf.get('API_ORCID_UPDATES_ENDPOINT') % kv.value,
                    r.text))
                msg['errcount'] = msg.get('errcount', 0) + 1

                # schedule future execution offset by number of errors (rca: do exponential?)
                task_check_orcid_updates.apply_async(
                    args=(msg, ),
                    countdown=total_wait + total_wait * msg['errcount'])
                return

            if r.text.strip() == "":
                return task_check_orcid_updates.apply_async(
                    args=(msg, ), countdown=total_wait)

            data = r.json()

            if len(data) == 0:
                return task_check_orcid_updates.apply_async(
                    args=(msg, ), countdown=total_wait)

            msg['errcount'] = 0  # success, we got data from the api, reset the counter

            # we received the data; immediately update the database (so that other processes don't
            # ask for the same starting date)
            # data should be ordered by date updated (but to be sure, let's check it); we'll save it
            # as latest 'check point'
            dates = [adsputils.get_date(x['updated']) for x in data]
            dates = sorted(dates, reverse=True)

            kv.value = dates[0].isoformat()
            session.merge(kv)
            session.commit()

            for rec in data:
                payload = {
                    'orcidid': rec['orcid_id'],
                    'start': latest_point.isoformat()
                }
                task_index_orcid_profile.delay(payload)

            # recheck again
            task_check_orcid_updates.apply_async(args=(msg, ),
                                                 countdown=total_wait)
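
The synchronization described in the docstring reduces to a simple time-window guard around the 'last.check' timestamp. Below is a standalone sketch of that guard, assuming timezone-aware datetimes as returned by adsputils.get_date; the helper name is illustrative, not part of the original module.

def should_check_now(last_check, now, total_wait_seconds):
    """Return (ok, countdown): ok is True once the wait window has elapsed,
    otherwise countdown is how many seconds to postpone the next attempt."""
    elapsed = (now - last_check).total_seconds()
    if elapsed < total_wait_seconds:
        return False, (total_wait_seconds - elapsed) + 1
    return True, 0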
Exemplo n.º 16
0
 def process_bind_param(self, value, engine):
     if isinstance(value, basestring):
         return get_date(value).astimezone(tzutc())
     elif value is not None:
         return value.astimezone(tzutc())  # will raise an error if value is not a datetime
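
The method above reads like part of a SQLAlchemy TypeDecorator that normalizes values to UTC before they are written to the database. Below is a hedged reconstruction of such a type; the class name `UTCDateTime` and the imports are assumptions based on how the method is written, not necessarily the original implementation.

from dateutil.tz import tzutc
from sqlalchemy.types import DateTime, TypeDecorator
from adsputils import get_date  # assumed source of get_date, as used throughout these examples

class UTCDateTime(TypeDecorator):
    impl = DateTime

    def process_bind_param(self, value, engine):
        # accept ISO strings or datetime objects; always store as UTC
        if isinstance(value, str):  # basestring in the Python 2 original
            return get_date(value).astimezone(tzutc())
        elif value is not None:
            return value.astimezone(tzutc())  # raises if value is not a datetime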
Exemplo n.º 17
0
def repush_claims(since=None, orcid_ids=None, **kwargs):
    """
    Re-pushes all recs that were added since date 'X'
    to the output (i.e. forwards them onto the Solr queue)

    :param since: RFC 3339 (ISO 8601) formatted date string
    :type since: str

    :return: no return
    """
    if orcid_ids:
        for oid in orcid_ids:
            tasks.task_index_orcid_profile.delay({
                'orcidid': oid,
                'force': False
            })
        if not since:
            print 'Done (just the supplied orcidids)'
            return

    logging.captureWarnings(True)
    if not since or isinstance(since, basestring) and since.strip() == "":
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.repush').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z'

    from_date = get_date(since)
    orcidids = set()

    logger.info('Re-pushing records since: {0}'.format(from_date.isoformat()))

    num_bibcodes = 0
    with app.session_scope() as session:
        for rec in session.query(Records) \
            .filter(Records.updated >= from_date) \
            .order_by(Records.updated.asc()) \
            .all():

            data = rec.toJSON()
            try:
                tasks.task_output_results.delay({
                    'bibcode': data['bibcode'],
                    'authors': data['authors'],
                    'claims': data['claims']
                })
            except:  # potential backpressure (we are too fast)
                time.sleep(2)
                print 'Conn problem, retrying ', data['bibcode']
                tasks.task_output_results.delay({
                    'bibcode': data['bibcode'],
                    'authors': data['authors'],
                    'claims': data['claims']
                })
            num_bibcodes += 1

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.repush').first()
        if kv is None:
            kv = KeyValue(key='last.repush', value=get_date())
            session.add(kv)
        else:
            kv.value = get_date()
        session.commit()

    logger.info('Done re-pushing {0} records.'.format(num_bibcodes))
Exemplo n.º 18
0
def reindex_claims(since=None, orcid_ids=None, **kwargs):
    """
    Re-runs all claims, both from the pipeline and
    from the orcid-service storage.

    :param since: RFC 3339 (ISO 8601) formatted date string
    :type since: str

    :return: no return
    """
    if orcid_ids:
        for oid in orcid_ids:
            tasks.task_index_orcid_profile.delay({
                'orcidid': oid,
                'force': True
            })
        if not since:
            print 'Done (just the supplied orcidids)'
            return

    logging.captureWarnings(True)
    if not since or isinstance(since, basestring) and since.strip() == "":
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.reindex').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1974-11-09T22:56:52.518001Z'

    from_date = get_date(since)
    orcidids = set()

    logger.info('Loading records since: {0}'.format(from_date.isoformat()))

    # first re-check our own database (replay the logs)
    with app.session_scope() as session:
        for author in session.query(
                AuthorInfo.orcidid.distinct().label('orcidid')).all():
            orcidid = author.orcidid
            if orcidid and orcidid.strip() != "":
                try:
                    changed = updater.reindex_all_claims(
                        app,
                        orcidid,
                        since=from_date.isoformat(),
                        ignore_errors=True)
                    if len(changed):
                        orcidids.add(orcidid)
                    tasks.task_index_orcid_profile.delay({
                        'orcidid': orcidid,
                        'force': True
                    })
                except:
                    print 'Error processing: {0}'.format(orcidid)
                    traceback.print_exc()
                    continue
                if len(orcidids) % 100 == 0:
                    print 'Done replaying {0} profiles'.format(len(orcidids))

    print 'Now harvesting orcid profiles...'

    # then get all new/old orcidids from orcid-service
    all_orcids = set(
        updater.get_all_touched_profiles(app, from_date.isoformat()))
    orcidids = all_orcids.difference(orcidids)
    from_date = get_date()

    for orcidid in orcidids:
        try:
            tasks.task_index_orcid_profile.delay({
                'orcidid': orcidid,
                'force': True
            })
        except:  # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', orcidid
            tasks.task_index_orcid_profile.delay({
                'orcidid': orcidid,
                'force': True
            })

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.reindex').first()
        if kv is None:
            kv = KeyValue(key='last.reindex', value=from_date.isoformat())
            session.add(kv)
        else:
            kv.value = from_date.isoformat()
        session.commit()

    print 'Done'
    logger.info('Done submitting {0} orcid ids.'.format(len(orcidids)))
Exemplo n.º 19
0
def _entry_date(ADS_record):
    d = ADS_record.get('entry_date', None)
    return {'entry_date': date2solrstamp(d and get_date(d) or get_date())}
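
An illustrative call of the fallback above; `date2solrstamp` and `get_date` are assumed to come from the module this snippet belongs to, so the exact output format is not shown.

_entry_date({'entry_date': '2021-01-05T12:00:00Z'})  # -> stamps the supplied date
_entry_date({})                                      # -> falls back to "now"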
Exemplo n.º 20
0
def task_index_records(bibcodes,
                       force=False,
                       update_solr=True,
                       update_metrics=True,
                       update_links=True,
                       commit=False,
                       ignore_checksums=False):
    """
    This task is (normally) called by the cronjob task
    (that one, quite obviously, is in turn started by cron)
    
    Receives the bibcode of a document that was updated.
    (note: we could have sent the full record; however, we don't
    do that because the messages might be delayed and multiple
    workers can be updating the same record, so we want to
    look into the database and get the most recent version)


    Receives bibcodes and checks the database if we have all the
    necessary pieces to push to solr. If not, then postpone and
    push later.

    We consider a record to be 'ready' if those pieces were updated
    (and were updated later than the last 'processed' timestamp):

        - bib_data
        - nonbib_data
        - orcid_claims

    'fulltext' is not considered essential, but updates to fulltext will
    trigger a solr update (so it might happen that a document gets
    indexed twice: first with only metadata and later including fulltext)

    """

    if isinstance(bibcodes, basestring):
        bibcodes = [bibcodes]

    if not (update_solr or update_metrics or update_links):
        raise Exception('Refusing to do nothing: at least one of update_solr, update_metrics or update_links must be True')

    logger.debug('Running index-records for: %s', bibcodes)
    batch = []
    batch_insert = []
    batch_update = []
    links_data = []
    links_url = app.conf.get('LINKS_RESOLVER_UPDATE_URL')

    #check if we have complete record
    for bibcode in bibcodes:
        r = app.get_record(bibcode)

        if r is None:
            logger.error('The bibcode %s doesn\'t exist!', bibcode)
            continue

        bib_data_updated = r.get('bib_data_updated', None)
        orcid_claims_updated = r.get('orcid_claims_updated', None)
        nonbib_data_updated = r.get('nonbib_data_updated', None)
        fulltext_updated = r.get('fulltext_updated', None)
        metrics_updated = r.get('metrics_updated', None)

        year_zero = '1972'
        processed = r.get('processed', adsputils.get_date(year_zero))
        if processed is None:
            processed = adsputils.get_date(year_zero)

        is_complete = all(
            [bib_data_updated, orcid_claims_updated, nonbib_data_updated])

        if is_complete or (force is True and bib_data_updated):

            if force is False and all([
                    bib_data_updated and bib_data_updated < processed,
                    orcid_claims_updated and orcid_claims_updated < processed,
                    nonbib_data_updated and nonbib_data_updated < processed
            ]):
                logger.debug(
                    'Nothing to do for %s, it was already indexed/processed',
                    bibcode)
                continue

            if force:
                logger.debug('Forced indexing of: %s (metadata=%s, orcid=%s, nonbib=%s, fulltext=%s, metrics=%s)' % \
                            (bibcode, bib_data_updated, orcid_claims_updated, nonbib_data_updated, fulltext_updated, \
                             metrics_updated))

            # build the solr record
            if update_solr:
                d = solr_updater.transform_json_record(r)
                logger.debug('Built SOLR: %s', d)
                if ignore_checksums or r.get('solr_checksum',
                                             None) != app.checksum(d):
                    batch.append(d)
                else:
                    logger.info(
                        'Checksum identical, skipping solr update for: %s',
                        bibcode)

            # get data for metrics
            if update_metrics:
                m = r.get('metrics', None)
                if (m and ignore_checksums) or (m and r.get(
                        'metrics_checksum', None) != app.checksum(m)):
                    m['bibcode'] = bibcode
                    logger.debug('Got metrics: %s', m)
                    if r.get('processed'):
                        batch_update.append(m)
                    else:
                        batch_insert.append(m)
                else:
                    logger.info(
                        'Checksum identical, skipping metrics update for: %s',
                        bibcode)

            if update_links and links_url:
                links = app.generate_links_for_resolver(r)
                if links:
                    checksum = app.checksum(links)
                    if ignore_checksums or r.get('links_checksum',
                                                 None) != checksum:
                        links_data.append(links)
        else:
            # if forced and we have at least the bib data, index it
            if force is True:
                logger.warning(
                    '%s is missing bib data; even with force=True, indexing cannot proceed',
                    bibcode)
            else:
                logger.debug('%s not ready for indexing yet (metadata=%s, orcid=%s, nonbib=%s, fulltext=%s, metrics=%s)' % \
                            (bibcode, bib_data_updated, orcid_claims_updated, nonbib_data_updated, fulltext_updated, \
                             metrics_updated))
    if batch or batch_insert or batch_update or links_data:
        app.update_remote_targets(solr=batch,
                                  metrics=(batch_insert, batch_update),
                                  links=links_data,
                                  commit_solr=commit)
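
The readiness rule from the docstring (bib_data, orcid_claims and nonbib_data must all be present, and, unless forced, not all of them older than the last `processed` timestamp) can be expressed as a small predicate. Below is a sketch under that reading of the code above; the function name is illustrative only.

def is_ready_for_indexing(record, force=False):
    """Sketch of the readiness check applied in task_index_records."""
    bib = record.get('bib_data_updated')
    orcid = record.get('orcid_claims_updated')
    nonbib = record.get('nonbib_data_updated')
    processed = record.get('processed')

    if force:
        # forced indexing only needs the bibliographic data
        return bool(bib)
    if not (bib and orcid and nonbib):
        return False
    if processed is None:
        return True
    # at least one essential piece must be newer than the last processing run
    return any(ts >= processed for ts in (bib, orcid, nonbib))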