def test_send_email(self):
    email_addr = '*****@*****.**'
    payload_plain = 'plain test'
    payload_html = '<em>html test</em>'

    with patch('smtplib.SMTP') as mock_smtp:
        msg = utils.send_email(email_addr,
                               email_template=myADSTemplate,
                               payload_plain=payload_plain,
                               payload_html=payload_html)

        self.assertTrue(payload_plain in msg.get_payload()[0].get_payload())
        self.assertTrue(payload_html in msg.get_payload()[1].get_payload())
        self.assertTrue(myADSTemplate.subject == msg.get('subject'))
def process_myads(since=None, user_ids=None, user_emails=None, test_send_to=None, admin_email=None, force=False,
                  frequency='daily', test_bibcode=None, **kwargs):
    """
    Processes myADS mailings

    :param since: check for new myADS users since this date
    :param user_ids: users to process claims for, else all users - list (given as adsws IDs)
    :param user_emails: users to process claims for, else all users - list (given as email addresses)
    :param test_send_to: for testing; process a given user ID but send the output to this email address
    :param admin_email: if provided, email is sent to this address at beginning and end of processing
        (does not trigger for processing for individual users)
    :param force: if True, will force processing of emails even if sent for a given user already that day
    :param frequency: basestring; 'daily' or 'weekly'
    :param test_bibcode: bibcode to query to test if Solr searcher has been updated
    :return: no return
    """

    if user_ids:
        for u in user_ids:
            tasks.task_process_myads({'userid': u, 'frequency': frequency, 'force': True,
                                      'test_send_to': test_send_to, 'test_bibcode': test_bibcode})
        logger.info('Done (just the supplied user IDs)')
        return

    if user_emails:
        for u in user_emails:
            r = app.client.get(config.get('API_ADSWS_USER_EMAIL') % u,
                               headers={'Accept': 'application/json',
                                        'Authorization': 'Bearer {0}'.format(config.get('API_TOKEN'))})
            if r.status_code == 200:
                user_id = r.json()['id']
            else:
                logger.warning('Error getting user ID with email {0} from the API. '
                               'Processing aborted for this user'.format(u))
                continue
            tasks.task_process_myads({'userid': user_id, 'frequency': frequency, 'force': True,
                                      'test_send_to': test_send_to, 'test_bibcode': test_bibcode})
        logger.info('Done (just the supplied user emails)')
        return

    logging.captureWarnings(True)

    if admin_email:
        msg = utils.send_email(email_addr=admin_email,
                               payload_plain='Processing started for {}'.format(get_date()),
                               payload_html='Processing started for {}'.format(get_date()),
                               subject='myADS {0} processing has started'.format(frequency))

    # if since keyword not provided, since is set to timestamp of last processing
    if not since or isinstance(since, basestring) and since.strip() == "":
        with app.session_scope() as session:
            if frequency == 'daily':
                kv = session.query(KeyValue).filter_by(key='last.process.daily').first()
            else:
                kv = session.query(KeyValue).filter_by(key='last.process.weekly').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1971-01-01T12:00:00Z'

    users_since_date = get_date(since)
    logger.info('Processing {0} myADS queries since: {1}'.format(frequency, users_since_date.isoformat()))

    last_process_date = get_date()
    all_users = app.get_users(users_since_date.isoformat())

    for user in all_users:
        try:
            tasks.task_process_myads.delay({'userid': user, 'frequency': frequency, 'force': force,
                                            'test_bibcode': test_bibcode})
        except:  # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', user
            tasks.task_process_myads.delay({'userid': user, 'frequency': frequency, 'force': force,
                                            'test_bibcode': test_bibcode})

    # update last processed timestamp
    with app.session_scope() as session:
        if frequency == 'daily':
            kv = session.query(KeyValue).filter_by(key='last.process.daily').first()
        else:
            kv = session.query(KeyValue).filter_by(key='last.process.weekly').first()
        if kv is None:
            if frequency == 'daily':
                kv = KeyValue(key='last.process.daily', value=last_process_date.isoformat())
            else:
                kv = KeyValue(key='last.process.weekly', value=last_process_date.isoformat())
            session.add(kv)
        else:
            kv.value = last_process_date.isoformat()
        session.commit()

    print 'Done submitting {0} myADS processing tasks for {1} users.'.format(frequency, len(all_users))
    logger.info('Done submitting {0} myADS processing tasks for {1} users.'.format(frequency, len(all_users)))
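# Illustrative usage sketch (not part of the pipeline): reprocessing the daily
# notifications for a couple of specific accounts and routing the output to a
# test inbox instead of the users. The user IDs and address are placeholders.
def _example_manual_reprocess():
    process_myads(user_ids=[1234, 5678],
                  test_send_to='myads-test@example.com',
                  frequency='daily')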
        try:
            arxiv_complete = _arxiv_ingest_complete(sleep_delay=300, sleep_timeout=36000)
        except Exception as e:
            logger.warning('arXiv ingest: code failed with an exception: {0}'.format(e))

        if arxiv_complete:
            logger.info('arXiv ingest: complete')
            if args.wait_send:
                logger.info('arXiv ingest: waiting {0} seconds for all SOLR searchers to sync data'.format(args.wait_send))
                time.sleep(args.wait_send)
            logger.info('arXiv ingest: starting processing')
            process_myads(args.since_date, args.user_ids, args.user_emails, args.test_send_to, args.admin_email,
                          args.force, frequency='daily', test_bibcode=arxiv_complete)
        else:
            logger.warning('arXiv ingest: failed.')
            if args.admin_email:
                msg = utils.send_email(email_addr=args.admin_email,
                                       payload_plain='Error in the arXiv ingest',
                                       payload_html='Error in the arXiv ingest',
                                       subject='arXiv ingest failed')

    if args.weekly_update:
        astro_complete = False
        try:
            astro_complete = _astro_ingest_complete(sleep_delay=300, sleep_timeout=36000)
        except Exception as e:
            logger.warning('astro ingest: code failed with an exception: {0}'.format(e))

        if astro_complete:
            logger.info('astro ingest: complete')
            if args.wait_send:
                logger.info('astro ingest: waiting {0} seconds for all SOLR searchers to sync data'.format(args.wait_send))
                time.sleep(args.wait_send)
            logger.info('astro ingest: starting processing now')
            process_myads(args.since_date, args.user_ids, args.user_emails, args.test_send_to, args.admin_email,
                          args.force, frequency='weekly', test_bibcode=astro_complete)
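# Minimal argparse sketch (illustrative only, not the actual run.py parser).
# The dest names match the `args.<attr>` attributes read above; the option
# strings and defaults here are assumptions.
def _example_arg_parser():
    import argparse
    parser = argparse.ArgumentParser(description='Process myADS notifications')
    parser.add_argument('--weekly_update', dest='weekly_update', action='store_true')
    parser.add_argument('--since', dest='since_date', default=None)
    parser.add_argument('--user_ids', dest='user_ids', nargs='+', type=int, default=None)
    parser.add_argument('--user_emails', dest='user_emails', nargs='+', default=None)
    parser.add_argument('--test_send_to', dest='test_send_to', default=None)
    parser.add_argument('--admin_email', dest='admin_email', default=None)
    parser.add_argument('--force', dest='force', action='store_true')
    parser.add_argument('--wait_send', dest='wait_send', type=int, default=0)
    return parser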
def task_process_myads(message):
    """
    Process the myADS notifications for a given user

    :param message: contains the message inside the packet
        {
         'userid': adsws user ID,
         'frequency': 'daily' or 'weekly',
         'force': Boolean (if present, we'll reprocess myADS notifications for the user,
                  even if they were already processed today),
         'test_send_to': email address to send output to, if not that of the user (for testing),
         'retries': number of retries attempted
        }
    :return: no return
    """

    if 'userid' not in message:
        logger.error('No user ID received for {0}'.format(message))
        return
    if 'frequency' not in message:
        logger.error('No frequency received for {0}'.format(message))
        return

    userid = message['userid']
    with app.session_scope() as session:
        try:
            q = session.query(AuthorInfo).filter_by(id=userid).one()
            last_sent = q.last_sent
        except ormexc.NoResultFound:
            author = AuthorInfo(id=userid, created=adsputils.get_date(), last_sent=None)
            session.add(author)
            session.flush()
            last_sent = author.last_sent
            session.commit()

    if last_sent and last_sent.date() == adsputils.get_date().date():
        # already sent email today
        if not message['force']:
            logger.warning('Email for user {0} already sent today'.format(userid))
            return
        else:
            logger.info('Email for user {0} already sent today, but force mode is on'.format(userid))

    # first fetch the myADS setup from /vault/get-myads
    r = app.client.get(app.conf.get('API_VAULT_MYADS_SETUP') % userid,
                       headers={'Accept': 'application/json',
                                'Authorization': 'Bearer {0}'.format(app.conf.get('API_TOKEN'))})

    if r.status_code != 200:
        if message.get('retries', None):
            retries = message['retries']
        else:
            retries = 0
        if retries < app.conf.get('TOTAL_RETRIES', 3):
            message['retries'] = retries + 1
            task_process_myads.apply_async(args=(message,), countdown=app.conf.get('MYADS_RESEND_WINDOW', 3600))
            logger.warning('Failed getting myADS setup for {0}; will try again later. Retry {1}'.format(userid, retries))
            return
        else:
            logger.warning('Maximum number of retries attempted for {0}. myADS processing failed.'.format(userid))
            return

    if message.get('test_bibcode', None):
        # check that the solr searcher we're getting is still ok by querying for the test bibcode
        q = app.client.get('{0}?q=identifier:{1}&fl=bibcode,identifier,entry_date'.format(
                               app.conf.get('API_SOLR_QUERY_ENDPOINT'), message.get('test_bibcode')),
                           headers={'Authorization': 'Bearer ' + app.conf.get('API_TOKEN')})

        fail = True
        if q.status_code != 200:
            logger.warning('Error retrieving the test bibcode {0} from solr while processing for user {1}. '
                           'Retrying'.format(message.get('test_bibcode'), userid))
        elif q.json()['response']['numFound'] == 0:
            logger.warning('Test bibcode {0} not found in solr while processing for user {1}. '
                           'Retrying'.format(message.get('test_bibcode'), userid))
        else:
            fail = False

        if fail:
            if message.get('solr_retries', None):
                retries = message['solr_retries']
            else:
                retries = 0
            if retries < app.conf.get('TOTAL_RETRIES', 3):
                message['solr_retries'] = retries + 1
                task_process_myads.apply_async(args=(message,),
                                               countdown=app.conf.get('MYADS_SOLR_RESEND_WINDOW', 3600))
                logger.warning('Solr error occurred while processing myADS email for user {0}; rerunning. '
                               'Retry {1}'.format(userid, retries))
                return
            else:
                logger.warning('Maximum number of retries attempted for {0}. myADS processing failed: '
                               'solr searchers were not updated.'.format(userid))
                return

    # then execute each qid /vault/execute-query/qid
    setup = r.json()
    payload = []
    for s in setup:
        if s['frequency'] == message['frequency']:
            # only return 5 results, unless it's the daily arXiv posting, then return max
            # TODO should all stateful queries return all results or will this be overwhelming for some? well-cited
            # users can get 40+ new cites in one weekly astro update
            if s['frequency'] == 'daily' and s['stateful'] is False:
                s['rows'] = 2000
            else:
                s['rows'] = 5
            s['fields'] = 'bibcode,title,author_norm,identifier,year,bibstem'

            if s['type'] == 'query':
                qtype = 'general'
                try:
                    raw_results = utils.get_query_results(s)
                except RuntimeError:
                    if message.get('query_retries', None):
                        retries = message['query_retries']
                    else:
                        retries = 0
                    if retries < app.conf.get('TOTAL_RETRIES', 3):
                        message['query_retries'] = retries + 1
                        logger.warning('Error getting query results for user {0}. Retrying. '
                                       'Retry: {1}'.format(userid, retries))
                        task_process_myads.apply_async(args=(message,), countdown=app.conf.get('MYADS_RESEND_WINDOW', 3600))
                        return
                    else:
                        logger.warning('Maximum number of query retries attempted for user {0}; myADS processing '
                                       'failed due to retrieving query results failures.'.format(userid))
                        # skip this setup; raw_results was never retrieved for it
                        continue
            elif s['type'] == 'template':
                qtype = s['template']
                try:
                    raw_results = utils.get_template_query_results(s)
                except RuntimeError:
                    if message.get('query_retries', None):
                        retries = message['query_retries']
                    else:
                        retries = 0
                    if retries < app.conf.get('TOTAL_RETRIES', 3):
                        message['query_retries'] = retries + 1
                        logger.warning('Error getting template query results for user {0}. Retrying. '
                                       'Retry: {1}'.format(userid, retries))
                        task_process_myads.apply_async(args=(message,), countdown=app.conf.get('MYADS_RESEND_WINDOW', 3600))
                        return
                    else:
                        logger.warning('Maximum number of query retries attempted for user {0}; myADS processing '
                                       'failed due to retrieving query results failures.'.format(userid))
                        # skip this setup; raw_results was never retrieved for it
                        continue
            else:
                logger.warning('Wrong query type passed for query {0}, user {1}'.format(s, userid))
                # skip malformed setups rather than falling through with undefined results
                continue

            for r in raw_results:
                # for stateful queries, remove previously seen results, store new results
                if s['stateful']:
                    docs = r['results']
                    bibcodes = [doc['bibcode'] for doc in docs]
                    if s.get('qid', None):
                        good_bibc = app.get_recent_results(user_id=userid,
                                                           qid=s['qid'],
                                                           input_results=bibcodes,
                                                           ndays=app.conf.get('STATEFUL_RESULTS_DAYS', 7))
                    else:
                        good_bibc = app.get_recent_results(user_id=userid,
                                                           setup_id=s['id'],
                                                           input_results=bibcodes,
                                                           ndays=app.conf.get('STATEFUL_RESULTS_DAYS', 7))
                    results = [doc for doc in docs if doc['bibcode'] in good_bibc]
                else:
                    results = r['results']

                payload.append({'name': r['name'],
                                'query_url': r['query_url'],
                                'results': results,
                                'query': r['query'],
                                'qtype': qtype,
                                'id': s['id']})
        else:
            # wrong frequency for this round of processing
            pass

    if len(payload) == 0:
        logger.info('No payload for user {0} for the {1} email. No email was sent.'.format(userid, message['frequency']))
        return

    # if test email address provided, send there; otherwise fetch user email address
    if message.get('test_send_to', None):
        email = message.get('test_send_to')
    else:
        email = utils.get_user_email(userid=userid)

    if message['frequency'] == 'daily':
        subject = 'Daily arXiv myADS Notification'
    else:
        subject = 'Weekly myADS Notification'

    payload_plain = utils.payload_to_plain(payload)
    if len(payload) < app.conf.get('NUM_QUERIES_TWO_COL', 3):
        payload_html = utils.payload_to_html(payload, col=1, frequency=message['frequency'], email_address=email)
    else:
        payload_html = utils.payload_to_html(payload, col=2, frequency=message['frequency'], email_address=email)

    msg = utils.send_email(email_addr=email,
                           email_template=myADSTemplate,
                           payload_plain=payload_plain,
                           payload_html=payload_html,
                           subject=subject)

    if msg:
        # update author table w/ last sent datetime
        with app.session_scope() as session:
            q = session.query(AuthorInfo).filter_by(id=userid).one()
            q.last_sent = adsputils.get_date()
            session.commit()
    else:
        if message.get('send_retries', None):
            retries = message['send_retries']
        else:
            retries = 0
        if retries < app.conf.get('TOTAL_RETRIES', 3):
            message['send_retries'] = retries + 1
            task_process_myads.apply_async(args=(message,), countdown=app.conf.get('MYADS_RESEND_WINDOW', 3600))
            logger.warning('Error sending myADS email for user {0}, email {1}; rerunning. '
                           'Retry {2}'.format(userid, email, retries))
            return
        else:
            logger.warning('Maximum number of retries attempted for {0}. myADS processing failed at '
                           'sending the email.'.format(userid))
            return
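# Illustrative sketch only: task_process_myads treats the /vault/get-myads
# response as a list of dicts with the keys shown below. The values here are
# made up, and the real vault payload may carry additional fields.
EXAMPLE_MYADS_SETUP = [
    {'id': 123,             # setup ID, used for stateful result bookkeeping
     'qid': None,           # query ID, present for saved general queries
     'type': 'template',    # 'query' or 'template'
     'template': 'arxiv',   # template name, used as the payload qtype
     'frequency': 'daily',  # matched against message['frequency']
     'stateful': False},    # stateful setups filter out previously seen bibcodes
]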
def _arxiv_ingest_complete(date=None, sleep_delay=60, sleep_timeout=7200, admin_email=None):
    """
    Check if new arXiv records are in Solr - run before running myADS processing

    :param date: date to check arXiv records for; default is set by days-delta from today in config (times in local time)
    :param sleep_delay: number of seconds to sleep between retries
    :param sleep_timeout: number of seconds to retry in total before timing out completely
    :param admin_email: if provided, email is sent to this address if the arXiv ingest fails or times out
    :return: test bibcode or None
    """

    if not date:
        date = (datetime.datetime.today() - datetime.timedelta(days=config.get('ARXIV_TIMEDELTA_DAYS'))).strftime('%Y-%m-%d')
    else:
        date = get_date(date).strftime('%Y-%m-%d')

    arxiv_file = os.path.join(config.get('ARXIV_INCOMING_ABS_DIR'), date, 'new_records.tsv')

    arxiv_records = []
    try:
        with open(arxiv_file, 'rt') as flist:
            for l in flist.readlines():
                # sample line: 2012.14424 oai/arXiv.org/2012/14424
                arxiv_records.append(l.split()[0])
    except IOError:
        logger.warning('arXiv ingest file not found. Exiting.')
        if admin_email:
            msg_text = 'The arXiv ingest file for today was not found.'
            msg = utils.send_email(email_addr=admin_email,
                                   payload_plain=msg_text,
                                   payload_html=msg_text,
                                   subject='arXiv ingest failed')
        return None

    arxiv_records.sort()

    # get most recent arXiv id to test ingest later
    last_id = arxiv_records.pop()

    total_delay = 0
    while total_delay < sleep_timeout:
        total_delay += sleep_delay
        r = app.client.get('{0}?q=identifier:{1}&fl=bibcode,identifier,entry_date'.format(
                               config.get('API_SOLR_QUERY_ENDPOINT'), last_id),
                           headers={'Authorization': 'Bearer ' + config.get('API_TOKEN')})
        if r.status_code != 200:
            time.sleep(sleep_delay)
            logger.error('Error retrieving record for {0} from Solr ({1} {2}), retrying'.
                         format(last_id, r.status_code, r.text))
            continue

        numfound = r.json()['response']['numFound']
        if numfound == 0:
            # nothing found, try again after a sleep
            time.sleep(sleep_delay)
            logger.info('arXiv ingest not complete (test arXiv id: {0}). Sleeping {1}s, for a total delay of {2}s.'.
                        format(last_id, sleep_delay, total_delay))
            continue
        if numfound > 1:
            # returning this as true for now, since technically something was found
            logger.error('Too many records returned for id {0}'.format(last_id))

        logger.info('Numfound: {0} for test id {1}. Response: {2}. URL: {3}'.format(numfound, last_id,
                                                                                    json.dumps(r.json()), r.url))

        # check number of bibcodes from ingest
        if get_date().weekday() == 0:
            start_date = (get_date() - datetime.timedelta(days=3)).date()
        else:
            start_date = (get_date() - datetime.timedelta(days=1)).date()
        beg_pubyear = (get_date() - datetime.timedelta(days=180)).year
        q = app.client.get('{0}?q={1}'.format(config.get('API_SOLR_QUERY_ENDPOINT'),
                                              quote_plus('bibstem:arxiv entdate:["{0}Z00:00" TO NOW] '
                                                         'pubdate:[{1}-00 TO *]'.format(start_date, beg_pubyear))),
                           headers={'Authorization': 'Bearer ' + config.get('API_TOKEN')})
        logger.info('Total number of arXiv bibcodes ingested: {}'.format(q.json()['response']['numFound']))

        return last_id

    logger.warning('arXiv ingest did not complete within the {0}s timeout limit. Exiting.'.format(sleep_timeout))
    if admin_email:
        msg_text = 'The arXiv ingest was not replicated to production Solr before timeout reached.'
        msg = utils.send_email(email_addr=admin_email,
                               payload_plain=msg_text,
                               payload_html=msg_text,
                               subject='arXiv ingest failed')
    return None
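# Minimal local-config sketch (values are placeholders, not production settings)
# listing the config keys _arxiv_ingest_complete reads via config.get(); only
# the key names are taken from the code above, the values are assumptions.
EXAMPLE_ARXIV_CONFIG = {
    'ARXIV_TIMEDELTA_DAYS': 1,                        # how many days back to look for the abs ingest directory
    'ARXIV_INCOMING_ABS_DIR': '/path/to/arxiv/logs',  # placeholder; per-date subdirs hold new_records.tsv
    'API_SOLR_QUERY_ENDPOINT': 'https://api.adsabs.harvard.edu/v1/search/query',
    'API_TOKEN': 'XXXXXXXX',                          # placeholder token
}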
def _astro_ingest_complete(date=None, sleep_delay=60, sleep_timeout=7200, admin_email=None):
    """
    Check if new astronomy records are in Solr; run before weekly processing

    :param date: date to check against the astronomy bibcode list's last updated date
    :param sleep_delay: number of seconds to sleep between retries
    :param sleep_timeout: number of seconds to retry in total before timing out completely
    :param admin_email: if provided, email is sent to this address if the astronomy ingest fails or times out
    :return: test bibcode or None
    """

    if not date:
        date = (datetime.datetime.today() - datetime.timedelta(days=config.get('ASTRO_TIMEDELTA_DAYS')))
    else:
        date = get_date(date)

    astro_file = config.get('ASTRO_INCOMING_DIR') + 'matches.input'

    # make sure file is present and check modified datestamp on file - should be recent (otherwise contains old data)
    try:
        mod_date = datetime.datetime.fromtimestamp(os.path.getmtime(astro_file))
    except OSError:
        mod_date = None

    # if the file is old or missing, sleep until the file is present and updated
    if not mod_date or mod_date < date:
        total_delay = 0
        while total_delay < sleep_timeout:
            total_delay += sleep_delay
            time.sleep(sleep_delay)
            try:
                mod_date = datetime.datetime.fromtimestamp(os.path.getmtime(astro_file))
            except OSError:
                mod_date = None
            if mod_date and mod_date > date:
                break
        else:
            # timeout reached before astronomy update completed
            logger.warning('Astronomy update did not complete within the {0}s timeout limit. Exiting.'.format(sleep_timeout))
            if admin_email:
                msg_text = 'Astronomy ingest file has not been updated recently, and timeout limit for waiting for ' \
                           'update has been reached.'
                msg = utils.send_email(email_addr=admin_email,
                                       payload_plain=msg_text,
                                       payload_html=msg_text,
                                       subject='Astronomy ingest failed')
            return None

    # make sure the ingest file exists and has enough bibcodes
    total_delay = 0
    while total_delay < sleep_timeout:
        astro_records = []
        try:
            with open(astro_file, 'rt') as flist:
                for l in flist.readlines():
                    # sample line: 2019A&A...632A..94J K58-37447
                    astro_records.append(l.split()[0])
        except IOError:
            time.sleep(sleep_delay)
            total_delay += sleep_delay
            logger.warning('Error opening astronomy ingest file. Sleeping {0}s, for a total delay of {1}s'.
                           format(sleep_delay, total_delay))
            continue

        if len(astro_records) < 10:
            time.sleep(sleep_delay)
            total_delay += sleep_delay
            logger.warning('Astronomy ingest file too small - ingest not complete. Sleeping {0}s, for a total delay of {1}s'.
                           format(sleep_delay, total_delay))
            continue
        else:
            break
    else:
        if admin_email:
            msg_text = 'The astronomy ingest file has been updated recently, but there was an error opening it ' \
                       'and/or it has very few records. The timeout limit for waiting for the update has been reached.'
            msg = utils.send_email(email_addr=admin_email,
                                   payload_plain=msg_text,
                                   payload_html=msg_text,
                                   subject='Astronomy ingest failed')
        return None

    # get several randomly selected bibcodes, in case one had ingest issues
    sample = random.sample(astro_records, config.get('ASTRO_SAMPLE_SIZE'))

    # check that the astronomy records have made it into solr
    total_delay = 0
    while total_delay < sleep_timeout:
        num_sampled = 0
        for s in sample:
            num_sampled += 1
            r = app.client.get('{0}?q=identifier:{1}&fl=bibcode,identifier,entry_date'.format(
                                   config.get('API_SOLR_QUERY_ENDPOINT'), s),
                               headers={'Authorization': 'Bearer ' + config.get('API_TOKEN')})
            # if there's a solr error, sleep then move to the next bibcode
            if r.status_code != 200:
                time.sleep(sleep_delay)
                total_delay += sleep_delay
                logger.error('Error retrieving bibcode {0} from Solr ({1} {2}), sleeping {3}s, for a total delay of {4}s'.
                             format(s, r.status_code, r.text, sleep_delay, total_delay))
                continue

            numfound = r.json()['response']['numFound']
            if numfound == 0:
                # nothing found - if all bibcodes in the sample were tried, sleep then start the while loop again
                if num_sampled == config.get('ASTRO_SAMPLE_SIZE'):
                    time.sleep(sleep_delay)
                    total_delay += sleep_delay
                    logger.warning('Astronomy ingest not complete for all in sample (sample: {0}). Sleeping {1}s, '
                                   'for a total delay of {2}s.'.format(sample, sleep_delay, total_delay))
                # if we haven't tried the others in the sample, try the rest
                else:
                    logger.info('Astronomy ingest not complete (test astro bibcode: {0}). Trying the next in '
                                'the sample.'.format(s))
                continue
            elif numfound > 1:
                # returning this as true for now, since technically something was found
                logger.error('Too many records returned for bibcode {0}'.format(s))

            logger.info('Numfound: {0} for test bibcode {1}. Response: {2}. URL: {3}'.format(numfound, s,
                                                                                             json.dumps(r.json()), r.url))
            return s

    logger.warning('Astronomy ingest did not complete within the {0}s timeout limit. Exiting.'.format(sleep_timeout))
    if admin_email:
        msg_text = 'The astronomy ingest was not replicated to production Solr before timeout reached.'
        msg = utils.send_email(email_addr=admin_email,
                               payload_plain=msg_text,
                               payload_html=msg_text,
                               subject='Astronomy ingest failed')
    return None
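# Illustrative debugging sketch: checking the astronomy ingest status by hand
# with short delays, assuming the config keys above are set and matches.input
# is reachable. The email address is a placeholder.
def _example_astro_check():
    bibcode = _astro_ingest_complete(sleep_delay=10, sleep_timeout=60,
                                     admin_email='myads-admin@example.com')
    if bibcode:
        logger.info('Astronomy ingest looks complete; test bibcode: {0}'.format(bibcode))
    else:
        logger.warning('Astronomy ingest incomplete or timed out')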