def nonbib_to_master_pipeline(nonbib_engine, schema, batch_size=1):
    """send all nonbib data to queue for delivery to master pipeline

    Streams every row of the nonbib table, converts each to a
    NonBibRecord protobuf and forwards them to master in batches of
    batch_size via task_output_results.  config['MAX_ROWS'] > 0 caps
    the number of rows sent (0 or negative means no cap).
    """
    global config
    session = sessionmaker(bind=nonbib_engine)()
    session.execute('set search_path to {}'.format(schema))

    batch = []          # protobuf dicts not yet forwarded
    sent = 0            # total rows processed so far
    max_rows = config['MAX_ROWS']
    query = session.query(models.NonBibTable).options(
        load_only(*nonbib_to_master_select_fields))
    # yield_per streams rows instead of materializing the whole table
    for row in query.yield_per(100):
        data = nonbib_to_master_dict(row)
        add_data_link(session, data)
        cleanup_for_master(data)
        batch.append(NonBibRecord(**data)._data)
        sent += 1
        if max_rows > 0 and sent >= max_rows:
            # cap reached; the partial batch is flushed below
            break
        if len(batch) >= batch_size:
            recs = NonBibRecordList()
            recs.nonbib_records.extend(batch)
            batch = []
            logger.info("Calling 'app.forward_message' count = '%s'", sent)
            task_output_results.delay(recs)

    # flush whatever is left (tail batch, or the row that hit the cap)
    if batch:
        recs = NonBibRecordList()
        recs.nonbib_records.extend(batch)
        logger.info("Calling 'app.forward_message' with count = '%s'", sent)
        task_output_results.delay(recs)
    session.close()
def test_task_update_record_delete(self):
    """task_update_record with status='deleted' must clear the stored
    payload for that message type while keeping the record itself, and a
    deleted DenormalizedRecord must trigger task_delete_documents."""
    for x, cls in (('fulltext', FulltextUpdate), ('orcid_claims', OrcidClaims)):
        self.app.update_storage('bibcode', x, {'foo': 'bar'})
        self.assertEqual(self.app.get_record('bibcode')[x]['foo'], 'bar')
        with patch('adsmp.tasks.task_index_records.delay') as next_task:
            tasks.task_update_record(
                cls(bibcode='bibcode', status='deleted'))
        # the deleted payload is nulled out, the record remains
        self.assertIsNone(self.app.get_record('bibcode')[x])
        self.assertTrue(self.app.get_record('bibcode'))

    # a deleted NonBibRecord list clears the metrics payload
    recs = NonBibRecordList()
    recs.nonbib_records.extend(
        [NonBibRecord(bibcode='bibcode', status='deleted').data])
    with patch('adsmp.tasks.task_index_records.delay') as next_task:
        tasks.task_update_record(recs)
    self.assertIsNone(self.app.get_record('bibcode')['metrics'])
    self.assertTrue(self.app.get_record('bibcode'))

    # a deleted DenormalizedRecord routes to task_delete_documents
    with patch('adsmp.tasks.task_delete_documents') as next_task:
        tasks.task_update_record(
            DenormalizedRecord(bibcode='bibcode', status='deleted'))
    self.assertTrue(next_task.called)
    # NOTE(review): the original passed ('bibcode',) as assertTrue's msg
    # argument, which is never compared; it was presumably meant to be
    # assertEqual against the call args -- confirm expected call shape.
    self.assertTrue(next_task.call_args[0])
def diagnose_nonbib(): """send hard coded nonbib data the master pipeline useful for testing to verify connectivity""" test_data = { 'bibcode': '2003ASPC..295..361M', 'simbad_objects': [], 'grants': ['g'], 'boost': 0.11, 'citation_count': 0, 'read_count': 2, 'readers': ['a', 'b'], 'reference': ['c', 'd'] } recs = NonBibRecordList() rec = NonBibRecord(**test_data) recs.nonbib_records.extend([rec._data]) print 'sending nonbib data for bibocde', test_data[ 'bibcode'], 'to master pipeline' print 'using CELERY_BROKER', config['CELERY_BROKER'] print ' CELERY_DEFAULT_EXCHANGE', config['CELERY_DEFAULT_EXCHANGE'] print ' CELERY_DEFAULT_EXCHANGE_TYPE', config[ 'CELERY_DEFAULT_EXCHANGE_TYPE'] print ' OUTPUT_CELERY_BROKER', config['OUTPUT_CELERY_BROKER'] print ' OUTPUT_TASKNAME', config['OUTPUT_TASKNAME'] print 'this action did not use ingest database (configured at', config[ 'INGEST_DATABASE'], ')' print ' or the metrics database (at', config['METRICS_DATABASE'], ')' tasks.task_output_results.delay(recs)
def process_bibcodes(self, bibcodes):
    """send nonbib and metrics records to master for the passed bibcodes

    for each bibcode read nonbib data from files, generate nonbib protobuf
    compute metrics, generate protobuf
    """
    # accumulate everything into two protobuf lists so master receives
    # one message per type instead of one per bibcode
    nonbib_batch = NonBibRecordList()
    metrics_batch = MetricsRecordList()
    for bibcode in bibcodes:
        try:
            nonbib = self._read_next_bibcode(bibcode)
            proto = NonBibRecord(**self._convert(nonbib))
            nonbib_batch.nonbib_records.extend([proto._data])
            if self.compute_metrics:
                met_proto = MetricsRecord(**self._compute_metrics(nonbib))
                metrics_batch.metrics_records.extend([met_proto._data])
        except Exception as e:
            # best-effort: log and keep going with the remaining bibcodes
            self.logger.error(
                'serious error in process.process_bibcodes for bibcode {}, error {}'
                .format(bibcode, e))
            self.logger.exception('general stacktrace')
    tasks.task_output_nonbib.delay(nonbib_batch)
    tasks.task_output_metrics.delay(metrics_batch)
def test_task_update_record_augments_list(self):
    """a NonBibRecordList carrying only augment fields (boost) must not
    trigger indexing"""
    with patch('adsmp.tasks.task_index_records.delay') as next_task:
        payloads = [
            {'bibcode': '2003ASPC..295..361M', 'boost': 3.1},
            {'bibcode': '3003ASPC..295..361Z', 'boost': 3.2},
        ]
        recs = NonBibRecordList()
        recs.nonbib_records.extend(
            [NonBibRecord(**p)._data for p in payloads])
        tasks.task_update_record(recs)
        self.assertFalse(next_task.called)
def test_task_update_record_nonbib_list(self):
    """a NonBibRecordList with real nonbib fields (refereed) must trigger
    indexing via task_index_records"""
    with patch('adsmp.tasks.task_index_records.delay') as next_task:
        self.assertFalse(next_task.called)
        recs = NonBibRecordList()
        nonbib_data = {'bibcode': '2003ASPC..295..361M', 'refereed': False}
        nonbib_data2 = {'bibcode': '3003ASPC..295..361Z', 'refereed': True}
        rec = NonBibRecord(**nonbib_data)
        rec2 = NonBibRecord(**nonbib_data2)
        recs.nonbib_records.extend([rec._data, rec2._data])
        tasks.task_update_record(recs)
        self.assertTrue(next_task.called)
        # NOTE(review): the original passed a tuple as assertTrue's msg
        # argument, which is never compared (and its first bibcode did not
        # match the test data).  It was presumably meant to be assertEqual
        # against the bibcodes handed to task_index_records -- confirm the
        # expected call shape before tightening this assertion.
        self.assertTrue(next_task.call_args[0])
def nonbib_delta_to_master_pipeline(nonbib_engine, schema, batch_size=1):
    """send data for changed bibcodes to master pipeline

    the delta table was computed by comparing to sets of nonbib data
    perhaps ingested on succesive days

    Rows are converted to NonBibRecord protobufs and forwarded in
    batches of batch_size; config['MAX_ROWS'] > 0 caps the number of
    rows sent (0 or negative means no cap).
    """
    global config
    Session = sessionmaker(bind=nonbib_engine)
    session = Session()
    session.execute('set search_path to {}'.format(schema))
    tmp = []
    i = 0
    n = nonbib.NonBib(schema)
    max_rows = config['MAX_ROWS']
    for current_delta in session.query(models.NonBibDeltaTable).yield_per(100):
        row = n.get_by_bibcode(nonbib_engine, current_delta.bibcode,
                               nonbib_to_master_select_fields)
        row = nonbib_to_master_dict(row)
        add_data_link(session, row)
        cleanup_for_master(row)
        rec = NonBibRecord(**row)
        tmp.append(rec._data)
        i += 1
        # fixed off-by-one: '>' sent MAX_ROWS+1 rows; '>=' matches the
        # cap behavior of nonbib_to_master_pipeline
        if max_rows > 0 and i >= max_rows:
            break
        if len(tmp) >= batch_size:
            recs = NonBibRecordList()
            recs.nonbib_records.extend(tmp)
            tmp = []
            logger.debug("Calling 'app.forward_message' with '%s' items",
                         len(recs.nonbib_records))
            task_output_results.delay(recs)
    # flush the tail batch (or the row that hit the cap)
    if len(tmp) > 0:
        recs = NonBibRecordList()
        recs.nonbib_records.extend(tmp)
        logger.debug("Calling 'app.forward_message' with final '%s' items",
                     len(recs.nonbib_records))
        task_output_results.delay(recs)
    # release the connection, consistent with nonbib_to_master_pipeline
    session.close()
def nonbib_bibs_to_master_pipeline(nonbib_engine, schema, bibcodes): """send data for the passed bibcodes to master""" Session = sessionmaker(bind=nonbib_engine) session = Session() session.execute('set search_path to {}'.format(schema)) n = nonbib.NonBib(schema) tmp = [] for bibcode in bibcodes: row = n.get_by_bibcode(nonbib_engine, bibcode, nonbib_to_master_select_fields) if row: row = nonbib_to_master_dict(row) add_data_link(session, row) cleanup_for_master(row) rec = NonBibRecord(**row) tmp.append(rec._data) else: print 'unknown bibcode ', bibcode recs = NonBibRecordList() recs.nonbib_records.extend(tmp) logger.debug("Calling 'app.forward_message' for '%s' bibcodes", len(recs.nonbib_records)) task_output_results.delay(recs)