Пример #1
0
 def test_task_update_record_augments_list(self):
     """A NonBibRecordList carrying only boost values must not trigger indexing."""
     payloads = (
         {'bibcode': '2003ASPC..295..361M', 'boost': 3.1},
         {'bibcode': '3003ASPC..295..361Z', 'boost': 3.2},
     )
     with patch('adsmp.tasks.task_index_records.delay') as next_task:
         batch = NonBibRecordList()
         # the list message stores the raw protobuf payload of each record
         batch.nonbib_records.extend(
             [NonBibRecord(**payload)._data for payload in payloads])
         tasks.task_update_record(batch)
         self.assertFalse(next_task.called)
Пример #2
0
 def test_task_update_record_nonbib_list(self):
     """A batched NonBibRecordList must hand all of its bibcodes to the indexer."""
     with patch('adsmp.tasks.task_index_records.delay') as next_task:
         self.assertFalse(next_task.called)
         recs = NonBibRecordList()
         nonbib_data = {'bibcode': '2003ASPC..295..361M', 'refereed': False}
         nonbib_data2 = {'bibcode': '3003ASPC..295..361Z', 'refereed': True}
         rec = NonBibRecord(**nonbib_data)
         rec2 = NonBibRecord(**nonbib_data2)
         recs.nonbib_records.extend([rec._data, rec2._data])
         tasks.task_update_record(recs)
         self.assertTrue(next_task.called)
         # assertTrue(value, msg) only checks truthiness and treats its second
         # argument as the failure message, so the previous check compared
         # nothing.  task_update_record passes the collected bibcodes as one
         # list argument, in the order they appear in the message.
         self.assertEqual(
             next_task.call_args[0],
             (['2003ASPC..295..361M', '3003ASPC..295..361Z'], ))
Пример #3
0
    def test_task_update_record_delete(self):
        """Check 'deleted' messages for single payloads, lists and whole records.

        Deleting a fulltext/orcid_claims payload or a nonbib list entry must
        blank the stored field while keeping the record; deleting a
        DenormalizedRecord must call task_delete_documents.
        """
        for x, cls in (('fulltext', FulltextUpdate), ('orcid_claims',
                                                      OrcidClaims)):
            self.app.update_storage('bibcode', x, {'foo': 'bar'})
            self.assertEqual(self.app.get_record('bibcode')[x]['foo'], 'bar')
            # indexing is patched out only to keep the task from being queued
            with patch('adsmp.tasks.task_index_records.delay'):
                tasks.task_update_record(
                    cls(bibcode='bibcode', status='deleted'))
                self.assertEqual(self.app.get_record('bibcode')[x], None)
                self.assertTrue(self.app.get_record('bibcode'))

        recs = NonBibRecordList()
        recs.nonbib_records.extend(
            [NonBibRecord(bibcode='bibcode', status='deleted').data])
        with patch('adsmp.tasks.task_index_records.delay'):
            tasks.task_update_record(recs)
            # NOTE(review): this inspects 'metrics' although a nonbib record
            # was deleted -- confirm that is the intended field.
            self.assertEqual(self.app.get_record('bibcode')['metrics'], None)
            self.assertTrue(self.app.get_record('bibcode'))

        with patch('adsmp.tasks.task_delete_documents') as next_task:
            tasks.task_update_record(
                DenormalizedRecord(bibcode='bibcode', status='deleted'))
            self.assertTrue(next_task.called)
            # assertTrue(value, msg) never compares its arguments; use
            # assertEqual so the positional arguments are actually verified
            self.assertEqual(next_task.call_args[0], ('bibcode', ))
Пример #4
0
def diagnose_nonbib():
    """send hard coded nonbib data the master pipeline

    useful for testing to verify connectivity"""

    test_data = {
        'bibcode': '2003ASPC..295..361M',
        'simbad_objects': [],
        'grants': ['g'],
        'boost': 0.11,
        'citation_count': 0,
        'read_count': 2,
        'readers': ['a', 'b'],
        'reference': ['c', 'd']
    }
    recs = NonBibRecordList()
    rec = NonBibRecord(**test_data)
    recs.nonbib_records.extend([rec._data])
    print 'sending nonbib data for bibocde', test_data[
        'bibcode'], 'to master pipeline'
    print 'using CELERY_BROKER', config['CELERY_BROKER']
    print '  CELERY_DEFAULT_EXCHANGE', config['CELERY_DEFAULT_EXCHANGE']
    print '  CELERY_DEFAULT_EXCHANGE_TYPE', config[
        'CELERY_DEFAULT_EXCHANGE_TYPE']
    print '  OUTPUT_CELERY_BROKER', config['OUTPUT_CELERY_BROKER']
    print '  OUTPUT_TASKNAME', config['OUTPUT_TASKNAME']
    print 'this action did not use ingest database (configured at', config[
        'INGEST_DATABASE'], ')'
    print '  or the metrics database (at', config['METRICS_DATABASE'], ')'
    tasks.task_output_results.delay(recs)
Пример #5
0
 def test_task_update_record_nonbib(self):
     """A single NonBibRecord must be forwarded to the indexer by bibcode."""
     with patch('adsmp.tasks.task_index_records.delay') as next_task:
         self.assertFalse(next_task.called)
         tasks.task_update_record(
             NonBibRecord(bibcode='2015ApJ...815..133S'))
         self.assertTrue(next_task.called)
         # assertTrue(value, msg) ignores the expected value, and
         # ('2015ApJ...815..133S') is a plain string, not a tuple.
         # task_update_record passes the bibcodes as one list argument.
         self.assertEqual(next_task.call_args[0],
                          (['2015ApJ...815..133S'], ))
Пример #6
0
    def process_bibcodes(self, bibcodes):
        """Queue nonbib and metrics messages to master for the given bibcodes.

        Each bibcode is read and converted into a NonBibRecord; when
        self.compute_metrics is set a MetricsRecord is built from the same
        data.  A failure on one bibcode is logged and the loop moves on.
        Both batches are queued once, after the loop.
        """
        # one list message per kind, so master receives a single batch
        nonbib_batch = NonBibRecordList()
        metrics_batch = MetricsRecordList()

        for bibcode in bibcodes:
            try:
                raw = self._read_next_bibcode(bibcode)
                nonbib_msg = NonBibRecord(**self._convert(raw))
                nonbib_batch.nonbib_records.extend([nonbib_msg._data])
                if self.compute_metrics:
                    metrics_msg = MetricsRecord(**self._compute_metrics(raw))
                    metrics_batch.metrics_records.extend([metrics_msg._data])
            except Exception as err:
                summary = 'serious error in process.process_bibcodes for bibcode {}, error {}'.format(
                    bibcode, err)
                self.logger.error(summary)
                self.logger.exception('general stacktrace')

        tasks.task_output_nonbib.delay(nonbib_batch)
        tasks.task_output_metrics.delay(metrics_batch)
Пример #7
0
def nonbib_to_master_pipeline(nonbib_engine, schema, batch_size=1):
    """Send all nonbib data to the queue for delivery to the master pipeline.

    Rows of models.NonBibTable are converted to NonBibRecord messages and
    queued through task_output_results in NonBibRecordList batches.

    @param nonbib_engine: engine the database session is bound to
    @param schema: name placed in the session's search_path
    @param batch_size: number of records collected before a batch is queued
    """
    Session = sessionmaker(bind=nonbib_engine)
    session = Session()
    # try/finally: release the session even when a row fails to convert
    try:
        session.execute('set search_path to {}'.format(schema))
        tmp = []
        i = 0
        # a positive MAX_ROWS caps how many rows are processed
        max_rows = config['MAX_ROWS']
        q = session.query(models.NonBibTable).options(
            load_only(*nonbib_to_master_select_fields))
        for current_row in q.yield_per(100):
            current_row = nonbib_to_master_dict(current_row)
            add_data_link(session, current_row)
            cleanup_for_master(current_row)
            rec = NonBibRecord(**current_row)
            tmp.append(rec._data)
            i += 1
            if max_rows > 0 and i >= max_rows:
                # rows still in tmp are flushed by the final block below
                break
            if len(tmp) >= batch_size:
                recs = NonBibRecordList()
                recs.nonbib_records.extend(tmp)
                tmp = []
                logger.info("Calling 'app.forward_message' count = '%s'", i)
                task_output_results.delay(recs)

        # send whatever did not fill a complete batch
        if len(tmp) > 0:
            recs = NonBibRecordList()
            recs.nonbib_records.extend(tmp)
            logger.info("Calling 'app.forward_message' with count = '%s'", i)
            task_output_results.delay(recs)
    finally:
        session.close()
Пример #8
0
 def test_task_update_record_nonbib(self):
     """A NonBibRecord update is stored under 'nonbib_data' without indexing."""
     with patch('adsmp.tasks.task_index_records.delay') as next_task:
         tasks.task_update_record(
             NonBibRecord(bibcode='2015ApJ...815..133S', read_count=9))
         # assertEqual replaces the deprecated assertEquals alias
         self.assertEqual(
             self.app.get_record(bibcode='2015ApJ...815..133S')
             ['nonbib_data']['read_count'], 9)
         self.assertFalse(next_task.called)
Пример #9
0
 def test_protobuf(self):
     """Smoke test: a NonBibRecord can be built from converted file data."""
     fake_config = {'INPUT_DATA_ROOT': './test/data1/config/'}
     # NOTE(review): the Processor is created before load_config is patched;
     # confirm the patch is not meant to cover its construction as well.
     with Processor(compute_metrics=False) as processor:
         with patch('adsputils.load_config', return_value=fake_config):
             raw = processor._read_next_bibcode('1057wjlf.book.....C')
             nonbib = NonBibRecord(**processor._convert(raw))
             print('nonbib = {}'.format(nonbib))
Пример #10
0
def _build_nonbib_record(app, citation_change, record, db_versions, status):
    """Assemble a NonBibRecord for `record`, linked to the DOI of the change.

    @param app: application object; app.conf['DOI_URL'] prefixes the DOI
    @param citation_change: object whose `content` attribute holds the DOI
    @param record: source of bibcode, citation_count, citation_count_norm,
        read_count, esources, reference and property
    @param db_versions: mapping whose keys become titles and whose values
        become URLs of an extra ASSOCIATED link row; ignored when it is
        None, {} or {"": ""}
    @param status: value stored in the record's `status` field
    @return: the populated NonBibRecord
    """
    doi = citation_change.content
    nonbib_record_dict = {
        'status': status,
        'bibcode': record.bibcode,
        'boost': 0.5,  # Value between 0 and 1
        'citation_count': record.citation_count,
        'data': [],
        'data_links_rows': [
            {
                'link_type': 'ESOURCE',
                'link_sub_type': 'PUB_HTML',
                'url': [app.conf['DOI_URL'] + doi],
                'title': [''],
                'item_count': 0
            },
        ],  # `item_count` only used for DATA and not ESOURCES
        'citation_count_norm': record.citation_count_norm,
        'grants': [],
        'ned_objects': [],
        # log10-normalized count of citations computed on the classic site
        # but not currently used
        'norm_cites': 0,
        'read_count': record.read_count,
        'readers': [],
        'simbad_objects': [],
        'total_link_counts': 0  # Only used for DATA and not for ESOURCES
    }
    if db_versions not in [{"": ""}, {}, None]:
        nonbib_record_dict['data_links_rows'].append({
            'link_type': 'ASSOCIATED',
            'link_sub_type': '',
            # list(...) keeps these real lists on Python 3, where
            # values()/keys() return views; both iterate in the same order
            'url': list(db_versions.values()),
            'title': list(db_versions.keys()),
            'item_count': 0
        })
    nonbib_record = NonBibRecord(**nonbib_record_dict)
    # these repeated fields are filled after construction
    nonbib_record.esource.extend(record.esources)
    nonbib_record.reference.extend(record.reference)
    nonbib_record.property.extend(record.property)
    return nonbib_record
Пример #11
0
def task_update_record(msg):
    """Receives payload to update the record.

    A 'deleted' message removes the documents for its bibcode.  An 'active'
    message is saved to storage (item by item when it is a list message) and
    the affected bibcodes are then queued for indexing.  Any other status is
    logged as an error.

    @param msg: protobuff that contains at minimum
        - bibcode
        - and specific payload
    """
    logger.debug('Updating record: %s', msg)
    status = app.get_msg_status(msg)

    if status == 'deleted':
        task_delete_documents(msg.bibcode)
    elif status == 'active':
        # named msg_type so the builtin `type` is not shadowed
        msg_type = app.get_msg_type(msg)
        bibcodes = []

        # list messages: the message type doubles as the name of the repeated
        # field, mapped here to the class used to unpack each item
        list_record_classes = {
            'nonbib_records': NonBibRecord,
            'metrics_records': MetricsRecord,
        }
        record_cls = list_record_classes.get(msg_type)

        # save into a database
        # passed msg may contain details on one bibcode or a list of bibcodes
        if record_cls is not None:
            for packed in getattr(msg, msg_type):
                item = record_cls.deserializer(packed.SerializeToString())
                item_type = app.get_msg_type(item)
                bibcodes.append(item.bibcode)
                record = app.update_storage(item.bibcode, item_type,
                                            item.toJSON())
                logger.debug('Saved record from list: %s', record)
        else:
            # here when record has a single bibcode
            bibcodes.append(msg.bibcode)
            record = app.update_storage(msg.bibcode, msg_type, msg.toJSON())
            logger.debug('Saved record: %s', record)

        # trigger further processing
        task_index_records.delay(bibcodes)
    else:
        logger.error('Received a message with unclear status: %s', msg)
Пример #12
0
def nonbib_delta_to_master_pipeline(nonbib_engine, schema, batch_size=1):
    """Send data for changed bibcodes to the master pipeline.

    The delta table was computed by comparing two sets of nonbib data,
    perhaps ingested on successive days.  Each bibcode in
    models.NonBibDeltaTable is looked up, converted to a NonBibRecord and
    queued through task_output_results in NonBibRecordList batches.

    @param nonbib_engine: engine the database session is bound to
    @param schema: name placed in the session's search_path
    @param batch_size: number of records collected before a batch is queued
    """
    Session = sessionmaker(bind=nonbib_engine)
    session = Session()
    # try/finally: the session was previously never closed
    try:
        session.execute('set search_path to {}'.format(schema))
        tmp = []
        i = 0
        n = nonbib.NonBib(schema)
        # a positive MAX_ROWS caps how many rows are processed
        max_rows = config['MAX_ROWS']
        for current_delta in session.query(
                models.NonBibDeltaTable).yield_per(100):
            row = n.get_by_bibcode(nonbib_engine, current_delta.bibcode,
                                   nonbib_to_master_select_fields)
            if not row:
                # a delta entry without a matching nonbib row cannot be
                # converted; skip it instead of failing the whole run
                logger.warning("No nonbib row found for delta bibcode '%s'",
                               current_delta.bibcode)
                continue
            row = nonbib_to_master_dict(row)
            add_data_link(session, row)
            cleanup_for_master(row)
            rec = NonBibRecord(**row)
            tmp.append(rec._data)
            i += 1
            # '>=' stops at exactly max_rows rows; '>' let one extra through
            if max_rows > 0 and i >= max_rows:
                break
            if len(tmp) >= batch_size:
                recs = NonBibRecordList()
                recs.nonbib_records.extend(tmp)
                tmp = []
                logger.debug("Calling 'app.forward_message' with '%s' items",
                             len(recs.nonbib_records))
                task_output_results.delay(recs)

        # send whatever did not fill a complete batch
        if len(tmp) > 0:
            recs = NonBibRecordList()
            recs.nonbib_records.extend(tmp)
            logger.debug("Calling 'app.forward_message' with final '%s' items",
                         len(recs.nonbib_records))
            task_output_results.delay(recs)
    finally:
        session.close()
Пример #13
0
def nonbib_bibs_to_master_pipeline(nonbib_engine, schema, bibcodes):
    """send data for the passed bibcodes to master"""
    Session = sessionmaker(bind=nonbib_engine)
    session = Session()
    session.execute('set search_path to {}'.format(schema))
    n = nonbib.NonBib(schema)
    tmp = []
    for bibcode in bibcodes:
        row = n.get_by_bibcode(nonbib_engine, bibcode,
                               nonbib_to_master_select_fields)
        if row:
            row = nonbib_to_master_dict(row)
            add_data_link(session, row)
            cleanup_for_master(row)
            rec = NonBibRecord(**row)
            tmp.append(rec._data)
        else:
            print 'unknown bibcode ', bibcode
    recs = NonBibRecordList()
    recs.nonbib_records.extend(tmp)
    logger.debug("Calling 'app.forward_message' for '%s' bibcodes",
                 len(recs.nonbib_records))
    task_output_results.delay(recs)