def _print_record(bibcode): with app.session_scope() as session: print 'stored by us:', bibcode r = session.query(Records).filter_by(bibcode=bibcode).first() if r: print json.dumps(r.toJSON(), indent=2, default=str, sort_keys=True) else: print 'None' print '-' * 80 print 'as seen by SOLR' solr_doc = solr_updater.transform_json_record(r.toJSON()) print json.dumps(solr_doc, indent=2, default=str, sort_keys=True) print '=' * 80
def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True, update_links=True, commit=False, ignore_checksums=False, solr_targets=None, update_timestamps=True):
    """Receives the bibcode of a document that was updated.
    (note: we could have sent the full record however we don't
    do it because the messages might be delayed and we can have
    multiple workers updating the same record; so we want to
    look into the database and get the most recent version)

    Receives bibcodes and checks the database if we have all the
    necessary pieces to push to solr. If not, then postpone and
    push later.

    We consider a record to be 'ready' if those pieces were updated
    (and were updated later than the last 'processed' timestamp):

        - bib_data
        - nonbib_data
        - orcid_claims

    'fulltext' is not considered essential; but updates to fulltext will
    trigger a solr_update (so it might happen that a document will get
    indexed twice; first with only metadata and later on incl fulltext)

    :param bibcodes: a single bibcode or a list of bibcodes to (re)index
    :param force: index as long as bib_data exists, even if not 'complete'
    :param ignore_checksums: push even when the stored checksum matches
    :param solr_targets: passed through to update_remote_targets as solr_urls

    NOTE(review): a second function with the same name is defined later in
    this file and will shadow this one at import time — confirm which
    version is intended to be live.
    """
    if isinstance(bibcodes, basestring):
        bibcodes = [bibcodes]

    if not (update_solr or update_metrics or update_links):
        raise Exception('Hmmm, I dont think I let you do NOTHING, sorry!')

    logger.debug('Running index-records for: %s', bibcodes)
    # accumulators for the three remote targets
    batch = []          # solr payloads
    batch_insert = []   # metrics rows for records never processed before
    batch_update = []   # metrics rows for previously processed records
    links_data = []
    links_url = app.conf.get('LINKS_RESOLVER_UPDATE_URL')

    #check if we have complete record
    for bibcode in bibcodes:
        r = app.get_record(bibcode)

        if r is None:
            logger.error('The bibcode %s doesn\'t exist!', bibcode)
            continue

        augments_updated = r.get('augments_updated', None)
        bib_data_updated = r.get('bib_data_updated', None)
        fulltext_updated = r.get('fulltext_updated', None)
        metrics_updated = r.get('metrics_updated', None)
        nonbib_data_updated = r.get('nonbib_data_updated', None)
        orcid_claims_updated = r.get('orcid_claims_updated', None)

        # records never processed compare against an epoch well before any
        # real update timestamp, so they always look stale (i.e. ready)
        year_zero = '1972'
        processed = r.get('processed', adsputils.get_date(year_zero))
        if processed is None:
            processed = adsputils.get_date(year_zero)

        is_complete = all([bib_data_updated, orcid_claims_updated, nonbib_data_updated])

        if is_complete or (force is True and bib_data_updated):
            # skip when every piece predates the last processing run
            # (unless forced)
            if force is False and all([
                    augments_updated and augments_updated < processed,
                    bib_data_updated and bib_data_updated < processed,
                    nonbib_data_updated and nonbib_data_updated < processed,
                    orcid_claims_updated and orcid_claims_updated < processed
                    ]):
                logger.debug('Nothing to do for %s, it was already indexed/processed', bibcode)
                continue

            if force:
                logger.debug('Forced indexing of: %s (metadata=%s, orcid=%s, nonbib=%s, fulltext=%s, metrics=%s, augments=%s)' % \
                    (bibcode, bib_data_updated, orcid_claims_updated, nonbib_data_updated, fulltext_updated, \
                     metrics_updated, augments_updated))

            # build the solr record
            if update_solr:
                d = solr_updater.transform_json_record(r)
                logger.debug('Built SOLR: %s', d)
                # checksum guard avoids re-pushing identical documents
                if ignore_checksums or r.get('solr_checksum', None) != app.checksum(d):
                    batch.append(d)
                else:
                    logger.debug('Checksum identical, skipping solr update for: %s', bibcode)

            # get data for metrics
            if update_metrics:
                m = r.get('metrics', None)
                if (m and ignore_checksums) or (m and r.get('metrics_checksum', None) != app.checksum(m)):
                    m['bibcode'] = bibcode
                    logger.debug('Got metrics: %s', m)
                    # 'processed' present -> row already exists in metrics db
                    if r.get('processed'):
                        batch_update.append(m)
                    else:
                        batch_insert.append(m)
                else:
                    logger.debug('Checksum identical, skipping metrics update for: %s', bibcode)

            if update_links and links_url:
                links = app.generate_links_for_resolver(r)
                if links:
                    checksum = app.checksum(links)
                    if ignore_checksums or r.get('datalinks_checksum', None) != checksum:
                        links_data.append(links)
        else:
            # if forced and we have at least the bib data, index it
            if force is True:
                logger.warn('%s is missing bib data, even with force=True, this cannot proceed', bibcode)
            else:
                logger.debug('%s not ready for indexing yet (metadata=%s, orcid=%s, nonbib=%s, fulltext=%s, metrics=%s, augments=%s)' % \
                    (bibcode, bib_data_updated, orcid_claims_updated, nonbib_data_updated, fulltext_updated, \
                     metrics_updated, augments_updated))

    # single fan-out call handles solr, metrics and link-resolver updates
    if batch or batch_insert or batch_update or links_data:
        app.update_remote_targets(solr=batch, metrics=(batch_insert, batch_update), links=links_data, commit_solr=commit, solr_urls=solr_targets, update_timestamps=update_timestamps)
def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True, update_links=True, commit=False, ignore_checksums=False, solr_targets=None, update_processed=True, priority=0):
    """Receives bibcodes that need production store updated
    Receives bibcodes and checks the database if we have all the
    necessary pieces to push to production store. If not, then postpone and
    send later.

    we consider a record to be ready for solr if these pieces were updated
    (and were updated later than the last 'processed' timestamp):

        - bib_data
        - nonbib_data
        - orcid_claims

    if the force flag is true only bib_data is needed

    for solr, 'fulltext' is not considered essential; but updates to fulltext will
    trigger a solr_update (so it might happen that a document will get
    indexed twice; first with only metadata and later on incl fulltext)

    :param update_processed: forwarded to the downstream indexing tasks
    :param priority: NOTE(review) currently unused in this body — confirm
        whether it should be passed to apply_async
    """
    if isinstance(bibcodes, basestring):
        bibcodes = [bibcodes]

    if not (update_solr or update_metrics or update_links):
        raise Exception('Hmmm, I dont think I let you do NOTHING, sorry!')

    logger.debug('Running index-records for: %s', bibcodes)
    # per-target payload lists; checksums travel in parallel lists so the
    # downstream tasks can persist them after a successful push
    solr_records = []
    metrics_records = []
    links_data_records = []
    solr_records_checksum = []
    metrics_records_checksum = []
    links_data_records_checksum = []
    links_url = app.conf.get('LINKS_RESOLVER_UPDATE_URL')

    if update_solr:
        fields = None  # Load all the fields since solr records grab data from almost everywhere
    else:
        # Optimization: load only fields that will be used
        fields = ['bibcode', 'augments_updated', 'bib_data_updated', 'fulltext_updated', 'metrics_updated', 'nonbib_data_updated', 'orcid_claims_updated', 'processed']
        if update_metrics:
            fields += ['metrics', 'metrics_checksum']
        if update_links:
            fields += ['nonbib_data', 'bib_data', 'datalinks_checksum']

    # check if we have complete record
    for bibcode in bibcodes:
        r = app.get_record(bibcode, load_only=fields)

        if r is None:
            logger.error('The bibcode %s doesn\'t exist!', bibcode)
            continue

        augments_updated = r.get('augments_updated', None)
        bib_data_updated = r.get('bib_data_updated', None)
        fulltext_updated = r.get('fulltext_updated', None)
        metrics_updated = r.get('metrics_updated', None)
        nonbib_data_updated = r.get('nonbib_data_updated', None)
        orcid_claims_updated = r.get('orcid_claims_updated', None)

        # unprocessed records compare against a 1972 epoch so every piece
        # looks newer than 'processed' (i.e. the record is indexable)
        year_zero = '1972'
        processed = r.get('processed', adsputils.get_date(year_zero))
        if processed is None:
            processed = adsputils.get_date(year_zero)

        is_complete = all([bib_data_updated, orcid_claims_updated, nonbib_data_updated])

        if is_complete or (force is True and bib_data_updated):
            # nothing changed since the last processing run -> skip
            # (unless forced)
            if force is False and all([
                    augments_updated and augments_updated < processed,
                    bib_data_updated and bib_data_updated < processed,
                    nonbib_data_updated and nonbib_data_updated < processed,
                    orcid_claims_updated and orcid_claims_updated < processed
                    ]):
                logger.debug('Nothing to do for %s, it was already indexed/processed', bibcode)
                continue

            if force:
                logger.debug('Forced indexing of: %s (metadata=%s, orcid=%s, nonbib=%s, fulltext=%s, metrics=%s, augments=%s)' % (bibcode, bib_data_updated, orcid_claims_updated, nonbib_data_updated, fulltext_updated, metrics_updated, augments_updated))

            # build the solr record
            if update_solr:
                solr_payload = solr_updater.transform_json_record(r)
                logger.debug('Built SOLR: %s', solr_payload)
                solr_checksum = app.checksum(solr_payload)
                if ignore_checksums or r.get('solr_checksum', None) != solr_checksum:
                    solr_records.append(solr_payload)
                    solr_records_checksum.append(solr_checksum)
                else:
                    logger.debug('Checksum identical, skipping solr update for: %s', bibcode)

            # get data for metrics
            if update_metrics:
                metrics_payload = r.get('metrics', None)
                # checksum of '' when no metrics exist keeps the call safe
                metrics_checksum = app.checksum(metrics_payload or '')
                if (metrics_payload and ignore_checksums) or (metrics_payload and r.get('metrics_checksum', None) != metrics_checksum):
                    metrics_payload['bibcode'] = bibcode
                    logger.debug('Got metrics: %s', metrics_payload)
                    metrics_records.append(metrics_payload)
                    metrics_records_checksum.append(metrics_checksum)
                else:
                    logger.debug('Checksum identical or no metrics data available, skipping metrics update for: %s', bibcode)

            if update_links and links_url:
                datalinks_payload = app.generate_links_for_resolver(r)
                if datalinks_payload:
                    datalinks_checksum = app.checksum(datalinks_payload)
                    if ignore_checksums or r.get('datalinks_checksum', None) != datalinks_checksum:
                        links_data_records.append(datalinks_payload)
                        links_data_records_checksum.append(datalinks_checksum)
        else:
            # if forced and we have at least the bib data, index it
            if force is True:
                logger.warn('%s is missing bib data, even with force=True, this cannot proceed', bibcode)
            else:
                logger.debug('%s not ready for indexing yet (metadata=%s, orcid=%s, nonbib=%s, fulltext=%s, metrics=%s, augments=%s)' % (bibcode, bib_data_updated, orcid_claims_updated, nonbib_data_updated, fulltext_updated, metrics_updated, augments_updated))

    # fan out to the three asynchronous indexing tasks; each receives its
    # payloads plus the matching checksums to store on success
    if solr_records:
        task_index_solr.apply_async(args=(solr_records, solr_records_checksum,), kwargs={'commit': commit, 'solr_targets': solr_targets, 'update_processed': update_processed})
    if metrics_records:
        task_index_metrics.apply_async(args=(metrics_records, metrics_records_checksum,), kwargs={'update_processed': update_processed})
    if links_data_records:
        task_index_data_links_resolver.apply_async(args=(links_data_records, links_data_records_checksum,), kwargs={'update_processed': update_processed})
def task_index_records(bibcodes, force=False, update_solr=True, update_metrics=True, commit=False):
    """
    This task is (normally) called by the cronjob task
    (that one, quite obviously, is in turn started by cron)

    Receives the bibcode of a document that was updated.
    (note: we could have sent the full record however we don't
    do it because the messages might be delayed and we can have
    multiple workers updating the same record; so we want to
    look into the database and get the most recent version)

    Receives bibcodes and checks the database if we have all the
    necessary pieces to push to solr. If not, then postpone and
    push later.

    We consider a record to be 'ready' if those pieces were updated
    (and were updated later than the last 'processed' timestamp):

        - bib_data
        - nonbib_data
        - orcid_claims

    'fulltext' is not considered essential; but updates to fulltext will
    trigger a solr_update (so it might happen that a document will get
    indexed twice; first with only metadata and later on incl fulltext)
    """
    if not (update_solr or update_metrics):
        raise Exception('Hmmm, I dont think I let you do NOTHING, sorry!')

    logger.debug('Running index-records for: %s', bibcodes)
    batch = []          # solr documents to push
    batch_insert = []   # metrics rows for never-processed records
    batch_update = []   # metrics rows for already-processed records

    #check if we have complete record
    for bibcode in bibcodes:
        r = app.get_record(bibcode)

        if r is None:
            logger.error('The bibcode %s doesn\'t exist!', bibcode)
            continue

        bib_data_updated = r.get('bib_data_updated', None)
        orcid_claims_updated = r.get('orcid_claims_updated', None)
        nonbib_data_updated = r.get('nonbib_data_updated', None)
        fulltext_updated = r.get('fulltext_updated', None)
        metrics_updated = r.get('metrics_updated', None)

        # 1972 epoch guarantees any real update timestamp looks newer
        year_zero = '1972'
        processed = r.get('processed', adsputils.get_date(year_zero))
        if processed is None:
            # It was never sent to Solr
            processed = adsputils.get_date(year_zero)

        is_complete = all([bib_data_updated, orcid_claims_updated, nonbib_data_updated])

        if is_complete or (force is True and bib_data_updated):
            # every essential piece predates the last run -> nothing new
            if force is False and all([
                    bib_data_updated and bib_data_updated < processed,
                    orcid_claims_updated and orcid_claims_updated < processed,
                    nonbib_data_updated and nonbib_data_updated < processed
                    ]):
                logger.debug('Nothing to do for %s, it was already indexed/processed', bibcode)
                continue

            if force:
                logger.warn('Forced indexing of: %s (metadata=%s, orcid=%s, nonbib=%s, fulltext=%s, metrics=%s)' % \
                    (bibcode, bib_data_updated, orcid_claims_updated, nonbib_data_updated, fulltext_updated, \
                     metrics_updated))

            # build the solr record
            if update_solr:
                d = solr_updater.transform_json_record(r)
                logger.debug('Built SOLR: %s', d)
                batch.append(d)

            # get data for metrics
            if update_metrics:
                m = r.get('metrics', None)
                if m:
                    m['bibcode'] = bibcode
                    logger.debug('Got metrics: %s', m)
                    # 'processed' present -> metrics row already exists
                    if r.get('processed'):
                        batch_update.append(m)
                    else:
                        batch_insert.append(m)
        else:
            # if forced and we have at least the bib data, index it
            if force is True:
                logger.warn('%s is missing bib data, even with force=True, this cannot proceed', bibcode)
            else:
                logger.debug('%s not ready for indexing yet', bibcode)

    failed_bibcodes = None
    if len(batch):
        failed_bibcodes = app.reindex(batch, app.conf.get('SOLR_URLS'), commit=commit)

    if failed_bibcodes and len(failed_bibcodes):
        logger.warn('Some bibcodes failed: %s', failed_bibcodes)
        failed_bibcodes = set(failed_bibcodes)

        # when solr_urls > 1, some of the servers may have successfully indexed
        # but here we are refusing to pass data to metrics db; this seems the
        # right choice because there is only one metrics db (but if we had many,
        # then we could differentiate)
        # NOTE(review): filter() returns a list only on Python 2; under
        # Python 3 the len() checks below would fail — confirm target runtime
        batch_insert = filter(lambda x: x['bibcode'] not in failed_bibcodes, batch_insert)
        batch_update = filter(lambda x: x['bibcode'] not in failed_bibcodes, batch_update)

    if len(batch_insert) or len(batch_update):
        app.update_metrics_db(batch_insert, batch_update)
def test_solr_transformer(self):
    """Makes sure we can write recs into the storage."""
    # --- seed the storage with every pipeline payload for one record ---
    self.app.update_storage('bibcode', 'metadata', {u'abstract': u'abstract text',
        u'aff': [u'-', u'-', u'-', u'-'],
        u'alternate_bibcode': [u'2003adass..12..283B'],
        u'author': [u'Blecksmith, E.', u'Paltani, S.', u'Rots, A.', u'Winkelman, S.'],
        u'author_count': 4,
        u'author_facet': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        u'author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.', u'0/Paltani, S', u'1/Paltani, S/Paltani, S.', u'0/Rots, A', u'1/Rots, A/Rots, A.', u'0/Winkelman, S', u'1/Winkelman, S/Winkelman, S.'],
        u'author_norm': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        u'bibcode': u'2003ASPC..295..283B',
        u'bibgroup': [u'CXC', u'CfA'],
        u'bibgroup_facet': [u'CXC', u'CfA'],
        u'bibstem': [u'ASPC', u'ASPC..295'],
        u'bibstem_facet': u'ASPC',
        u'database': [u'astronomy'],
        u'date': u'2003-01-01T00:00:00.000000Z',
        u'doctype': u'inproceedings',
        u'doctype_facet_hier': [u'0/Article', u'1/Article/Proceedings Article'],
        u'email': [u'-', u'-', u'-', u'-'],
        u'first_author': u'Blecksmith, E.',
        u'first_author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.'],
        u'first_author_norm': u'Blecksmith, E',
        u'id': u'1401492',
        u'identifier': [u'2003adass..12..283B'],
        u'links_data': u'',  ### TODO(rca): superconfusing string, but fortunately we are getting ridd of it
        u'orcid_pub': [u'-', u'-', u'-', u'-'],
        u'page': [u'283'],
        u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'],
        u'pub': u'Astronomical Data Analysis Software and Systems XII',
        u'pub_raw': u'Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. Hook, eds., p.283',
        u'pubdate': u'2003-00-00',
        u'title': [u'Chandra Data Archive Download and Usage Database'],
        u'volume': u'295',
        u'year': u'2003'})
    self.app.update_storage('bibcode', 'fulltext', 'fulltext')
    # note: metrics payload deliberately carries a different bibcode value
    self.app.update_storage('bibcode', 'metrics', {"downloads": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 1, 2],
        "bibcode": "2003ASPC..295..361M",
        "reads": [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 2, 5, 1, 0, 0, 1, 0, 0, 2, 4, 5],
        "author_num": 2})
    self.app.update_storage('bibcode', 'orcid_claims', {'authors': ['Blecksmith, E.', 'Paltani, S.', 'Rots, A.', 'Winkelman, S.'],
        'bibcode': '2003ASPC..295..283B',
        'unverified': ['-', '-', '0000-0003-2377-2356', '-']})
    self.app.update_storage('bibcode', 'nonbib_data', {u'authors': [u'Zaus, E', u'Tedde, S', u'Fuerst, J', u'Henseler, D', u'Doehler, G'],
        u'bibcode': u'2007JAP...101d4501Z',
        u'boost': 0.1899999976158142,
        u'citation_count': 6,
        u'citations': [u'2007ApPhL..91g1118P', u'2010ApPhA..99..805K', u'2011TSF...520..610L', u'2012NatCo...3E1175B', u'2014IPTL...26..305A', u'2016ITED...63..197G'],
        u'downloads': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        u'id': 7862455,
        u'norm_cites': 4225,
        u'reads': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 6, 2, 1, 0, 0, 1, 0, 1, 0, 0],
        u'refereed': True,
        u'reference': [u'1977JAP....48.4729M', u'1981psd..book.....S', u'1981wi...book.....S', u'1986PhRvB..33.5545M', u'1987ApPhL..51..913T', u'1992Sci...258.1474S', u'1994IJMPB...8..237S', u'1995Natur.376..498H', u'1995Sci...270.1789Y', u'1998TSF...331...76O', u'1999Natur.397..121F', u'2000JaJAP..39...94P', u'2002ApPhL..81.3885S', u'2004ApPhL..85.3890C', u'2004TSF...451..105S', u'2005PhRvB..72s5208M', u'2006ApPhL..89l3505L']})

    # --- the SOLR transform should merge all pieces into one document ---
    rec = self.app.get_record('bibcode')
    self.assertDictContainsSubset({u'abstract': u'abstract text',
        u'aff': [u'-', u'-', u'-', u'-'],
        u'alternate_bibcode': [u'2003adass..12..283B'],
        u'author': [u'Blecksmith, E.', u'Paltani, S.', u'Rots, A.', u'Winkelman, S.'],
        u'author_count': 4,
        u'author_facet': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        u'author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.', u'0/Paltani, S', u'1/Paltani, S/Paltani, S.', u'0/Rots, A', u'1/Rots, A/Rots, A.', u'0/Winkelman, S', u'1/Winkelman, S/Winkelman, S.'],
        u'author_norm': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        'bibcode': u'2003ASPC..295..283B',
        u'bibgroup': [u'CXC', u'CfA'],
        u'bibgroup_facet': [u'CXC', u'CfA'],
        u'bibstem': [u'ASPC', u'ASPC..295'],
        u'bibstem_facet': u'ASPC',
        'body': u'fulltext',
        'citation': [u'2007ApPhL..91g1118P', u'2010ApPhA..99..805K', u'2011TSF...520..610L', u'2012NatCo...3E1175B', u'2014IPTL...26..305A', u'2016ITED...63..197G'],
        'citation_count': 6,
        'cite_read_boost': 0.1899999976158142,
        u'database': [u'astronomy'],
        u'date': u'2003-01-01T00:00:00.000000Z',
        u'doctype': u'inproceedings',
        u'doctype_facet_hier': [u'0/Article', u'1/Article/Proceedings Article'],
        u'email': [u'-', u'-', u'-', u'-'],
        u'first_author': u'Blecksmith, E.',
        u'first_author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.'],
        u'first_author_norm': u'Blecksmith, E',
        'id': u'1401492',
        u'identifier': [u'2003adass..12..283B'],
        u'links_data': u'',
        'orcid_other' : [u'-', u'-', u'0000-0003-2377-2356', u'-'],
        u'orcid_pub': [u'-', u'-', u'-', u'-'],
        u'page': [u'283'],
        u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'],
        u'pub': u'Astronomical Data Analysis Software and Systems XII',
        u'pub_raw': u'Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. Hook, eds., p.283',
        u'pubdate': u'2003-00-00',
        'reference': [u'1977JAP....48.4729M', u'1981psd..book.....S', u'1981wi...book.....S', u'1986PhRvB..33.5545M', u'1987ApPhL..51..913T', u'1992Sci...258.1474S', u'1994IJMPB...8..237S', u'1995Natur.376..498H', u'1995Sci...270.1789Y', u'1998TSF...331...76O', u'1999Natur.397..121F', u'2000JaJAP..39...94P', u'2002ApPhL..81.3885S', u'2004ApPhL..85.3890C', u'2004TSF...451..105S', u'2005PhRvB..72s5208M', u'2006ApPhL..89l3505L'],
        u'title': [u'Chandra Data Archive Download and Usage Database'],
        u'volume': u'295',
        u'year': u'2003'},
        solr_updater.transform_json_record(rec))

    # --- all mtime fields should reflect the per-pipeline timestamps ---
    for x in Records._date_fields:
        if x in rec:
            rec[x] = get_date('2017-09-19T21:17:12.026474+00:00')
    x = solr_updater.transform_json_record(rec)
    for f in ('metadata_mtime', 'fulltext_mtime', 'orcid_mtime', 'nonbib_mtime', 'metrics_mtime', 'update_timestamp'):
        self.assertEquals(x[f], '2017-09-19T21:17:12.026474Z')

    # a newer orcid claim should bump only orcid_mtime and update_timestamp
    rec['orcid_claims_updated'] = get_date('2017-09-20T21:17:12.026474+00:00')
    x = solr_updater.transform_json_record(rec)
    for f in ('metadata_mtime', 'fulltext_mtime', 'orcid_mtime', 'nonbib_mtime', 'metrics_mtime', 'update_timestamp'):
        if f == 'update_timestamp' or f == 'orcid_mtime':
            self.assertEquals(x[f], '2017-09-20T21:17:12.026474Z')
        else:
            self.assertEquals(x[f], '2017-09-19T21:17:12.026474Z')
def test_solr_transformer(self):
    """Makes sure we can write recs into the storage."""
    # --- seed the storage with every pipeline payload for one record ---
    self.app.update_storage('bibcode', 'metadata', {u'abstract': u'abstract text',
        u'aff': [u'-', u'-', u'-', u'-'],
        u'alternate_bibcode': [u'2003adass..12..283B'],
        u'author': [u'Blecksmith, E.', u'Paltani, S.', u'Rots, A.', u'Winkelman, S.'],
        u'author_count': 4,
        u'author_facet': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        u'author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.', u'0/Paltani, S', u'1/Paltani, S/Paltani, S.', u'0/Rots, A', u'1/Rots, A/Rots, A.', u'0/Winkelman, S', u'1/Winkelman, S/Winkelman, S.'],
        u'author_norm': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        u'bibcode': u'2003ASPC..295..283B',
        u'bibgroup': [u'bibCXC', u'CfA'],
        u'bibgroup_facet': [u'bibCXC', u'CfA'],
        u'bibstem': [u'ASPC', u'ASPC..295'],
        u'bibstem_facet': u'ASPC',
        u'database': [u'astronomy'],
        u'date': u'2003-01-01T00:00:00.000000Z',
        u'doctype': u'inproceedings',
        u'doctype_facet_hier': [u'0/Article', u'1/Article/Proceedings Article'],
        u'editor': [u'Testeditor, Z.'],
        u'email': [u'-', u'-', u'-', u'-'],
        u'first_author': u'Blecksmith, E.',
        u'first_author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.'],
        u'first_author_norm': u'Blecksmith, E',
        u'id': u'1401492',
        u'identifier': [u'2003adass..12..283B'],
        u'links_data': u'',  ### TODO(rca): superconfusing string, but fortunately we are getting ridd of it
        u'orcid_pub': [u'-', u'-', u'-', u'-'],
        u'page': [u'283'],
        #u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'],
        u'pub': u'Astronomical Data Analysis Software and Systems XII',
        u'pub_raw': u'Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. Hook, eds., p.283',
        u'pubdate': u'2003-00-00',
        u'title': [u'Chandra Data Archive Download and Usage Database'],
        u'volume': u'295',
        u'year': u'2003'})
    self.app.update_storage('bibcode', 'fulltext', {'body': 'texttext', 'acknowledgements': 'aaa', 'dataset': ['a', 'b', 'c'], 'facility': ['fac1', 'fac2', 'fac3']})
    self.app.update_storage('bibcode', 'metrics', {"downloads": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 1, 2],
        "bibcode": "2003ASPC..295..361M",
        "reads": [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 2, 5, 1, 0, 0, 1, 0, 0, 2, 4, 5],
        "author_num": 2})
    self.app.update_storage('bibcode', 'orcid_claims', {'authors': ['Blecksmith, E.', 'Paltani, S.', 'Rots, A.', 'Winkelman, S.'],
        'bibcode': '2003ASPC..295..283B',
        'unverified': ['-', '-', '0000-0003-2377-2356', '-']})
    # second metrics write replaces the first payload for this record
    self.app.update_storage('bibcode', 'metrics', {u'citation_num': 6,
        u'citations': [u'2007ApPhL..91g1118P', u'2010ApPhA..99..805K', u'2011TSF...520..610L', u'2012NatCo...3E1175B', u'2014IPTL...26..305A', u'2016ITED...63..197G']})
    self.app.update_storage('bibcode', 'nonbib_data', {u'authors': [u'Zaus, E', u'Tedde, S', u'Fuerst, J', u'Henseler, D', u'Doehler, G'],
        u'bibcode': u'2007JAP...101d4501Z',
        u'bibgroup': [u'CXC', u'CfA'],
        u'bibgroup_facet': [u'CXC', u'CfA'],
        u'boost': 0.1899999976158142,
        u'data': [u'MAST:3', u'SIMBAD:1'],
        u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'],
        u'downloads': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        u'id': 7862455,
        u'norm_cites': 4225,
        u'reads': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 6, 2, 1, 0, 0, 1, 0, 1, 0, 0],
        u'refereed': True,
        u'reference': [u'1977JAP....48.4729M', u'1981psd..book.....S', u'1981wi...book.....S', u'1986PhRvB..33.5545M', u'1987ApPhL..51..913T', u'1992Sci...258.1474S', u'1994IJMPB...8..237S', u'1995Natur.376..498H', u'1995Sci...270.1789Y', u'1998TSF...331...76O', u'1999Natur.397..121F', u'2000JaJAP..39...94P', u'2002ApPhL..81.3885S', u'2004ApPhL..85.3890C', u'2004TSF...451..105S', u'2005PhRvB..72s5208M', u'2006ApPhL..89l3505L'],
        u'simbad_objects': [u'2419335 sim', u'3111723 sim*'],
        u'ned_objects': [u'2419335 HII', u'3111723 ned*'],
        u'grants': [u'2419335 g', u'3111723 g*'],
        u'citation_count': 6,
        u'citation_count_norm': .2,
        })

    # --- without augment data, aff should come straight from bib data ---
    rec = self.app.get_record('bibcode')
    x = solr_updater.transform_json_record(rec)
    # self.assertFalse('aff' in x, 'virtual field should not be in solr output')
    self.assertTrue(x['aff'] == rec['bib_data']['aff'], 'solr record should include aff from bib data when augment is not available')
    self.assertFalse('aff_abbrev' in x, 'augment field should not be in solr record when augment is not available')

    self.app.update_storage('bibcode', 'augment',
        {u'aff': [u'augment pipeline aff', u'-', u'-', u'-'],
         u'aff_abbrev': [u'-', u'-', u'-', u'-'],
         u'aff_canonical': [u'-', u'-', u'-', u'-'],
         u'aff_facet': [u'-', u'-', u'-', u'-'],
         u'aff_facet_hier': [u'-', u'-', u'-', u'-'],
         u'aff_id': [u'-', u'-', u'-', u'-'],
         u'institution': [u'-', u'-', u'-', u'-']})

    # --- the merged document should prefer augment/nonbib over bib data ---
    rec = self.app.get_record('bibcode')
    self.assertDictContainsSubset({u'abstract': u'abstract text',
        u'ack': u'aaa',
        u'aff_abbrev': [u'-', u'-', u'-', u'-'],
        u'aff_canonical': [u'-', u'-', u'-', u'-'],
        u'aff_facet': [u'-', u'-', u'-', u'-'],
        u'aff_facet_hier': [u'-', u'-', u'-', u'-'],
        u'aff_id': [u'-', u'-', u'-', u'-'],
        u'institution': [u'-', u'-', u'-', u'-'],
        u'alternate_bibcode': [u'2003adass..12..283B'],
        u'author': [u'Blecksmith, E.', u'Paltani, S.', u'Rots, A.', u'Winkelman, S.'],
        u'author_count': 4,
        u'author_facet': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        u'author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.', u'0/Paltani, S', u'1/Paltani, S/Paltani, S.', u'0/Rots, A', u'1/Rots, A/Rots, A.', u'0/Winkelman, S', u'1/Winkelman, S/Winkelman, S.'],
        u'author_norm': [u'Blecksmith, E', u'Paltani, S', u'Rots, A', u'Winkelman, S'],
        'bibcode': u'2003ASPC..295..283B',
        u'bibgroup': [u'CXC', u'CfA'],
        u'bibgroup_facet': [u'CXC', u'CfA'],
        u'bibstem': [u'ASPC', u'ASPC..295'],
        u'bibstem_facet': u'ASPC',
        'body': u'texttext',
        'citation': [u'2007ApPhL..91g1118P', u'2010ApPhA..99..805K', u'2011TSF...520..610L', u'2012NatCo...3E1175B', u'2014IPTL...26..305A', u'2016ITED...63..197G'],
        'citation_count': 6,
        'citation_count_norm': .2,
        'cite_read_boost': 0.1899999976158142,
        u'data': [u'MAST:3', u'SIMBAD:1'],
        u'data_facet': [u'MAST', u'SIMBAD'],
        u'database': [u'astronomy'],
        #u'dataset': ['a', 'b', 'c'],
        u'date': u'2003-01-01T00:00:00.000000Z',
        u'doctype': u'inproceedings',
        u'doctype_facet_hier': [u'0/Article', u'1/Article/Proceedings Article'],
        u'editor': [u'Testeditor, Z.'],
        u'email': [u'-', u'-', u'-', u'-'],
        u'facility': ['fac1', 'fac2', 'fac3'],
        u'first_author': u'Blecksmith, E.',
        u'first_author_facet_hier': [u'0/Blecksmith, E', u'1/Blecksmith, E/Blecksmith, E.'],
        u'first_author_norm': u'Blecksmith, E',
        u'id': 1,  # from id in master database records table
        u'identifier': [u'2003adass..12..283B'],
        u'links_data': u'',
        'orcid_other' : [u'-', u'-', u'0000-0003-2377-2356', u'-'],
        u'orcid_pub': [u'-', u'-', u'-', u'-'],
        u'nedid': [u'2419335', u'3111723'],
        u'nedtype': [u'HII Region', u'Other'],
        u'ned_object_facet_hier': [u'0/HII Region', u'1/HII Region/2419335', u'0/Other', u'1/Other/3111723'],
        u'page': [u'283'],
        u'property': [u'OPENACCESS', u'ADS_OPENACCESS', u'ARTICLE', u'NOT REFEREED'],
        u'pub': u'Astronomical Data Analysis Software and Systems XII',
        u'pub_raw': u'Astronomical Data Analysis Software and Systems XII ASP Conference Series, Vol. 295, 2003 H. E. Payne, R. I. Jedrzejewski, and R. N. Hook, eds., p.283',
        u'pubdate': u'2003-00-00',
        u'read_count': 0,
        'reference': [u'1977JAP....48.4729M', u'1981psd..book.....S', u'1981wi...book.....S', u'1986PhRvB..33.5545M', u'1987ApPhL..51..913T', u'1992Sci...258.1474S', u'1994IJMPB...8..237S', u'1995Natur.376..498H', u'1995Sci...270.1789Y', u'1998TSF...331...76O', u'1999Natur.397..121F', u'2000JaJAP..39...94P', u'2002ApPhL..81.3885S', u'2004ApPhL..85.3890C', u'2004TSF...451..105S', u'2005PhRvB..72s5208M', u'2006ApPhL..89l3505L'],
        u'simbid': ['2419335', '3111723'],
        u'simbtype': [u'Other', u'Star'],
        u'simbad_object_facet_hier': [u'0/Other', u'1/Other/2419335', u'0/Star', u'1/Star/3111723'],
        u'title': [u'Chandra Data Archive Download and Usage Database'],
        u'volume': u'295',
        u'year': u'2003'},
        solr_updater.transform_json_record(rec))

    # --- all mtime fields should reflect the per-pipeline timestamps ---
    for x in Records._date_fields:
        if x in rec:
            rec[x] = get_date('2017-09-19T21:17:12.026474+00:00')
    x = solr_updater.transform_json_record(rec)
    for f in ('metadata_mtime', 'fulltext_mtime', 'orcid_mtime', 'nonbib_mtime', 'metrics_mtime', 'update_timestamp'):
        self.assertEqual(x[f], '2017-09-19T21:17:12.026474Z')

    # a newer orcid claim should bump only orcid_mtime and update_timestamp
    rec['orcid_claims_updated'] = get_date('2017-09-20T21:17:12.026474+00:00')
    x = solr_updater.transform_json_record(rec)
    for f in ('metadata_mtime', 'fulltext_mtime', 'orcid_mtime', 'nonbib_mtime', 'metrics_mtime', 'update_timestamp'):
        if f == 'update_timestamp' or f == 'orcid_mtime':
            self.assertEqual(x[f], '2017-09-20T21:17:12.026474Z')
        else:
            self.assertEqual(x[f], '2017-09-19T21:17:12.026474Z')

    # --- augment and nonbib values must win over their bib counterparts ---
    rec = self.app.get_record('bibcode')
    x = solr_updater.transform_json_record(rec)
    self.assertTrue('aff' in x)  # aff is no longer a virtual field
    self.assertEqual(x['aff'], rec['augments']['aff'])  # solr record should prioritize aff data from augment
    self.assertEqual(x['aff_abbrev'], rec['augments']['aff_abbrev'])  # solr record should include augment data
    self.assertEqual(x['bibgroup'], rec['nonbib_data']['bibgroup'])
    self.assertEqual(x['bibgroup_facet'], rec['nonbib_data']['bibgroup_facet'])
def test_links_data_merge(self):
    """Verify how transform_json_record picks links_data between bib and nonbib."""
    transform = solr_updater.transform_json_record

    # only bib data present -> its links_data is carried through
    record = {'bibcode': 'foo',
              'bib_data': {'links_data': ['{"url": "http://asdf"}']},
              'bib_data_updated': datetime.now()}
    out = transform(record)
    self.assertEqual(record['bib_data']['links_data'], out['links_data'])

    # same scenario again (fresh timestamp) -> same outcome
    record = {'bibcode': 'foo',
              'bib_data': {'links_data': ['{"url": "http://asdf"}']},
              'bib_data_updated': datetime.now()}
    out = transform(record)
    self.assertEqual(record['bib_data']['links_data'], out['links_data'])

    # only nonbib data present -> its links_data is carried through
    record = {'bibcode': 'foo',
              'nonbib_data': {'links_data': 'asdf'},
              'nonbib_data_updated': datetime.now()}
    out = transform(record)
    self.assertEqual(record['nonbib_data']['links_data'], out['links_data'])

    # both present, nonbib older than bib -> nonbib still supplies links_data
    record = {'bibcode': 'foo',
              'bib_data': {'links_data': 'asdf'},
              'bib_data_updated': datetime.now(),
              'nonbib_data': {'links_data': 'jkl'},
              'nonbib_data_updated': datetime.now() - timedelta(1)}
    out = transform(record)
    self.assertEqual(record['nonbib_data']['links_data'], out['links_data'])

    # both present, nonbib newer than bib -> nonbib supplies links_data
    record = {'bibcode': 'foo',
              'bib_data': {'links_data': 'asdf'},
              'bib_data_updated': datetime.now() - timedelta(1),
              'nonbib_data': {'links_data': 'jkl'},
              'nonbib_data_updated': datetime.now()}
    out = transform(record)
    self.assertEqual(record['nonbib_data']['links_data'], out['links_data'])

    # an 'open' link should populate the access/esources derived fields
    record = {'bibcode': 'foo',
              'bib_data': {'links_data': ['{"url": "http://foo", "access": "open"}']},
              'bib_data_updated': datetime.now()}
    out = transform(record)
    self.assertTrue('ESOURCE' in out['property'])
    # verify all values are populated
    self.assertTrue('ARTICLE' in out['property'])
    self.assertTrue('NOT REFEREED' in out['property'])
    self.assertTrue('EPRINT_OPENACCESS' in out['property'])
    self.assertTrue('OPENACCESS' in out['property'])
    self.assertTrue('EPRINT_HTML' in out['esources'])
    self.assertTrue('EPRINT_PDF' in out['esources'])

    # a 'closed' link must not be flagged as an ESOURCE
    record = {'bibcode': 'foo',
              'bib_data': {'links_data': ['{"url": "http://foo", "access": "closed"}']},
              'bib_data_updated': datetime.now()}
    out = transform(record)
    self.assertTrue('ESOURCE' not in out['property'])

    # no links at all -> no property field in the solr document
    record = {'bibcode': 'foo',
              'bib_data': {},
              'bib_data_updated': datetime.now()}
    out = transform(record)
    self.assertTrue('property' not in out)