Exemplo n.º 1
0
def test_mosdef_only(testresourcepath, expected_modout1):
    '''
    Run a pipeline with only fingerprint rules (no transform or labelize
    rules) over INPUT_GRAPH_1, then check output size and resource counts.

    Fix: the original created and parsed the input model twice in a row;
    the first pass was dead work, removed here.
    '''
    modin = newmodel()
    literate.parse(INPUT_GRAPH_1, modin)

    FINGERPRINT_RULES = {
        # Only copy MusicAlbum resources whose byArtist points at the md doc resource
        SCH_NS('MusicAlbum'):
        (if_(contains(follow(SCH_NS('byArtist')), DOC_NS('md')),
             materialize(COPY()))),
        SCH_NS('Person'): (materialize(COPY())),
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, {}, {})

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('=' * 10, 'test_mosdef_only', '=' * 10)
    literate.write(modout)
    # import pprint; pprint.pprint(list(iter(modout)))

    assert len(modout) == 17
    assert len(
        list(util.all_origins(modout, only_types={SCH_NS('MusicAlbum')}))) == 1
    assert len(list(util.all_origins(modout,
                                     only_types={SCH_NS('Person')}))) == 3
Exemplo n.º 2
0
    def run(self, input_model=None, raw_source=None, output_model=None, **kwargs):
        '''
        Process an input, either an input Versa model or in some raw record format
        through a sequence of transform stages, to generate a versa model of output resources

        Caller must provide either an input_model or a raw_source, but can provide
        any combination of these, depending on the expectations of the defined stages

        Args:
            input_model: Versa model which serves as the starting point
            raw_source: raw input data, a possible optimization if it's impractical to directly
                represent as a Versa model, but can be interpreted by the stages as if it were
            output_model: optional output model, which might be provided to add transform results
                to existing data, or to use a specialized Versa model implementation
            kwargs: any additional parameters which are passed as they are to all the stages

        Returns:
            output_model: Same reference as the input output_model, if provided, otherwise a new
                model containing the results of the transform
        '''
        self.check_update_stages()

        # Fall back to fresh models when the caller didn't supply them
        self.input_model = input_model if input_model is not None else newmodel()
        self.output_model = output_model if output_model is not None else newmodel()

        self._raw_source = raw_source
        self.fingerprints = {}

        # Stages are (sortkey, callable) pairs; the sortkey only orders them
        for _, stage_func in self._stages:
            # A stage returning False signals the pipeline to abort early
            if stage_func(**kwargs) is False:
                break
        return self.output_model
Exemplo n.º 3
0
def test_basics_1(testresourcepath, expected_modout1):
    '''
    Schema.org book record through a BIBFRAME-ish pipeline, using a custom
    root context with a new-entity hook that tags each materialized resource;
    checks the tag count in the shared output model.
    '''
    modin = newmodel()
    modin_fpath = 'schemaorg/catcherintherye-ugly.md'
    literate.parse(
        open(os.path.join(testresourcepath, modin_fpath)).read(), modin)

    FINGERPRINT_RULES = {
        SCH_NS('Book'): materialize(
            BF_NS('Instance'),
            fprint=[
                (BF_NS('isbn'), follow(SCH_NS('isbn'))),
            ],
        )
    }

    TRANSFORM_RULES = {
        SCH_NS('name'): link(rel=BF_NS('name')),
        SCH_NS('author'): materialize(
            BF_NS('Person'),
            BF_NS('creator'),
            vars={
                'birthDate': follow(SCH_NS('authorBirthDate'),
                                    origin=var('input-resource'))
            },
            fprint=[
                (BF_NS('name'), target()),
                (BF_NS('birthDate'), var('birthDate')),
            ],
            links=[
                (BF_NS('name'), target()),
                (BF_NS('birthDate'), var('birthDate')),
            ],
        ),
    }

    modout = newmodel()

    def new_entity_hook(eid):
        # Tag each materialized resource so the test can count them below
        modout.add(eid, 'http://example.org/materializedBy', 'py.test')
        return

    ctxextras = {'@new-entity-hook': new_entity_hook}
    root_ctx = DUMMY_CONTEXT.copy(output_model=modout, extras=ctxextras)

    ppl = generic_pipeline(FINGERPRINT_RULES,
                           TRANSFORM_RULES,
                           LABELIZE_RULES,
                           root_ctx=root_ctx)

    ppl.run(input_model=modin, output_model=modout)
    # Use -s to see this
    print('=' * 10, 'test_basics_1', '=' * 10)
    literate.write(modout)

    assert len(
        list(modout.match(None, 'http://example.org/materializedBy',
                          None))) == 2
Exemplo n.º 4
0
def test_simpleobj_usecase1():
    '''
    Parse a mock object through a Versa literate template into a fresh model
    and check the single resulting link.

    Fix: the original assigned `m = newmodel()` twice; the first assignment
    was dead (immediately shadowed before use), removed here.
    '''
    tmpl = Template(
        '# http://example.org#{{ Wikidata }}\n\n * <http://example.org/voc/copyright>: {{ _["©"] }}'
    )
    # use -s option to see the nosy print
    m = newmodel()
    parse(objmock(), tmpl, m, nosy=print)

    assert len(m) == 1, repr(m)
    assert ('http://example.org#Q15761337', 'http://example.org/voc/copyright',
            '2016', {}) == next(m.match())
Exemplo n.º 5
0
def test_versa_syntax1(testresourcepath):
    '''
    Parse the full and the abbreviated form of the same Versa literate doc
    and verify both produce the same six links.
    '''
    config = {
        'autotype-h1': 'http://example.org/r1',
        'autotype-h2': 'http://example.org/r2',
        'interpretations': {
            VERSA_BASEIRI + 'refines': VERSA_BASEIRI + 'resourceset',
            VERSA_BASEIRI + 'properties': VERSA_BASEIRI + 'resourceset',
            VERSA_BASEIRI + 'synonyms': VERSA_BASEIRI + 'resourceset'
        }
    }

    # Parse each variant of the doc into its own model, same config for both
    equiv_results = []
    for fname in ('doc1.md', 'doc1.abbr.md'):
        model = newmodel(baseiri='http://example.org/')
        # from_markdown(VERSA_LITERATE1, m, encoding='utf-8')
        doc = open(os.path.join(testresourcepath, fname)).read()
        literate.parse(doc, model, config=config)
        equiv_results.append(list(model.match()))

    # logging.debug('VERSA LITERATE EXAMPLE 1')
    for results in equiv_results:
        # import pprint; pprint.pprint(results)
        assert len(results) == 6
        assert (I('http://uche.ogbuji.net/ndewo/'),
                I('http://bibfra.me/purl/versa/type'), 'http://example.org/r1',
                {}) in results
        assert (I('http://uche.ogbuji.net/ndewo/'),
                I('http://www.w3.org/TR/html5/title'), 'Ndewo, Colorado', {
                    '@lang': None
                }) in results
        assert (I('http://uche.ogbuji.net/ndewo/'),
                I('http://www.w3.org/TR/html5/link-type/author'),
                I('http://uche.ogbuji.net/'), {
                    I('http://www.w3.org/TR/html5/link/description'):
                    'Uche Ogbuji'
                }) in results
        assert (
            I('http://uche.ogbuji.net/ndewo/'),
            I('http://www.w3.org/TR/html5/link-type/see-also'),
            I('http://www.goodreads.com/book/show/18714145-ndewo-colorado'), {
                I('http://www.w3.org/TR/html5/link/label'): 'Goodreads'
            }) in results
        assert (I('http://uche.ogbuji.net/'),
                I('http://bibfra.me/purl/versa/type'), 'http://example.org/r1',
                {}) in results
        assert (I('http://uche.ogbuji.net/'),
                I('http://www.w3.org/TR/html5/link-type/see-also'),
                I('http://uche.ogbuji.net/ndewo/'), {}) in results
Exemplo n.º 6
0
def main(source):
    '''Transform CSV SOURCE file to BF Lite in Versa

    Parses each CSV row via the Versa literate template into one input model,
    runs the BIBFRAME pipeline, dumps the output in JSON and literate form,
    then renders a Mermaid diagram of one randomly-chosen Instance.

    Fix: corrected the garbled output message 'Diagram from extracted a
    sample' -> 'Diagram from an extracted sample'.
    '''
    ppl = csv_bibframe_pipeline()
    input_model = newmodel()
    with open(source) as csvfp:
        for row_model in csv.parse_iter(csvfp, VLITERATE_TEMPLATE):
            # parse_iter can yield falsy placeholders for unusable rows
            if row_model: input_model.update(row_model)

    # Debug print of input model
    # literate.write([input_model], out=sys.stdout)
    output_model = ppl.run(input_model=input_model)
    print('Low level JSON dump of output data model: ')
    util.jsondump(output_model, sys.stdout)
    print('\n')  # 2 CRs
    print('Versa literate form of output: ')
    literate.write(output_model, out=sys.stdout)

    print('Diagram from an extracted sample: ')
    # Flatten the fingerprint value lists into one list of output resources
    out_resources = [r for vs in ppl.fingerprints.values() for r in vs]
    ITYPE = BF_NS('Instance')
    instances = [
        r for r in out_resources
        if ITYPE in util.resourcetypes(output_model, r)
    ]
    # Zoom into one random Instance, two levels deep, for the diagram
    zoomed, _ = util.zoom_in(output_model, random.choice(instances), depth=2)
    mermaid.write(zoomed)
Exemplo n.º 7
0
def test_book_cases(label, transforms, asserter):
    '''
    Apply per-relationship transforms to each property of SIMPLE_BOOK, then
    delegate output-model checks to the supplied asserter callback.

    Args:
        label: case label (parametrization display only; not used in the body)
        transforms: dict mapping input relationship -> transform callable
        asserter: callback invoked with the populated output model

    Fix: removed the unused local `existing_ids`, which was created but
    never read or passed anywhere.
    '''
    idg = idgen(EXAMPLE_ORG)
    out_m = newmodel()

    rid = SIMPLE_BOOK['id']
    out_m.add(rid, VTYPE_REL, BOOK_TYPE)

    in_m = newmodel()
    for k, v in SIMPLE_BOOK.items():
        func = transforms.get(k)
        # Only properties with a registered transform produce output
        if func:
            ctx = context((rid, k, v, {}), in_m, out_m, base=SCHEMA_ORG, idgen=idg)
            func(ctx)

    asserter(out_m)
Exemplo n.º 8
0
def test_csv_usecase1():
    '''
    Parse a one-row CSV source via a literate template and check the single
    resulting link in the first yielded model.

    Fix: the original assigned `m = newmodel()` and then immediately rebound
    `m` from parse_iter; the dead first assignment is removed here.
    '''
    tmpl = '# http://example.org#{Wikidata}\n\n * <http://example.org/voc/copyright>: {%C2%A9}'
    # use -s option to see the nosy print
    m = next(parse_iter(object(), tmpl, csv_fact=csvmock, nosy=print))

    assert len(m) == 1, repr(m)
    assert ('http://example.org#Q15761337', 'http://example.org/voc/copyright', '2016', {}) == next(m.match())
Exemplo n.º 9
0
def Xtest_versa_syntax1():
    '''
    Disabled (X-prefixed) smoke test: parse VERSA_LITERATE1 into a fresh
    model and log every resulting link at debug level.
    '''
    # logging.debug(recs)
    model = newmodel()
    model.create_space()
    # from_markdown(VERSA_LITERATE1, m, encoding='utf-8')
    literate.parse(VERSA_LITERATE1, model)
    logging.debug('VERSA LITERATE EXAMPLE 1')
    for result_link in model.match():
        logging.debug('Result: {0}'.format(repr(result_link)))
Exemplo n.º 10
0
def test_parse1(ntrips_1):
    '''
    Parse the ntrips_1 N-Triples fixture and verify the expected creator and
    publisher targets are present.
    '''
    model = newmodel()
    parse(ntrips_1, model)
    assert len(model) == 3, repr(model)
    # Collect all targets for each relationship, then check membership
    creators = [t for (o, r, t, a) in model.match(NT_SPEC, DC_CREATOR)]
    assert 'Dave Beckett' in creators
    assert 'Art Barstow' in creators
    publishers = [t for (o, r, t, a) in model.match(NT_SPEC, DC_PUBLISHER)]
    assert W3C in publishers
Exemplo n.º 11
0
def main(source):
    'Transform CSV SOURCE file to Schema.org in Versa'
    pipeline = csv_schema_pipeline()
    in_model = newmodel()
    # Populate the input model straight from the CSV via the literate template
    with open(source) as csvfp:
        csv.parse(csvfp, VLITERATE_TEMPLATE, in_model)

    # Debug print of input model
    # literate.write([in_model], out=sys.stdout)
    out_model = pipeline.run(input_model=in_model)
    print('Resulting record Fingerprints:', pipeline.fingerprints)
    print('Low level JSON dump of output data model: ')
    util.jsondump(out_model, sys.stdout)
    print('Versa literate form of output: ')
    literate.write(out_model, out=sys.stdout)
Exemplo n.º 12
0
def test_basics_1(testresourcepath, expected_modout1):
    '''
    Schema.org book record through a BIBFRAME-ish pipeline; checks output
    size, resource counts by type, and the propagated birthDate link.
    '''
    modin = newmodel()
    modin_fpath = 'schemaorg/catcherintherye-ugly.md'
    literate.parse(open(os.path.join(testresourcepath, modin_fpath)).read(), modin)

    FINGERPRINT_RULES = {
        SCH_NS('Book'): materialize(
            BF_NS('Instance'),
            fprint=[
                (BF_NS('isbn'), follow(SCH_NS('isbn'))),
            ],
        )
    }

    TRANSFORM_RULES = {
        SCH_NS('name'): link(rel=BF_NS('name')),

        SCH_NS('author'): materialize(
            BF_NS('Person'),
            BF_NS('creator'),
            vars={
                'birthDate': follow(SCH_NS('authorBirthDate'),
                                    origin=var('input-resource')),
            },
            fprint=[
                (BF_NS('name'), target()),
                (BF_NS('birthDate'), var('birthDate')),
            ],
            links=[
                (BF_NS('name'), target()),
                (BF_NS('birthDate'), var('birthDate')),
            ],
        ),
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES, LABELIZE_RULES)

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('=' * 10, 'test_basics_1', '=' * 10)
    literate.write(modout)

    assert len(modout) == 8
    assert len(list(util.all_origins(modout, only_types={BF_NS('Instance')}))) == 1
    assert len(list(util.all_origins(modout, only_types={BF_NS('Person')}))) == 1
    assert len(list(modout.match(None, BF_NS('birthDate'), '1919-01-01'))) == 1
Exemplo n.º 13
0
def expected_modout1():
    '''
    Fixture: currently returns an empty model; the expected-output literate
    text remains to be filled in.
    '''
    expected = newmodel()
    # literate.parse('''
    # ''', expected)
    return expected
Exemplo n.º 14
0
def test_basics_2(testresourcepath):
    '''
    Schema.org book record through a pipeline whose fingerprint rule
    materializes both an Instance (type supplied via the 'itype' var) and a
    nested, linked Work; transform rules are scoped by output resource type.
    Checks output size, per-type resource counts, and the birthDate link.
    '''
    modin = newmodel()
    modin_fpath = 'schemaorg/catcherintherye-ugly.md'
    literate.parse(open(os.path.join(testresourcepath, modin_fpath)).read(), modin)

    FINGERPRINT_RULES = {
        SCH_NS('Book'): ( 
            # Output type is indirect: var('itype') is bound below in vars
            materialize(var('itype'),
                fprint=[
                    (BF_NS('isbn'), follow(SCH_NS('isbn'))),
                ],
                links=[
                    (BF_NS('instantiates'),
                        # Nested materialize: the Instance links to a Work
                        materialize(BF_NS('Work'),
                            fprint=[
                                (BF_NS('name'), follow(SCH_NS('title'))),
                                (BF_NS('creator'), follow(SCH_NS('author'))),
                                (BF_NS('language'), var('lang')),
                            ],
                            # '@stem' refers back to the outer materialized resource
                            links=[('http://instantiated-by', var('@stem'))],
                            attach=False # Can remove when we have smart sessions to avoid duplicate instantiates links
                        ),
                    )
                ],
                # Not really necessary; just testing vars in this scenario
                vars={
                    'lang': follow(SCH_NS('inLanguage')),
                    'itype': BF_NS('Instance')
                    }
            )
        )
    }

    TRANSFORM_RULES = {
        # Rule for output resource type of Work or Instance
        (SCH_NS('name'), WT, IT): link(rel=BF_NS('name')),

        # Rule only for output resource type of Work
        (SCH_NS('author'), WT): materialize(BF_NS('Person'),
                                    BF_NS('creator'),
                                    vars={
                                        'birthDate': follow(SCH_NS('authorBirthDate'),
                                            origin=var('input-resource'))
                                    },
                                    fprint=[
                                        # Supplementary type
                                        (VTYPE_REL, SCH_NS('Novelist')),
                                        (BF_NS('name'), target()),
                                        (BF_NS('birthDate'), var('birthDate')),
                                    ],
                                    links=[
                                        # Supplementary type
                                        (VTYPE_REL, SCH_NS('Novelist')),
                                        (BF_NS('name'), target()),
                                        (BF_NS('birthDate'), var('birthDate')),
                                    ],
                                    # Keep fingerprint links in the output as well
                                    preserve_fprint=True,
        ),
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES, LABELIZE_RULES)

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('='*10, 'test_basics_2', '='*10)
    literate.write(modout)
    #import pprint; pprint.pprint(list(iter(modout)))

    assert len(modout) == 15
    assert len(list(util.all_origins(modout, only_types={BF_NS('Instance')}))) == 1
    assert len(list(util.all_origins(modout, only_types={BF_NS('Work')}))) == 1
    assert len(list(util.all_origins(modout, only_types={BF_NS('Person')}))) == 1
    assert len(list(modout.match(None, BF_NS('birthDate'), '1919-01-01'))) == 1
Exemplo n.º 15
0
def test_basics_4(testresourcepath):
    '''
    Convert from schema.org to [MusicBrainz scheme](https://musicbrainz.org/doc/MusicBrainz_Database/Schema)

    Fix: the original repeated the ReleaseGroup count assertion twice
    (copy-paste); the duplicate now checks the materialized Release instead.
    '''
    import sys # Uncomment to debug
    MB_NS = I('https://musicbrainz.org/doc/MusicBrainz_Database/Schema/')
    R_TYP = MB_NS('Release')
    RG_TYP = MB_NS('ReleaseGroup')
    A_TYP = MB_NS('Artist')
    DOC_NS = I('http://example.org/records/')

    modin = newmodel()
    modin_fpath = 'schemaorg/blackstar.md'
    literate.parse(open(os.path.join(testresourcepath, modin_fpath)).read(), modin)
    # Hand-add a comment property to the Mos Def resource to test that this value doesn't bleed e.g. to Kweli's output
    modin.add(DOC_NS('md'), SCH_NS('comment'), 'test')

    FINGERPRINT_RULES = {
        SCH_NS('MusicAlbum'): ( 
            materialize(MB_NS('ReleaseGroup'),
                fprint=[
                    (MB_NS('title'), follow(SCH_NS('name'))),
                    (MB_NS('artist'), follow(SCH_NS('byArtist'), SCH_NS('name'))),
                ],
                links=[
                    # Each ReleaseGroup also materializes a contained Release
                    (MB_NS('contains'), materialize(MB_NS('Release'),
                        fprint=[
                            (MB_NS('catalogue-number'), var('catnum')),
                        ],
                        links=[
                            (MB_NS('catalogue-number'), var('catnum')),
                        ]
                    ))
                ],
                vars={'catnum': follow(SCH_NS('catalogNumber'))},
                # debug=sys.stderr, # Uncomment to debug
            )
        ),

        SCH_NS('Person'): ( 
            materialize(MB_NS('Artist'),
                fprint=[
                    (MB_NS('name'), var('aname')),
                ],
                links=[
                    (MB_NS('name'), var('aname')),
                    (MB_NS('remark'), var('comment')),
                ],
                vars={'aname': follow(SCH_NS('name')), 'comment': follow(SCH_NS('comment'))},
            )
        )
    }

    TRANSFORM_RULES = {
        (SCH_NS('name'), R_TYP, RG_TYP): link(rel=MB_NS('title')),

        (SCH_NS('byArtist'), R_TYP): link(rel=MB_NS('by'), target=lookup('@resource')),
    }

    # Intentionally shadows the global LABELIZE_RULES
    LABELIZE_RULES = {
        MB_NS('ReleaseGroup'): follow(MB_NS('title')),
        MB_NS('Release'): follow(MB_NS('title')),
        MB_NS('Artist'): follow(MB_NS('name'))
    }

    ppl = generic_pipeline(FINGERPRINT_RULES, TRANSFORM_RULES, LABELIZE_RULES)

    modout = ppl.run(input_model=modin)
    # Use -s to see this
    print('='*10, 'test_basics_4', '='*10)
    literate.write(modout)
    # import pprint; pprint.pprint(list(iter(modout)))

    assert len(modout) == 16
    assert len(list(util.all_origins(modout, only_types={MB_NS('ReleaseGroup')}))) == 1
    # Was a duplicate of the ReleaseGroup check; one Release is materialized per group
    assert len(list(util.all_origins(modout, only_types={MB_NS('Release')}))) == 1
    assert len(list(util.all_origins(modout, only_types={MB_NS('Artist')}))) == 2
    # assert len(list(modout.match(None, BF_NS('birthDate'), '1919-01-01'))) == 1
    # DOC_NS('md') -> I('i5GvPVm7ClA') in the transform
    assert [ l[0] for l in modout.match(None, MB_NS('remark'), 'test')] == [I('i5GvPVm7ClA')]
Exemplo n.º 16
0
    def labelize(self):
        '''
        Executes a utility rule to create labels in output model for new resources

        Returns:
            True, so the pipeline stage sequence continues
        '''
        # XXX Check if there's already a label?
        # Apply a common transform strategy using rules defined above
        def missed_label(origin, type):
            '''
            Callback to handle cases where a transform wasn't found to match a link (by relationship) in the input model
            '''
            warnings.warn(f'No label generated for: {origin}')
        # Fix: return value was bound to an unused local `labels`; drop the binding
        self.labelize_helper(LABELIZE_RULES, handle_misses=missed_label)
        return True


if __name__ == '__main__':
    # Run each sample record through a fresh pipeline and dump the results
    for record in INPUT_RECORDS:
        pipeline = dc_schema_pipeline()
        in_model = newmodel()
        literate.parse(record, in_model)
        out_model = pipeline.run(input_model=in_model)
        print('Resulting record Fingerprints:', pipeline.fingerprints)
        print('Low level JSON dump of output data model: ')
        util.jsondump(out_model, sys.stdout)
        print('Versa literate form of output: ')
        literate.write(out_model, out=sys.stdout)
        # from versa.serial import mermaid
        # print('Mermaid diagram form of output: ')
        # mermaid.write(out_model, out=sys.stdout)