예제 #1
0
def index_stream(collection_name,
                 profile,
                 shards,
                 after,
                 solr='http://localhost:8983/solr/'):
    """
    Follows the live Wikidata edit stream and keeps the given Solr
    collection up to date, indexing edits according to the supplied
    indexing profile.

    :param collection_name: name of the Solr collection to update
    :param profile: path to the indexing profile to load
    :param shards: number of shards to use if the collection is created
    :param after: optional timestamp string; only edits after this time
        are consumed (parsed with dateutil)
    :param solr: base URL of the Solr instance
    """
    factory = TaggerFactory(solr)
    loaded_profile = IndexingProfile.load(profile)
    # Create the target collection; it is fine if it already exists.
    try:
        factory.create_collection(collection_name,
                                  num_shards=shards,
                                  configset=loaded_profile.solrconfig)
    except CollectionAlreadyExists:
        pass
    start_time = dateutil.parser.parse(after) if after is not None else None
    edit_stream = WikidataStreamReader(from_time=start_time)
    # Small batches and frequent commits keep the index close to real time.
    factory.index_stream(collection_name,
                         edit_stream,
                         loaded_profile,
                         batch_size=50,
                         commit_time=1,
                         delete_excluded=True)
예제 #2
0
def index_dump(collection_name,
               filename,
               profile,
               shards,
               skip,
               solr='http://localhost:8983/solr/'):
    """
    Indexes a Wikidata dump in a new Solr collection with the given name.

    :param collection_name: name of the Solr collection to (re)use
    :param filename: path to the Wikidata JSON dump
    :param profile: path to the indexing profile to load
    :param shards: number of shards to use if the collection is created
    :param skip: number of documents to skip at the start of the dump
    :param solr: base URL of the Solr instance
    """
    factory = TaggerFactory(solr)
    loaded_profile = IndexingProfile.load(profile)
    # Create the target collection; it is fine if it already exists.
    try:
        factory.create_collection(collection_name,
                                  num_shards=shards,
                                  configset=loaded_profile.solrconfig)
    except CollectionAlreadyExists:
        pass
    dump_reader = WikidataDumpReader(filename)
    # Bulk indexing: large batches and infrequent commits for throughput.
    factory.index_stream(collection_name,
                         dump_reader,
                         loaded_profile,
                         batch_size=2000,
                         commit_time=10,
                         delete_excluded=False,
                         skip_docs=skip)
예제 #3
0
def index_sparql(collection_name,
                 sparql_query_file,
                 profile,
                 shards,
                 solr='http://localhost:8983/solr/'):
    """
    Indexes the results of a SPARQL query which contains an "item" variable
    pointing to items to index.

    :param collection_name: name of the Solr collection to (re)use
    :param sparql_query_file: path to a file holding the SPARQL query text
    :param profile: path to the indexing profile to load
    :param shards: number of shards to use if the collection is created
    :param solr: base URL of the Solr instance
    """
    factory = TaggerFactory(solr)
    loaded_profile = IndexingProfile.load(profile)
    # Create the target collection; it is fine if it already exists.
    try:
        factory.create_collection(collection_name,
                                  num_shards=shards,
                                  configset=loaded_profile.solrconfig)
    except CollectionAlreadyExists:
        pass
    with open(sparql_query_file, 'r') as query_file:
        sparql_query = query_file.read()
    result_stream = SparqlReader(sparql_query)
    factory.index_stream(collection_name,
                         result_stream,
                         loaded_profile,
                         batch_size=50,
                         commit_time=10,
                         delete_excluded=False)
예제 #4
0
def test_load_indexing_profile(testdir, expected_json):
    """Loading the fixture profile exposes its fields and round-trips to JSON."""
    profile_path = os.path.join(testdir, 'data', 'indexing_profile.json')
    profile = IndexingProfile.load(profile_path)

    assert profile.language == 'en'
    assert profile.name == 'affiliations'
    assert profile.restrict_properties == ['P2427', 'P1566', 'P496']
    assert profile.json() == expected_json
예제 #5
0
def test_all_items_profile(testdir):
    """Every item in the sample dump yields a document under the all-items profile."""
    profile = IndexingProfile.load(
        os.path.join(testdir, 'data/all_items_profile.json'))
    matcher = TypeMatcherStub()
    dump_path = os.path.join(testdir, 'data/sample_wikidata_items.json.bz2')
    with WikidataDumpReader(dump_path) as dump:
        for entity in dump:
            assert profile.entity_to_document(entity, matcher) is not None
예제 #6
0
 def setUpClass(cls):
     """
     Prepare shared fixtures: test directory, Solr endpoint, tagger factory
     and a dummy indexing profile. Skips the whole test class when the local
     Solr instance is unreachable.
     """
     cls.testdir = os.path.dirname(os.path.abspath(__file__))
     cls.solr_endpoint = 'http://localhost:8983/solr/'
     cls.tf = TaggerFactory(cls.solr_endpoint)

     # Load dummy profile
     cls.profile = IndexingProfile.load(os.path.join(cls.testdir, 'data/all_items_profile.json'))

     # Skip entire test if solr is not running; the response itself is not
     # needed, only whether the request succeeds (the unused binding was removed).
     try:
         requests.get(cls.solr_endpoint)
     except requests.exceptions.RequestException:
         raise unittest.SkipTest('Solr is not running')
예제 #7
0
    def setUpClass(cls):
        """
        Build the full tagging fixture set shared by the tests: a bag-of-words
        language model, a pagerank-annotated Wikidata graph, an indexing
        profile, a populated Solr collection, a tagger over that collection,
        a NIF dataset and a classifier wired to the tagger.
        """
        cls.testdir = os.path.dirname(os.path.abspath(__file__))
        data_dir = os.path.join(cls.testdir, 'data')

        # Dummy bag-of-words language model
        cls.bow = BOWLanguageModel()
        cls.bow.load(os.path.join(data_dir, 'sample_bow.pkl'))

        # Dummy Wikidata graph with precomputed pagerank
        cls.graph = WikidataGraph()
        cls.graph.load_from_matrix(
            os.path.join(data_dir, 'sample_wikidata_items.npz'))
        cls.graph.load_pagerank(
            os.path.join(data_dir, 'sample_wikidata_items.pgrank.npy'))

        # Dummy indexing profile
        cls.profile = IndexingProfile.load(
            os.path.join(data_dir, 'all_items_profile.json'))

        # Setup solr index (TODO delete this) and tagger
        cls.tf = TaggerFactory()
        cls.collection_name = 'wd_test_collection'
        try:
            cls.tf.create_collection(cls.collection_name)
        except CollectionAlreadyExists:
            pass
        sample_dump = WikidataDumpReader(
            os.path.join(data_dir, 'sample_wikidata_items.json.bz2'))
        cls.tf.index_stream(cls.collection_name, sample_dump, cls.profile)
        cls.tagger = Tagger(cls.collection_name, cls.bow, cls.graph)

        # NIF dataset used as evaluation input
        cls.nif = NIFCollection.load(
            os.path.join(data_dir, 'five-affiliations.ttl'))

        cls.classifier = SimpleTagClassifier(cls.tagger,
                                             max_similarity_distance=10,
                                             similarity_smoothing=2)
예제 #8
0
    def setUpClass(cls):
        """
        Build the tagger fixture: a bag-of-words model, a pagerank-annotated
        graph, an indexing profile, and a freshly (re)created Solr collection
        populated from the sample dump.
        """
        # NOTE(review): calling tearDownClass from setUpClass looks like it is
        # meant as a pre-test cleanup of leftover state, but it may be a typo
        # for super().setUpClass() — confirm against the parent class.
        super(TaggerTest, cls).tearDownClass()
        testdir = os.path.dirname(os.path.abspath(__file__))

        # Load dummy bow
        bow_fname = os.path.join(testdir, 'data/sample_bow.pkl')
        cls.bow = BOWLanguageModel()
        cls.bow.load(bow_fname)

        # Load dummy graph
        graph_fname = os.path.join(testdir, 'data/sample_wikidata_items.npz')
        pagerank_fname = os.path.join(testdir,
                                      'data/sample_wikidata_items.pgrank.npy')
        cls.graph = WikidataGraph()
        cls.graph.load_from_matrix(graph_fname)
        cls.graph.load_pagerank(pagerank_fname)

        # Load indexing profile
        cls.profile = IndexingProfile.load(
            os.path.join(testdir, 'data/all_items_profile.json'))

        # Setup solr index: drop any stale collection, then recreate and fill it.
        # Consistency fix: use cls.collection_name everywhere instead of
        # repeating the hard-coded 'wd_test_collection' literal.
        cls.tf = TaggerFactory()
        cls.collection_name = 'wd_test_collection'
        try:
            cls.tf.delete_collection(cls.collection_name)
        except requests.exceptions.RequestException:
            # Best-effort cleanup: the collection may simply not exist yet.
            pass
        cls.tf.create_collection(cls.collection_name)
        cls.tf.index_stream(
            cls.collection_name,
            WikidataDumpReader(
                os.path.join(testdir, 'data/sample_wikidata_items.json.bz2')),
            cls.profile)

        cls.sut = Tagger(cls.collection_name, cls.bow, cls.graph)
예제 #9
0
def sample_profile(testdir):
    """Load and return the indexing-profile fixture from the test data directory."""
    profile_path = os.path.join(testdir, 'data', 'indexing_profile.json')
    return IndexingProfile.load(profile_path)
예제 #10
0
 def fallback_if_unsupported_language(language):
     """Return the given language code if supported, otherwise fall back to 'en'."""
     return language if IndexingProfile.is_language_supported(language) else 'en'