def index_stream(collection_name, profile, shards, after, solr='http://localhost:8983/solr/'):
    """
    Listens to the Wikidata edit stream and updates a collection
    according to the given indexing profile.
    """
    tagger = TaggerFactory(solr)
    indexing_profile = IndexingProfile.load(profile)
    try:
        tagger.create_collection(collection_name,
                                 num_shards=shards,
                                 configset=indexing_profile.solrconfig)
    except CollectionAlreadyExists:
        pass
    if after is not None:
        after = dateutil.parser.parse(after)
    stream = WikidataStreamReader(from_time=after)
    tagger.index_stream(collection_name,
                        stream,
                        indexing_profile,
                        batch_size=50,
                        commit_time=1,
                        delete_excluded=True)
def index_dump(collection_name, filename, profile, shards, skip, solr='http://localhost:8983/solr/'):
    """
    Indexes a Wikidata dump in a new Solr collection with the given name.
    """
    tagger = TaggerFactory(solr)
    indexing_profile = IndexingProfile.load(profile)
    try:
        tagger.create_collection(collection_name,
                                 num_shards=shards,
                                 configset=indexing_profile.solrconfig)
    except CollectionAlreadyExists:
        pass
    dump = WikidataDumpReader(filename)
    tagger.index_stream(collection_name,
                        dump,
                        indexing_profile,
                        batch_size=2000,
                        commit_time=10,
                        delete_excluded=False,
                        skip_docs=skip)
def index_sparql(collection_name, sparql_query_file, profile, shards, solr='http://localhost:8983/solr/'):
    """
    Indexes the results of a SPARQL query which contains an "item" variable
    pointing to the items to index.
    """
    tagger = TaggerFactory(solr)
    indexing_profile = IndexingProfile.load(profile)
    try:
        tagger.create_collection(collection_name,
                                 num_shards=shards,
                                 configset=indexing_profile.solrconfig)
    except CollectionAlreadyExists:
        pass
    with open(sparql_query_file, 'r') as f:
        query = f.read()
    query_results = SparqlReader(query)
    tagger.index_stream(collection_name,
                        query_results,
                        indexing_profile,
                        batch_size=50,
                        commit_time=10,
                        delete_excluded=False)
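# The three indexing helpers above share the same pattern: load an indexing
# profile, create the Solr collection if it does not exist yet, then feed a
# reader (edit stream, dump, or SPARQL results) into TaggerFactory.index_stream.
# A minimal, hedged usage sketch follows; the file names, collection name and
# shard count are placeholders, not values taken from this repository, and
# running it requires a Solr instance at the default endpoint
# (http://localhost:8983/solr/):
#
#   index_dump('wd_sample',
#              'data/sample_wikidata_items.json.bz2',
#              'data/all_items_profile.json',
#              shards=1,
#              skip=0)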
def test_load_indexing_profile(testdir, expected_json):
    indexing_profile = IndexingProfile.load(
        os.path.join(testdir, 'data', 'indexing_profile.json'))
    assert indexing_profile.language == 'en'
    assert indexing_profile.name == 'affiliations'
    assert indexing_profile.restrict_properties == ['P2427', 'P1566', 'P496']
    assert indexing_profile.json() == expected_json
def test_all_items_profile(testdir):
    profile_filename = os.path.join(testdir, 'data/all_items_profile.json')
    profile = IndexingProfile.load(profile_filename)
    type_matcher = TypeMatcherStub()
    dump_filename = os.path.join(testdir, 'data/sample_wikidata_items.json.bz2')
    with WikidataDumpReader(dump_filename) as reader:
        for item in reader:
            assert profile.entity_to_document(item, type_matcher) is not None
def setUpClass(cls):
    cls.testdir = os.path.dirname(os.path.abspath(__file__))
    cls.solr_endpoint = 'http://localhost:8983/solr/'
    cls.tf = TaggerFactory(cls.solr_endpoint)
    # Load dummy profile
    cls.profile = IndexingProfile.load(
        os.path.join(cls.testdir, 'data/all_items_profile.json'))
    # Skip entire test if Solr is not running
    try:
        requests.get(cls.solr_endpoint)
    except requests.exceptions.RequestException:
        raise unittest.SkipTest('Solr is not running')
def setUpClass(cls):
    cls.testdir = os.path.dirname(os.path.abspath(__file__))
    # Load dummy bow
    bow_fname = os.path.join(cls.testdir, 'data/sample_bow.pkl')
    cls.bow = BOWLanguageModel()
    cls.bow.load(bow_fname)
    # Load dummy graph
    graph_fname = os.path.join(cls.testdir, 'data/sample_wikidata_items.npz')
    pagerank_fname = os.path.join(cls.testdir, 'data/sample_wikidata_items.pgrank.npy')
    cls.graph = WikidataGraph()
    cls.graph.load_from_matrix(graph_fname)
    cls.graph.load_pagerank(pagerank_fname)
    # Load dummy profile
    cls.profile = IndexingProfile.load(
        os.path.join(cls.testdir, 'data/all_items_profile.json'))
    # Setup solr index (TODO delete this) and tagger
    cls.tf = TaggerFactory()
    cls.collection_name = 'wd_test_collection'
    try:
        cls.tf.create_collection(cls.collection_name)
    except CollectionAlreadyExists:
        pass
    cls.tf.index_stream(
        cls.collection_name,
        WikidataDumpReader(
            os.path.join(cls.testdir, 'data/sample_wikidata_items.json.bz2')),
        cls.profile)
    cls.tagger = Tagger(cls.collection_name, cls.bow, cls.graph)
    # Load NIF dataset
    cls.nif = NIFCollection.load(
        os.path.join(cls.testdir, 'data/five-affiliations.ttl'))
    cls.classifier = SimpleTagClassifier(cls.tagger,
                                         max_similarity_distance=10,
                                         similarity_smoothing=2)
def setUpClass(cls):
    super(TaggerTest, cls).setUpClass()
    testdir = os.path.dirname(os.path.abspath(__file__))
    # Load dummy bow
    bow_fname = os.path.join(testdir, 'data/sample_bow.pkl')
    cls.bow = BOWLanguageModel()
    cls.bow.load(bow_fname)
    # Load dummy graph
    graph_fname = os.path.join(testdir, 'data/sample_wikidata_items.npz')
    pagerank_fname = os.path.join(testdir, 'data/sample_wikidata_items.pgrank.npy')
    cls.graph = WikidataGraph()
    cls.graph.load_from_matrix(graph_fname)
    cls.graph.load_pagerank(pagerank_fname)
    # Load indexing profile
    cls.profile = IndexingProfile.load(
        os.path.join(testdir, 'data/all_items_profile.json'))
    # Setup solr index
    cls.tf = TaggerFactory()
    cls.collection_name = 'wd_test_collection'
    try:
        cls.tf.delete_collection(cls.collection_name)
    except requests.exceptions.RequestException:
        pass
    cls.tf.create_collection(cls.collection_name)
    cls.tf.index_stream(
        cls.collection_name,
        WikidataDumpReader(
            os.path.join(testdir, 'data/sample_wikidata_items.json.bz2')),
        cls.profile)
    cls.sut = Tagger(cls.collection_name, cls.bow, cls.graph)
def sample_profile(testdir):
    return IndexingProfile.load(
        os.path.join(testdir, 'data', 'indexing_profile.json'))
def fallback_if_unsupported_language(language):
    if IndexingProfile.is_language_supported(language):
        return language
    else:
        return 'en'
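# Hedged usage sketch for the helper above; 'fr' is an illustrative input, not a
# value taken from this repository:
#
#   profile_language = fallback_if_unsupported_language('fr')  # 'fr' if supported, else 'en'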