def test_all_with_pipeline_data(self):
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()
    crawler = FileCrawler()
    crawl_controller = CrawlController(
        crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller)

    # Clean run
    pipeline_controller.execute()

    # Test slim profiles
    slim_profiles = data_controller.api_data.get_profiles_slim()
    self.assertEqual(len(slim_profiles), 19)

    # Test fields
    fields = data_controller.api_data.get_fields()
    self.assertEqual(len(fields), 14)
def test_process_group_documents(self):
    ctrl = AnalysisController()
    ctrl.prepare(self.profiles, {}, self.group_documents)
    ctrl.process_profiles()
    ctrl.process_group_documents()
    self.assert_participations(ctrl)
def test_process_profiles(self):
    ctrl = AnalysisController()
    ctrl.prepare(self.profiles, {}, [])
    ctrl.process_profiles()
    self.assertEqual(len(ctrl.unified_name_to_profiles), 3)

    # Check that all unified names are stored as keys
    self.assertIn("hansmustermann", ctrl.unified_name_to_profiles)
    self.assertIn("maxmustermann", ctrl.unified_name_to_profiles)
    self.assertIn("heinrichmustermann", ctrl.unified_name_to_profiles)

    # Check that the profiles are stored correctly
    self.assertEqual(ctrl.unified_name_to_profiles["hansmustermann"][0],
                     self.profile1)
    self.assertEqual(ctrl.unified_name_to_profiles["hansmustermann"][1],
                     self.profile4)
    self.assertEqual(ctrl.unified_name_to_profiles["maxmustermann"][0],
                     self.profile2)
    self.assertEqual(ctrl.unified_name_to_profiles["heinrichmustermann"][0],
                     self.profile3)

    # Check that the document sets are created
    self.assertIn("hansmustermann", ctrl.unified_name_to_authored_documents)
    self.assertIn("hansmustermann",
                  ctrl.unified_name_to_participated_documents)
    self.assertIn("maxmustermann", ctrl.unified_name_to_authored_documents)
    self.assertIn("maxmustermann",
                  ctrl.unified_name_to_participated_documents)
    self.assertIn("heinrichmustermann",
                  ctrl.unified_name_to_authored_documents)
    self.assertIn("heinrichmustermann",
                  ctrl.unified_name_to_participated_documents)
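# The unified keys above ("Hans Mustermann" -> "hansmustermann") imply a
# normalization that lowercases and strips everything but letters. A minimal
# sketch of such a rule, assuming this is how the project's unification
# utility behaves (the helper name unify_name is hypothetical):
import re

def unify_name(first_name: str, last_name: str) -> str:
    """Collapse a (first, last) name pair into a unified lookup key."""
    raw = "%s%s" % (first_name, last_name)
    # Drop everything that is not a letter, then lowercase
    return re.sub(r"[^a-zA-Z]", "", raw).lower()

# unify_name("Hans", "Mustermann") == "hansmustermann"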
def __init__(self, *args, **kwargs):
    super(MendeleyCache, self).__init__(*args, **kwargs)

    # Read configuration
    self.configuration = ServiceConfiguration()
    self.configuration.load()
    log.info("Configuration has been loaded")

    # Create service controllers
    self.data_controller = DataController(self.configuration.database)
    self.data_controller.assert_schema()
    log.info("Schema has been checked")

    # Create crawler based on configuration
    self.crawler = None
    """:type : AbstractCrawler"""
    if not self.configuration.uses_mendeley:
        log.info("Pipeline uses FileCrawler")
        self.crawler = FileCrawler()
    else:
        from mendeleycache.crawler.sdk_crawler import SDKCrawler
        log.info("Pipeline uses SDKCrawler")
        self.crawler = SDKCrawler(
            app_id=self.configuration.crawler.app_id,
            app_secret=self.configuration.crawler.app_secret)

    # Create the pipeline
    self.crawl_controller = CrawlController(
        self.crawler, self.configuration.crawler.research_group)
    self.analysis_controller = AnalysisController()
    self.pipeline_controller = PipelineController(
        data_controller=self.data_controller,
        crawl_controller=self.crawl_controller,
        analysis_controller=self.analysis_controller)
    log.info("Pipeline has been initialized")

    # Create the routing controllers
    self.fields_controller = FieldsController(self, self.data_controller)
    self.profiles_controller = ProfilesController(
        self, self.data_controller, self.configuration.cache)
    self.publications_controller = DocumentsController(
        self, self.data_controller)
    self.cache_controller = CacheController(
        self, self.data_controller, self.pipeline_controller,
        self.configuration)
    self.root_controller = RootController(
        self, self.data_controller, self.configuration)

    # Register the routes
    self.register_routes()
    log.info("Routes have been registered")
    log.info("MendeleyCache has been initialized")
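# MendeleyCache wires all of its controllers in __init__, so bringing the
# service up is just constructing the app and serving it. A minimal sketch,
# assuming MendeleyCache subclasses flask.Flask (which the register_routes
# call and controller wiring suggest); the import path, app name, and port
# are illustrative, not taken from the source:
from mendeleycache.app import MendeleyCache

app = MendeleyCache("mendeleycache")
app.run(host="0.0.0.0", port=5000)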
def test_analyze_author(self):
    ctrl = AnalysisController()
    ctrl.prepare(self.profiles, {}, [])
    ctrl.process_profiles()

    # Find an existing profile as author of a doc
    self.assertEqual(
        len(ctrl.unified_name_to_participated_documents["hansmustermann"]),
        0)
    ctrl.analyze_author("doc1", ("Hans", "Mustermann"))
    self.assertEqual(
        len(ctrl.unified_name_to_participated_documents["hansmustermann"]),
        1)
    self.assertIn(
        "doc1",
        ctrl.unified_name_to_participated_documents["hansmustermann"])

    # Find the same profile in another doc
    ctrl.analyze_author("doc2", ("Hans", "Mustermann"))
    self.assertEqual(
        len(ctrl.unified_name_to_participated_documents["hansmustermann"]),
        2)
    self.assertIn(
        "doc2",
        ctrl.unified_name_to_participated_documents["hansmustermann"])

    # Authored docs are still 0
    self.assertEqual(
        len(ctrl.unified_name_to_authored_documents["hansmustermann"]), 0)

    # Find an unknown profile as author of a doc
    self.assertNotIn("nichtexistent",
                     ctrl.unified_name_to_participated_documents)
    ctrl.analyze_author("doc1", ("Nicht", "Existent"))
    self.assertEqual(
        len(ctrl.unified_name_to_participated_documents["nichtexistent"]),
        1)
    self.assertIn("nichtexistent", ctrl.unified_name_to_unknown_profile)
    self.assertEqual(
        ctrl.unified_name_to_unknown_profile["nichtexistent"].name,
        "Nicht Existent")
    self.assertEqual(
        ctrl.unified_name_to_unknown_profile["nichtexistent"].unified_name,
        "nichtexistent")
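# The test above pins down analyze_author's observable contract: unify the
# author name, record the doc id as a participation, and synthesize an
# "unknown profile" stub when no crawled profile matches. A self-contained
# sketch of that flow, not the real implementation: UnknownProfile is a
# stand-in for the project's profile type, unify_name is the sketch from
# above, and state is a plain dict mirroring the controller's mappings.
from collections import namedtuple

UnknownProfile = namedtuple("UnknownProfile", ["name", "unified_name"])

def analyze_author_sketch(state, doc_id, author):
    first_name, last_name = author
    unified = unify_name(first_name, last_name)
    if unified not in state["unified_name_to_profiles"]:
        # No crawled profile matches -> remember a stub profile
        state["unified_name_to_unknown_profile"][unified] = UnknownProfile(
            name="%s %s" % (first_name, last_name), unified_name=unified)
    # Any authorship counts as a participation
    state["unified_name_to_participated_documents"].setdefault(
        unified, set()).add(doc_id)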
def test_execute(self):
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()
    crawler = FileCrawler()
    crawl_controller = CrawlController(
        crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller)

    # Clean run shall not crash
    pipeline_controller.execute()

    # Second run shall not crash either
    pipeline_controller.execute()

    # Third run shall not crash either
    pipeline_controller.execute()
def test_process_profile_documents(self):
    ctrl = AnalysisController()
    ctrl.prepare(self.profiles, self.profile_documents, [])
    ctrl.process_profiles()
    ctrl.process_profile_documents()

    # Check if authored_documents are set correctly
    self.assertEqual(
        len(ctrl.unified_name_to_authored_documents["hansmustermann"]), 1)
    self.assertEqual(
        len(ctrl.unified_name_to_authored_documents["maxmustermann"]), 1)
    self.assertEqual(
        len(ctrl.unified_name_to_authored_documents["heinrichmustermann"]),
        1)
    self.assertIn(
        "title1", ctrl.unified_name_to_authored_documents["hansmustermann"])
    self.assertIn(
        "title2", ctrl.unified_name_to_authored_documents["maxmustermann"])
    self.assertIn(
        "title3",
        ctrl.unified_name_to_authored_documents["heinrichmustermann"])

    self.assert_participations(ctrl)
def sample_pipeline(app_id=None, app_secret=None):
    from mendeleycache.crawler.sdk_crawler import SDKCrawler

    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    # Fall back to the FileCrawler when no SDK credentials are given
    crawler = None
    if app_id is None and app_secret is None:
        crawler = FileCrawler()
    else:
        crawler = SDKCrawler(app_id=app_id, app_secret=app_secret)

    crawl_controller = CrawlController(
        crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller)

    # Clean run shall not crash
    pipeline_controller.execute()

    rows = data_controller.engine.execute("SELECT * FROM profile").fetchall()
    print()
    print("Profiles:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_profile").fetchall()
    print()
    print("Cache profiles:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT id, owner_mendeley_id, title, authors, tags FROM document"
    ).fetchall()
    print()
    print("Documents:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_document").fetchall()
    print()
    print("Cache documents:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_field").fetchall()
    print()
    print("Cache fields:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_document_has_cache_field").fetchall()
    print()
    print("LINK: Cache document -> Cache field:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_profile_has_cache_document").fetchall()
    print()
    print("LINK: Cache profile -> Cache document:")
    for row in rows:
        print(row)

    print()
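# The seven query-and-print passes above share one shape, so a small helper
# could collapse them. A sketch only: the engine.execute interface is taken
# from the function above, the helper name dump_rows is hypothetical.
def dump_rows(data_controller, query: str, caption: str):
    """Fetch all rows for a query and print them under a caption."""
    rows = data_controller.engine.execute(query).fetchall()
    print()
    print("%s:" % caption)
    for row in rows:
        print(row)

# dump_rows(data_controller, "SELECT * FROM profile", "Profiles")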
        crawler = None
        if not config.uses_mendeley:
            log.info("Pipeline uses FileCrawler")
            crawler = FileCrawler()
        else:
            from mendeleycache.crawler.sdk_crawler import SDKCrawler
            log.info("Pipeline uses SDKCrawler")
            crawler = SDKCrawler(app_id=config.crawler.app_id,
                                 app_secret=config.crawler.app_secret)

        crawl_controller = CrawlController(crawler,
                                           config.crawler.research_group)
        analysis_controller = AnalysisController()
        pipeline_controller = PipelineController(
            data_controller=data_controller,
            crawl_controller=crawl_controller,
            analysis_controller=analysis_controller)
        pipeline_controller.execute()

    # Show file-crawler sample data
    elif command == "sample-file-pipeline":
        sample_pipeline()

    # Trigger the pipeline with the mendeley sdk crawler
    elif command == "sample-sdk-pipeline":
        if len(sys.argv) < 4:
            log.critical(
                "Missing arguments: mendeleycache.runner "
                "sample-sdk-pipeline {app-id} {app-secret}")
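# Invoking the dispatcher above from a shell; the module entry point
# "python -m mendeleycache.runner" is an assumption based on the usage
# string in the log message:
#
#   python -m mendeleycache.runner sample-file-pipeline
#   python -m mendeleycache.runner sample-sdk-pipeline <app-id> <app-secret>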
def test_analyze_field_tag(self):
    ctrl = AnalysisController()
    ctrl.prepare([], {}, [])
    ctrl.analyze_field_tag("docid1", "t ag-1")

    # Check if the CacheField for tag1 was created
    # (assertEqual replaces assertTrue here and below: assertTrue's second
    # argument is a failure message, so the old assertions passed vacuously)
    self.assertEqual(len(ctrl.unified_field_title_to_field), 1)
    self.assertEqual("T Ag-1",
                     ctrl.unified_field_title_to_field["tag1"].title)
    self.assertEqual(
        "tag1", ctrl.unified_field_title_to_field["tag1"].unified_title)

    # Check if document docid1 has been added to tag1
    self.assertEqual(len(ctrl.unified_field_title_to_documents), 1)
    self.assertEqual(len(ctrl.unified_field_title_to_documents["tag1"]), 1)
    self.assertIn("docid1", ctrl.unified_field_title_to_documents["tag1"])

    # Now add the same tag for the same document (written slightly differently)
    ctrl.analyze_field_tag("docid1", "t ag - 1")
    self.assertEqual(len(ctrl.unified_field_title_to_field), 1)
    self.assertEqual("T Ag - 1",
                     ctrl.unified_field_title_to_field["tag1"].title)
    self.assertEqual(
        "tag1", ctrl.unified_field_title_to_field["tag1"].unified_title)

    # Check that document docid1 is still the only doc
    self.assertEqual(len(ctrl.unified_field_title_to_documents), 1)
    self.assertEqual(len(ctrl.unified_field_title_to_documents["tag1"]), 1)
    self.assertIn("docid1", ctrl.unified_field_title_to_documents["tag1"])

    # Now add a new document for the same tag (but a different field name)
    ctrl.analyze_field_tag("docid2", "t -ag - 1")
    self.assertEqual(len(ctrl.unified_field_title_to_field), 1)
    self.assertEqual("T -Ag - 1",
                     ctrl.unified_field_title_to_field["tag1"].title)
    self.assertEqual(
        "tag1", ctrl.unified_field_title_to_field["tag1"].unified_title)

    # Check that document docid2 is now linked with tag 1
    self.assertEqual(len(ctrl.unified_field_title_to_documents), 1)
    self.assertEqual(len(ctrl.unified_field_title_to_documents["tag1"]), 2)
    self.assertIn("docid1", ctrl.unified_field_title_to_documents["tag1"])
    self.assertIn("docid2", ctrl.unified_field_title_to_documents["tag1"])

    # Now add an old document with a new tag
    ctrl.analyze_field_tag("docid2", "t ag - 2")
    self.assertEqual(len(ctrl.unified_field_title_to_field), 2)
    self.assertEqual("T Ag - 2",
                     ctrl.unified_field_title_to_field["tag2"].title)
    self.assertEqual(
        "tag2", ctrl.unified_field_title_to_field["tag2"].unified_title)

    # Check that document docid2 is now linked with tag 2
    self.assertEqual(len(ctrl.unified_field_title_to_documents), 2)
    self.assertEqual(len(ctrl.unified_field_title_to_documents["tag2"]), 1)
    self.assertIn("docid2", ctrl.unified_field_title_to_documents["tag2"])

    # Now add a new document with a new tag
    ctrl.analyze_field_tag("docid3", "t ag - 3")
    self.assertEqual(len(ctrl.unified_field_title_to_field), 3)
    self.assertEqual("T Ag - 3",
                     ctrl.unified_field_title_to_field["tag3"].title)
    self.assertEqual(
        "tag3", ctrl.unified_field_title_to_field["tag3"].unified_title)

    # Check that document docid3 is now linked with tag 3
    self.assertEqual(len(ctrl.unified_field_title_to_documents), 3)
    self.assertEqual(len(ctrl.unified_field_title_to_documents["tag3"]), 1)
    self.assertIn("docid3", ctrl.unified_field_title_to_documents["tag3"])
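# The expectations above imply two derived forms per tag: a display title
# ("t ag - 1" -> "T Ag - 1", i.e. each token title-cased) and a unified key
# ("t -ag - 1" -> "tag1", i.e. alphanumerics only, lowercased). A minimal
# sketch of both rules, matching the test's examples; the helper names are
# hypothetical, not the project's own:
import re

def field_title(tag: str) -> str:
    """Title-case each space-separated token, keeping separators intact."""
    return " ".join(token.title() for token in tag.split(" "))

def unify_field_title(tag: str) -> str:
    """Reduce a tag to its lookup key: alphanumerics only, lowercased."""
    return re.sub(r"[^a-zA-Z0-9]", "", tag).lower()

# field_title("t ag - 1") == "T Ag - 1"
# unify_field_title("t -ag - 1") == "tag1"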