def test_all_with_pipeline_data(self):
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()
    crawler = FileCrawler()
    crawl_controller = CrawlController(crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller
    )

    # Clean run
    pipeline_controller.execute()

    # Test slim profiles
    slim_profiles = data_controller.api_data.get_profiles_slim()
    self.assertEqual(len(slim_profiles), 19)

    # Test fields
    fields = data_controller.api_data.get_fields()
    self.assertEqual(len(fields), 14)
def test_execute(self):
    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()
    crawler = FileCrawler()
    crawl_controller = CrawlController(crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller
    )

    # Clean run shall not crash
    pipeline_controller.execute()

    # Second run shall not crash either
    pipeline_controller.execute()

    # Third run shall not crash either
    pipeline_controller.execute()
def sample_pipeline(app_id=None, app_secret=None):
    from mendeleycache.crawler.sdk_crawler import SDKCrawler

    sqlite_in_memory = SQLiteConfiguration("")
    data_controller = DataController(sqlite_in_memory)
    data_controller.run_schema()

    # Use the file-based sample crawler unless SDK credentials are given
    crawler = None
    if app_id is None and app_secret is None:
        crawler = FileCrawler()
    else:
        crawler = SDKCrawler(app_id=app_id, app_secret=app_secret)

    crawl_controller = CrawlController(crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller
    )

    # Clean run shall not crash
    pipeline_controller.execute()

    rows = data_controller.engine.execute("SELECT * FROM profile").fetchall()
    print()
    print("Profiles:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute("SELECT * FROM cache_profile").fetchall()
    print()
    print("Cache profiles:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT id, owner_mendeley_id, title, authors, tags FROM document"
    ).fetchall()
    print()
    print("Documents:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute("SELECT * FROM cache_document").fetchall()
    print()
    print("Cache documents:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute("SELECT * FROM cache_field").fetchall()
    print()
    print("Cache fields:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_document_has_cache_field"
    ).fetchall()
    print()
    print("LINK: Cache document -> Cache field:")
    for row in rows:
        print(row)

    rows = data_controller.engine.execute(
        "SELECT * FROM cache_profile_has_cache_document"
    ).fetchall()
    print()
    print("LINK: Cache profile -> Cache document:")
    for row in rows:
        print(row)

    print()
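# A minimal usage sketch of sample_pipeline, assuming this module's imports are
# available; the credential strings are placeholders, not real values. With no
# arguments the bundled FileCrawler sample data is used; with credentials the
# pipeline crawls through the Mendeley SDK instead.
if __name__ == "__main__":
    sample_pipeline()
    # sample_pipeline(app_id="my-app-id", app_secret="my-app-secret")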
    else:
        from mendeleycache.crawler.sdk_crawler import SDKCrawler
        # Log only the app id, never the secret
        log.info("Pipeline uses SDKCrawler (app_id: {app_id})".format(
            app_id=config.crawler.app_id))
        crawler = SDKCrawler(app_id=config.crawler.app_id,
                             app_secret=config.crawler.app_secret)

    crawl_controller = CrawlController(crawler, config.crawler.research_group)
    analysis_controller = AnalysisController()
    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=analysis_controller
    )
    pipeline_controller.execute()

# Show file-crawler sample data
elif command == "sample-file-pipeline":
    sample_pipeline()

# Trigger the pipeline with the mendeley sdk crawler
elif command == "sample-sdk-pipeline":
    if len(sys.argv) < 4:
        log.critical("Missing arguments: mendeleycache.runner sample-sdk-pipeline {app-id} {app-secret}")
        sys.exit(1)  # abort instead of crashing on the missing argv entries below
    app_id = sys.argv[2]
    app_secret = sys.argv[3]
    os.environ["MC_CRAWLER"] = 'mendeley'
    os.environ["MC_APP_ID"] = app_id
    os.environ["MC_APP_SECRET"] = app_secret
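# For reference, a sketch of how the runner commands above are invoked from the
# shell (inferred from the log.critical message; the exact entry point may differ):
#
#   python -m mendeleycache.runner sample-file-pipeline
#   python -m mendeleycache.runner sample-sdk-pipeline {app-id} {app-secret}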