def __init__(self, *args, **kwargs):
        """Initialize the application and wire up all of its controllers.

        In order: load the service configuration, check the database schema,
        pick a crawler, build the pipeline, create the routing controllers
        and register the routes.

        :param args: positional arguments forwarded unchanged to the parent
            class initializer
        :param kwargs: keyword arguments forwarded unchanged to the parent
            class initializer
        """
        super(MendeleyCache, self).__init__(*args, **kwargs)

        # Read configuration
        self.configuration = ServiceConfiguration()
        self.configuration.load()
        log.info("Configuration has been loaded")

        # Create service controllers
        self.data_controller = DataController(self.configuration.database)
        self.data_controller.assert_schema()
        log.info("Schema has been checked")

        # Create crawler based on configuration
        crawler_config = self.configuration.crawler
        self.crawler = None
        """:type : AbstractCrawler"""
        if self.configuration.uses_mendeley:
            # The SDK crawler is imported only on this branch
            # (presumably so the SDK is needed only when it is used).
            from mendeleycache.crawler.sdk_crawler import SDKCrawler

            # The message has no placeholders, so it is logged as a plain
            # literal; the application secret is deliberately kept out of
            # the logging call.
            log.info("Pipeline uses SDKCrawler")
            self.crawler = SDKCrawler(
                app_id=crawler_config.app_id,
                app_secret=crawler_config.app_secret)
        else:
            log.info("Pipeline uses FileCrawler")
            self.crawler = FileCrawler()

        # Create the pipeline
        self.crawl_controller = CrawlController(
            self.crawler, crawler_config.research_group)
        self.analysis_controller = AnalysisController()
        self.pipeline_controller = PipelineController(
            data_controller=self.data_controller,
            crawl_controller=self.crawl_controller,
            analysis_controller=self.analysis_controller)
        log.info("Pipeline has been initialized")

        # Create the routing controllers; each one receives this application
        # instance and the shared data controller.
        self.fields_controller = FieldsController(self, self.data_controller)
        self.profiles_controller = ProfilesController(
            self, self.data_controller, self.configuration.cache)
        self.publications_controller = DocumentsController(
            self, self.data_controller)
        self.cache_controller = CacheController(
            self, self.data_controller, self.pipeline_controller,
            self.configuration)
        self.root_controller = RootController(
            self, self.data_controller, self.configuration)

        # Register the routes
        self.register_routes()
        log.info("Routes have been registered")
        log.info("MendeleyCache has been initialized")
Пример #2
0
    def test_service_configuration_load(self):
        # Case 1: the valid mysql configuration written by write_valid_1().
        # Loading must not raise, and the crawler and cache values must
        # match what the fixture wrote.
        mysql_config = ServiceConfiguration()
        TestServiceConfiguration.write_valid_1()
        try:
            mysql_config.load()
            self.assertEqual(mysql_config.crawler.app_id, "231209")
            self.assertEqual(
                mysql_config.crawler.app_secret, "AlPhA4NuMeRiC20")
            self.assertEqual(
                mysql_config.crawler.research_group,
                "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
            self.assertEqual(
                mysql_config.cache.profile_page_pattern,
                "www1.in.tum.de/:firstname-:lastname")
        except InvalidConfigurationException as error:
            # A rejected configuration is reported as a test failure.
            self.fail(error)

        # Case 2: the valid sqlite configuration written by write_valid_2().
        # Besides the crawler values, the database engine, the database path
        # and the profile page pattern are checked.
        sqlite_config = ServiceConfiguration()
        TestServiceConfiguration.write_valid_2()
        try:
            sqlite_config.load()
            self.assertEqual(sqlite_config.crawler.app_id, "231209")
            self.assertEqual(
                sqlite_config.crawler.app_secret, "AlPhA4NuMeRiC20")
            self.assertEqual(
                sqlite_config.crawler.research_group,
                "d0b7f41f-ad37-3b47-ab70-9feac35557cc")
            self.assertEqual(sqlite_config.database.engine, "sqlite")
            self.assertEqual(sqlite_config.database.path, "")
            self.assertEqual(sqlite_config.cache.profile_page_pattern, "")
        except InvalidConfigurationException as error:
            self.fail(error)
Пример #3
0
    def __init__(self, *args, **kwargs):
        """Set up configuration, data access, crawler, pipeline and routes.

        The steps run in a fixed order: the configuration is loaded first,
        then the database schema is checked, a crawler is chosen, the
        pipeline is assembled, the routing controllers are created and the
        routes are registered.

        :param args: positional arguments passed through to the parent class
            initializer
        :param kwargs: keyword arguments passed through to the parent class
            initializer
        """
        super(MendeleyCache, self).__init__(*args, **kwargs)

        # Read configuration
        self.configuration = ServiceConfiguration()
        self.configuration.load()
        log.info("Configuration has been loaded")

        # Create service controllers
        self.data_controller = DataController(self.configuration.database)
        self.data_controller.assert_schema()
        log.info("Schema has been checked")

        # Create crawler based on configuration
        self.crawler = None
        """:type : AbstractCrawler"""
        if not self.configuration.uses_mendeley:
            log.info("Pipeline uses FileCrawler")
            self.crawler = FileCrawler()
        else:
            # SDKCrawler is imported only when this branch is taken.
            from mendeleycache.crawler.sdk_crawler import SDKCrawler

            # Log the plain literal: the message has no placeholders, so
            # formatting it with the app id and secret did nothing except
            # pass the secret into a logging expression.
            log.info("Pipeline uses SDKCrawler")
            self.crawler = SDKCrawler(
                app_id=self.configuration.crawler.app_id,
                app_secret=self.configuration.crawler.app_secret,
            )

        # Create the pipeline
        self.crawl_controller = CrawlController(
            self.crawler, self.configuration.crawler.research_group
        )
        self.analysis_controller = AnalysisController()
        self.pipeline_controller = PipelineController(
            data_controller=self.data_controller,
            crawl_controller=self.crawl_controller,
            analysis_controller=self.analysis_controller,
        )
        log.info("Pipeline has been initialized")

        # Create the routing controllers; every controller is handed this
        # application instance together with the shared data controller.
        self.fields_controller = FieldsController(self, self.data_controller)
        self.profiles_controller = ProfilesController(
            self, self.data_controller, self.configuration.cache
        )
        self.publications_controller = DocumentsController(
            self, self.data_controller
        )
        self.cache_controller = CacheController(
            self, self.data_controller, self.pipeline_controller, self.configuration
        )
        self.root_controller = RootController(
            self, self.data_controller, self.configuration
        )

        # Register the routes
        self.register_routes()
        log.info("Routes have been registered")
        log.info("MendeleyCache has been initialized")
Пример #4
0
        project_root = get_relative_path(".")

        # Prepare
        loader = TestLoader()
        runner = unittest.TextTestRunner(verbosity=2)

        # Create suites
        all = loader.discover(start_dir=project_root)

        # Run suites
        runner.run(all)

    elif command == "prepare":
        log.info("Preparing environment for gunicorn workers")
        # Read configuration
        configuration = ServiceConfiguration()
        configuration.load()
        log.info("Configuration has been loaded")

        # Create data controller and assert schema
        # That will remove the race conditions of the gunicorn worker if it's done on every startup
        data_controller = DataController(configuration.database)
        data_controller.assert_schema()

    # Pipeline runner
    elif command == "pipeline":
        config = ServiceConfiguration()
        config.load()

        data_controller = DataController(config.database)
        if not data_controller.is_initialized():
Пример #5
0
        project_root = get_relative_path(".")

        # Prepare
        loader = TestLoader()
        runner = unittest.TextTestRunner(verbosity=2)

        # Create suites
        all = loader.discover(start_dir=project_root)

        # Run suites
        runner.run(all)

    elif command == "prepare":
        log.info("Preparing environment for gunicorn workers")
        # Read configuration
        configuration = ServiceConfiguration()
        configuration.load()
        log.info("Configuration has been loaded")

        # Create data controller and assert schema
        # That will remove the race conditions of the gunicorn worker if it's done on every startup
        data_controller = DataController(configuration.database)
        data_controller.assert_schema()

    # Pipeline runner
    elif command == "pipeline":
        config = ServiceConfiguration()
        config.load()

        data_controller = DataController(config.database)
        if not data_controller.is_initialized():