示例#1
0
    def test_process_profiles(self):
        """
        Process the fixture profiles and verify the unified-name lookups
        that are available on the controller afterwards.
        """
        controller = AnalysisController()
        controller.prepare(self.profiles, {}, [])
        controller.process_profiles()

        expected_names = ("hansmustermann", "maxmustermann", "heinrichmustermann")
        profiles_by_name = controller.unified_name_to_profiles

        # Exactly three unified names are expected as keys
        self.assertEqual(len(profiles_by_name), 3)
        for unified_name in expected_names:
            self.assertIn(unified_name, profiles_by_name)

        # Each unified name maps to its profiles at the expected positions
        expected_positions = (
            ("hansmustermann", 0, self.profile1),
            ("hansmustermann", 1, self.profile4),
            ("maxmustermann", 0, self.profile2),
            ("heinrichmustermann", 0, self.profile3),
        )
        for unified_name, position, profile in expected_positions:
            self.assertEqual(profiles_by_name[unified_name][position], profile)

        # Every unified name has an authored and a participated document entry
        for unified_name in expected_names:
            self.assertIn(unified_name,
                          controller.unified_name_to_authored_documents)
            self.assertIn(unified_name,
                          controller.unified_name_to_participated_documents)
示例#2
0
    def test_process_group_documents(self):
        """
        Run the fixture profiles and group documents through the controller
        and check the resulting participations.
        """
        controller = AnalysisController()
        controller.prepare(self.profiles, {}, self.group_documents)

        # Profiles first, then the group documents
        controller.process_profiles()
        controller.process_group_documents()

        self.assert_participations(controller)
示例#3
0
    def test_all_with_pipeline_data(self):
        """
        Execute the pipeline once on a freshly created schema and check the
        number of slim profiles and fields returned by the API data layer.
        """
        # SQLite configuration built from an empty path
        data_controller = DataController(SQLiteConfiguration(""))
        data_controller.run_schema()

        crawl_controller = CrawlController(
            FileCrawler(), "d0b7f41f-ad37-3b47-ab70-9feac35557cc")

        pipeline = PipelineController(
            data_controller=data_controller,
            crawl_controller=crawl_controller,
            analysis_controller=AnalysisController())

        # Clean run
        pipeline.execute()

        api_data = data_controller.api_data

        # Slim profiles
        self.assertEqual(len(api_data.get_profiles_slim()), 19)

        # Fields
        self.assertEqual(len(api_data.get_fields()), 14)
示例#4
0
    def test_process_group_documents(self):
        """
        Prepare the controller with profiles and group documents, process
        both and verify the participations via assert_participations.
        """
        analysis = AnalysisController()
        analysis.prepare(self.profiles, {}, self.group_documents)

        # Same processing order as before: profiles, then group documents
        analysis.process_profiles()
        analysis.process_group_documents()

        self.assert_participations(analysis)
    def __init__(self, *args, **kwargs):
        """
        Set up the application: load the service configuration, check the
        database schema, build the crawl/analysis pipeline, create the
        routing controllers and register the routes.

        :param args: positional arguments forwarded unchanged to the parent constructor
        :param kwargs: keyword arguments forwarded unchanged to the parent constructor
        """
        super(MendeleyCache, self).__init__(*args, **kwargs)

        # Read configuration
        self.configuration = ServiceConfiguration()
        self.configuration.load()
        log.info("Configuration has been loaded")

        # Create service controllers
        self.data_controller = DataController(self.configuration.database)
        self.data_controller.assert_schema()
        log.info("Schema has been checked")

        # Create crawler based on configuration
        self.crawler = None
        """:type : AbstractCrawler"""
        if not self.configuration.uses_mendeley:
            log.info("Pipeline uses FileCrawler")
            self.crawler = FileCrawler()
        else:
            # Imported lazily: only this branch needs the SDK crawler module
            from mendeleycache.crawler.sdk_crawler import SDKCrawler
            # Plain message on purpose: the former str.format() call had no
            # placeholders to fill, and the credentials must not end up in the log
            log.info("Pipeline uses SDKCrawler")
            self.crawler = SDKCrawler(
                app_id=self.configuration.crawler.app_id,
                app_secret=self.configuration.crawler.app_secret)

        # Create the pipeline
        self.crawl_controller = CrawlController(
            self.crawler, self.configuration.crawler.research_group)
        self.analysis_controller = AnalysisController()
        self.pipeline_controller = PipelineController(
            data_controller=self.data_controller,
            crawl_controller=self.crawl_controller,
            analysis_controller=self.analysis_controller)
        log.info("Pipeline has been initialized")

        # Create the routing controllers
        self.fields_controller = FieldsController(self, self.data_controller)
        self.profiles_controller = ProfilesController(self,
                                                      self.data_controller,
                                                      self.configuration.cache)
        self.publications_controller = DocumentsController(
            self, self.data_controller)
        self.cache_controller = CacheController(self, self.data_controller,
                                                self.pipeline_controller,
                                                self.configuration)
        self.root_controller = RootController(self, self.data_controller,
                                              self.configuration)

        # Register the routes
        self.register_routes()
        log.info("Routes have been registered")
        log.info("MendeleyCache has been initialized")
示例#6
0
    def test_process_profiles(self):
        """
        Check the unified-name dictionaries of the controller after the
        fixture profiles have been processed.
        """
        analysis = AnalysisController()
        analysis.prepare(self.profiles, {}, [])
        analysis.process_profiles()

        name_to_profiles = analysis.unified_name_to_profiles
        unified_names = ["hansmustermann", "maxmustermann",
                         "heinrichmustermann"]

        self.assertEqual(len(name_to_profiles), 3)

        # All unified names are stored as keys
        for name in unified_names:
            self.assertIn(name, name_to_profiles)

        # The profiles are stored in the expected order per unified name
        expected_profiles = [
            ("hansmustermann", [self.profile1, self.profile4]),
            ("maxmustermann", [self.profile2]),
            ("heinrichmustermann", [self.profile3]),
        ]
        for name, profiles in expected_profiles:
            for index, profile in enumerate(profiles):
                self.assertEqual(name_to_profiles[name][index], profile)

        # The document sets exist for every unified name
        for name in unified_names:
            self.assertIn(name, analysis.unified_name_to_authored_documents)
            self.assertIn(name,
                          analysis.unified_name_to_participated_documents)
示例#7
0
    def test_process_profile_documents(self):
        """
        Process the fixture profiles together with their profile documents
        and verify the authored documents per unified name.
        """
        controller = AnalysisController()
        controller.prepare(self.profiles, self.profile_documents, [])
        controller.process_profiles()
        controller.process_profile_documents()

        authored = controller.unified_name_to_authored_documents
        expected_titles = (
            ("hansmustermann", "title1"),
            ("maxmustermann", "title2"),
            ("heinrichmustermann", "title3"),
        )

        # Every unified name has exactly one authored document ...
        for unified_name, _ in expected_titles:
            self.assertEqual(len(authored[unified_name]), 1)

        # ... and it is the expected one
        for unified_name, title in expected_titles:
            self.assertIn(title, authored[unified_name])

        self.assert_participations(controller)
示例#8
0
    def test_process_profile_documents(self):
        """
        Verify that processing the profile documents records exactly one
        authored document, with the expected title key, per unified name.
        """
        analysis = AnalysisController()
        analysis.prepare(self.profiles, self.profile_documents, [])
        analysis.process_profiles()
        analysis.process_profile_documents()

        authored_by_name = analysis.unified_name_to_authored_documents
        names = ["hansmustermann", "maxmustermann", "heinrichmustermann"]
        titles = ["title1", "title2", "title3"]

        # Check the number of authored documents first
        for name in names:
            self.assertEqual(len(authored_by_name[name]), 1)

        # Then check which document was recorded for which name
        for title, name in zip(titles, names):
            self.assertIn(title, authored_by_name[name])

        self.assert_participations(analysis)
示例#9
0
    def test_execute(self):
        """
        Execute the pipeline three times in a row on the same schema;
        the test passes when none of the runs raises.
        """
        data_controller = DataController(SQLiteConfiguration(""))
        data_controller.run_schema()

        crawl_controller = CrawlController(
            FileCrawler(), "d0b7f41f-ad37-3b47-ab70-9feac35557cc")

        pipeline = PipelineController(
            data_controller=data_controller,
            crawl_controller=crawl_controller,
            analysis_controller=AnalysisController())

        # A clean run followed by two repeated runs; none of them shall crash
        for _ in range(3):
            pipeline.execute()
示例#10
0
    def test_analyze_author(self):
        """
        Call analyze_author for a known and for an unknown author and check
        the participated documents and the unknown-profile entry.
        """
        ctrl = AnalysisController()
        ctrl.prepare(self.profiles, {}, [])
        ctrl.process_profiles()

        def participated(unified_name):
            # Re-read the attribute on every call, exactly like the inline lookups did
            return ctrl.unified_name_to_participated_documents[unified_name]

        known = "hansmustermann"
        unknown = "nichtexistent"

        # Find an existing profile as author of a doc
        self.assertEqual(len(participated(known)), 0)
        ctrl.analyze_author("doc1", ("Hans", "Mustermann"))
        self.assertEqual(len(participated(known)), 1)
        self.assertIn("doc1", participated(known))

        # Find the same profile in another doc
        ctrl.analyze_author("doc2", ("Hans", "Mustermann"))
        self.assertEqual(len(participated(known)), 2)
        self.assertIn("doc2", participated(known))

        # Authored docs are still 0
        self.assertEqual(
            len(ctrl.unified_name_to_authored_documents[known]), 0)

        # Find an unknown profile as author of a doc
        self.assertNotIn(unknown, ctrl.unified_name_to_participated_documents)
        ctrl.analyze_author("doc1", ("Nicht", "Existent"))
        self.assertEqual(len(participated(unknown)), 1)
        self.assertIn(unknown, ctrl.unified_name_to_unknown_profile)
        self.assertEqual(
            ctrl.unified_name_to_unknown_profile[unknown].name,
            "Nicht Existent")
        self.assertEqual(
            ctrl.unified_name_to_unknown_profile[unknown].unified_name,
            "nichtexistent")
示例#11
0
    def test_analyze_field_tag(self):
        """
        Feed several tag spellings through analyze_field_tag and verify the
        field and document lookups after every call.

        The size checks use assertEqual. They used to be written as
        ``assertTrue(len(x), n)``, which treats ``n`` as the failure message
        and therefore never compared the size with the expected count.
        """
        ctrl = AnalysisController()
        ctrl.prepare([], {}, [])

        def check(unified_title, expected_title, field_count, doc_ids):
            # Verify the state of one unified tag after an analyze_field_tag call
            fields = ctrl.unified_field_title_to_field
            documents = ctrl.unified_field_title_to_documents

            # The field entry exists with the expected titles
            self.assertEqual(len(fields), field_count)
            self.assertEqual(expected_title, fields[unified_title].title)
            self.assertEqual(unified_title,
                             fields[unified_title].unified_title)

            # The documents are linked with the field
            self.assertEqual(len(documents), field_count)
            self.assertEqual(len(documents[unified_title]), len(doc_ids))
            for doc_id in doc_ids:
                self.assertIn(doc_id, documents[unified_title])

        # A first tag for a first document creates the field tag1
        ctrl.analyze_field_tag("docid1", "t ag-1")
        check("tag1", "T Ag-1", 1, ["docid1"])

        # Now add the same tag for the same document (written slightly different)
        ctrl.analyze_field_tag("docid1", "t ag - 1")
        check("tag1", "T Ag - 1", 1, ["docid1"])

        # Now add a new document for the same tag (but different field name)
        ctrl.analyze_field_tag("docid2", "t -ag - 1")
        check("tag1", "T -Ag - 1", 1, ["docid1", "docid2"])

        # Now add an old document with a new tag
        ctrl.analyze_field_tag("docid2", "t ag - 2")
        check("tag2", "T Ag - 2", 2, ["docid2"])

        # Now add a new document with a new tag
        ctrl.analyze_field_tag("docid3", "t ag - 3")
        check("tag3", "T Ag - 3", 3, ["docid3"])
示例#12
0
def sample_pipeline(app_id=None, app_secret=None):
    """
    Run the pipeline once on a freshly created schema and print the
    contents of the resulting tables to stdout.

    A FileCrawler is used when both ``app_id`` and ``app_secret`` are None;
    otherwise an SDKCrawler is created with the given credentials.

    :param app_id: application id handed to SDKCrawler (optional)
    :param app_secret: application secret handed to SDKCrawler (optional)
    """
    data_controller = DataController(SQLiteConfiguration(""))
    data_controller.run_schema()

    if app_id is None and app_secret is None:
        crawler = FileCrawler()
    else:
        # Imported lazily so that the file-based sample does not depend on
        # the SDK crawler module being importable
        from mendeleycache.crawler.sdk_crawler import SDKCrawler
        crawler = SDKCrawler(app_id=app_id, app_secret=app_secret)

    crawl_controller = CrawlController(crawler,
                                       "d0b7f41f-ad37-3b47-ab70-9feac35557cc")

    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=crawl_controller,
        analysis_controller=AnalysisController())

    # Clean run shall not crash
    pipeline_controller.execute()

    # (heading, query) pairs, dumped in this order
    table_dumps = (
        ("Profiles:",
         "SELECT * FROM profile"),
        ("Cache profiles:",
         "SELECT * FROM cache_profile"),
        ("Documents:",
         "SELECT id, owner_mendeley_id, title, authors, tags FROM document"),
        ("Cache documents:",
         "SELECT * FROM cache_document"),
        ("Cache fields:",
         "SELECT * FROM cache_field"),
        ("LINK: Cache document -> Cache field:",
         "SELECT * FROM cache_document_has_cache_field"),
        ("LINK: Cache profile -> Cache document:",
         "SELECT * FROM cache_profile_has_cache_document"),
    )

    for heading, query in table_dumps:
        # Fetch before printing so a failing query leaves no dangling heading
        rows = data_controller.engine.execute(query).fetchall()
        print()
        print(heading)
        for row in rows:
            print(row)

    print()
示例#13
0
        crawler = None
        if not config.uses_mendeley:
            log.info("Pipeline uses FileCrawler")
            crawler = FileCrawler()
        else:
            from mendeleycache.crawler.sdk_crawler import SDKCrawler
            log.info("Pipeline uses SDKCrawler".format(
                app_id=config.crawler.app_id,
                app_secret=config.crawler.app_secret))
            crawler = SDKCrawler(app_id=config.crawler.app_id,
                                 app_secret=config.crawler.app_secret)

        crawl_controller = CrawlController(crawler,
                                           config.crawler.research_group)
        analysis_controller = AnalysisController()
        pipeline_controller = PipelineController(
            data_controller=data_controller,
            crawl_controller=crawl_controller,
            analysis_controller=analysis_controller)
        pipeline_controller.execute()

    # Show file-crawler sample data
    elif command == "sample-file-pipeline":
        sample_pipeline()

    # Trigger the pipeline with the mendeley sdk crawler
    elif command == "sample-sdk-pipeline":
        if not len(sys.argv) >= 4:
            log.critical(
                "Missing arguments: mendeleycache.runner sample-sdk-pipeline {app-id} {app-secret}"
示例#14
0
    def test_analyze_author(self):
        """
        Verify the bookkeeping of analyze_author: participated documents of
        a known profile grow per document, authored documents stay empty and
        an unknown author gets an unknown-profile entry.
        """
        analysis = AnalysisController()
        analysis.prepare(self.profiles, {}, [])
        analysis.process_profiles()

        # The existing profile starts without participated documents
        self.assertEqual(
            len(analysis.unified_name_to_participated_documents[
                "hansmustermann"]),
            0)

        # Find the existing profile as author of one doc, then of another
        for expected_count, doc_id in enumerate(("doc1", "doc2"), start=1):
            analysis.analyze_author(doc_id, ("Hans", "Mustermann"))
            self.assertEqual(
                len(analysis.unified_name_to_participated_documents[
                    "hansmustermann"]),
                expected_count)
            self.assertIn(
                doc_id,
                analysis.unified_name_to_participated_documents[
                    "hansmustermann"])

        # Authored docs are still 0
        self.assertEqual(
            len(analysis.unified_name_to_authored_documents[
                "hansmustermann"]),
            0)

        # Find an unknown profile as author of a doc
        self.assertNotIn("nichtexistent",
                         analysis.unified_name_to_participated_documents)
        analysis.analyze_author("doc1", ("Nicht", "Existent"))
        self.assertEqual(
            len(analysis.unified_name_to_participated_documents[
                "nichtexistent"]),
            1)
        self.assertIn("nichtexistent",
                      analysis.unified_name_to_unknown_profile)

        unknown_profile = analysis.unified_name_to_unknown_profile[
            "nichtexistent"]
        self.assertEqual(unknown_profile.name, "Nicht Existent")
        self.assertEqual(
            analysis.unified_name_to_unknown_profile[
                "nichtexistent"].unified_name,
            "nichtexistent")
示例#15
0
    def test_analyze_field_tag(self):
        """
        Feed several tag spellings through analyze_field_tag and verify the
        field and document lookups after every call.

        All size checks use assertEqual; ``assertTrue(len(x), n)`` would
        treat ``n`` as the failure message and never compare the size.
        """
        ctrl = AnalysisController()
        ctrl.prepare([], {}, [])
        ctrl.analyze_field_tag("docid1", "t ag-1")

        # Check if CacheField for tag1 was created
        self.assertEqual(len(ctrl.unified_field_title_to_field), 1)
        self.assertEqual("T Ag-1",
                         ctrl.unified_field_title_to_field["tag1"].title)
        self.assertEqual(
            "tag1", ctrl.unified_field_title_to_field["tag1"].unified_title)
        # Check if document docid1 has been added to tag1
        self.assertEqual(len(ctrl.unified_field_title_to_documents), 1)
        self.assertEqual(
            len(ctrl.unified_field_title_to_documents["tag1"]), 1)
        self.assertIn("docid1", ctrl.unified_field_title_to_documents["tag1"])

        # Now add the same tag for the same document (written slightly different)
        ctrl.analyze_field_tag("docid1", "t ag - 1")
        self.assertEqual(len(ctrl.unified_field_title_to_field), 1)
        self.assertEqual("T Ag - 1",
                         ctrl.unified_field_title_to_field["tag1"].title)
        self.assertEqual(
            "tag1", ctrl.unified_field_title_to_field["tag1"].unified_title)
        # Check if document docid1 is still the only doc
        self.assertEqual(len(ctrl.unified_field_title_to_documents), 1)
        self.assertEqual(
            len(ctrl.unified_field_title_to_documents["tag1"]), 1)
        self.assertIn("docid1", ctrl.unified_field_title_to_documents["tag1"])

        # Now add a new document for the same tag (but different field name)
        ctrl.analyze_field_tag("docid2", "t -ag - 1")
        self.assertEqual(len(ctrl.unified_field_title_to_field), 1)
        self.assertEqual("T -Ag - 1",
                         ctrl.unified_field_title_to_field["tag1"].title)
        self.assertEqual(
            "tag1", ctrl.unified_field_title_to_field["tag1"].unified_title)
        # Check if document docid2 is now linked with tag 1
        self.assertEqual(len(ctrl.unified_field_title_to_documents), 1)
        self.assertEqual(
            len(ctrl.unified_field_title_to_documents["tag1"]), 2)
        self.assertIn("docid1", ctrl.unified_field_title_to_documents["tag1"])
        self.assertIn("docid2", ctrl.unified_field_title_to_documents["tag1"])

        # Now add an old document with a new tag
        ctrl.analyze_field_tag("docid2", "t ag - 2")
        self.assertEqual(len(ctrl.unified_field_title_to_field), 2)
        self.assertEqual("T Ag - 2",
                         ctrl.unified_field_title_to_field["tag2"].title)
        self.assertEqual(
            "tag2", ctrl.unified_field_title_to_field["tag2"].unified_title)
        # Check if document docid2 is now linked with tag 2
        self.assertEqual(len(ctrl.unified_field_title_to_documents), 2)
        self.assertEqual(
            len(ctrl.unified_field_title_to_documents["tag2"]), 1)
        self.assertIn("docid2", ctrl.unified_field_title_to_documents["tag2"])

        # Now add a new document with a new tag
        ctrl.analyze_field_tag("docid3", "t ag - 3")
        self.assertEqual(len(ctrl.unified_field_title_to_field), 3)
        self.assertEqual("T Ag - 3",
                         ctrl.unified_field_title_to_field["tag3"].title)
        self.assertEqual(
            "tag3", ctrl.unified_field_title_to_field["tag3"].unified_title)
        # Check if document docid3 is now linked with tag 3
        self.assertEqual(len(ctrl.unified_field_title_to_documents), 3)
        self.assertEqual(
            len(ctrl.unified_field_title_to_documents["tag3"]), 1)
        self.assertIn("docid3", ctrl.unified_field_title_to_documents["tag3"])