示例#1
0
    def test_all_with_pipeline_data(self):
        """Run the pipeline once and check the record counts seen via api_data.

        The pipeline is wired from a FileCrawler, an AnalysisController and a
        DataController built on ``SQLiteConfiguration("")`` (presumably an
        in-memory database -- confirm against SQLiteConfiguration).
        """
        data_controller = DataController(SQLiteConfiguration(""))
        data_controller.run_schema()

        pipeline_controller = PipelineController(
            data_controller=data_controller,
            crawl_controller=CrawlController(
                FileCrawler(), "d0b7f41f-ad37-3b47-ab70-9feac35557cc"),
            analysis_controller=AnalysisController())

        # Single run against the freshly created schema
        pipeline_controller.execute()

        # Expected counts presumably match the sample data read by
        # FileCrawler -- adjust them if that data set changes.
        profile_count = len(data_controller.api_data.get_profiles_slim())
        self.assertEqual(profile_count, 19)

        field_count = len(data_controller.api_data.get_fields())
        self.assertEqual(field_count, 14)
示例#2
0
    def test_update_cache_fields(self):
        """Check that update_cache_fields writes one cache_field row per field.

        Also checks that an empty mapping is accepted and that the call
        returns None in that case.
        """
        data_controller = DataController(SQLiteConfiguration(""))
        data_controller.run_schema()

        # An empty mapping must be handled without raising
        self.assertIsNone(data_controller.crawl_data.update_cache_fields({}))

        fields_by_unified_title = {
            "field1": CacheField(title="field 1", unified_title="field1"),
            "field2": CacheField(title="field 2", unified_title="field2"),
        }
        data_controller.crawl_data.update_cache_fields(fields_by_unified_title)

        # Both fields must have been stored
        count_row = data_controller.engine.execute(
            "SELECT COUNT(*) FROM cache_field").fetchone()
        self.assertEqual(count_row[0], 2)

        # Look up the row of the first field by its generated id
        field_id = generate_id('field1')
        query = ("SELECT id, title "
                 "FROM cache_field "
                 "WHERE id='%s' " % field_id)
        stored = data_controller.engine.execute(query).fetchone()

        self.assertEqual(stored["id"], field_id)
        self.assertEqual(stored['title'], "field 1")
示例#3
0
    def test_create_engine(self):
        """Exercise an engine from create_engine in three connection styles.

        Covers an explicitly managed connection, ``engine.connect()`` as a
        context manager and ``engine.begin()`` as a transactional context
        manager. Each SELECT is fetched completely and its row count is
        asserted, so an empty result cannot pass unnoticed.
        """
        sqlite_in_memory = SQLiteConfiguration("")

        engine = create_engine(sqlite_in_memory)
        self.assertIsNotNone(engine)

        # Option 1: Explicit connection
        conn = engine.connect()
        try:
            conn.execute("CREATE TABLE x (a INTEGER, b INTEGER)")
            conn.execute("INSERT INTO x (a, b) VALUES (1, 1)")
            conn.execute("INSERT INTO x (a, b) VALUES (2, 2)")
            result = conn.execute("SELECT x.a, x.b FROM x")
        finally:
            # Release the connection even if one of the statements raised
            conn.close()
        self.assertEqual(result.keys(), ["a", "b"])

        # Option 2: connect with statement
        with engine.connect() as conn:
            conn.execute("DELETE FROM x")
            conn.execute("INSERT INTO x (a, b) VALUES (42, 43)")
            rows = conn.execute("SELECT x.a, x.b FROM x").fetchall()
            # Exactly the row inserted above must come back
            self.assertEqual(len(rows), 1)
            self.assertEqual(rows[0]['a'], 42)
            self.assertEqual(rows[0]['b'], 43)

        # Option 3: transaction with begin
        with engine.begin() as conn:
            conn.execute("DELETE FROM x")
            conn.execute("INSERT INTO x (a, b) VALUES (42, 43)")
            rows = conn.execute("SELECT x.a, x.b FROM x").fetchall()
            self.assertEqual(len(rows), 1)
            self.assertEqual(rows[0]['a'], 42)
            self.assertEqual(rows[0]['b'], 43)
示例#4
0
    def test_get_profiles_slim(self):
        """Smoke test: get_profiles_slim must not raise on an empty schema."""
        controller = DataController(SQLiteConfiguration(""))
        controller.run_schema()

        # Nothing has been stored; the call only has to complete
        controller.api_data.get_profiles_slim()
示例#5
0
    def test_run_schema(self):
        """Check that run_schema creates every expected table.

        Each table must be reported as missing before run_schema is called
        and as existing afterwards.
        """
        expected_tables = (
            'profile',
            'document',
            'cache_profile',
            'cache_document',
            'cache_field',
            'cache_document_has_cache_field',
            'cache_profile_has_cache_document',
            'update_log',
        )
        ctrl = DataController(SQLiteConfiguration(""))

        # Before schema creation none of the tables may exist
        for table_name in expected_tables:
            self.assertFalse(ctrl.table_exists(table_name))

        ctrl.run_schema()

        # Afterwards every one of them has to be present
        for table_name in expected_tables:
            self.assertTrue(ctrl.table_exists(table_name))
示例#6
0
    def test_link_fields_to_documents(self):
        """Check that link_fields_to_documents accepts an empty mapping.

        The call must not raise and is expected to return None.
        """
        controller = DataController(SQLiteConfiguration(""))
        controller.run_schema()

        outcome = controller.crawl_data.link_fields_to_documents({})
        self.assertIsNone(outcome)
示例#7
0
    def test_update_profiles(self):
        """Check that update_profiles inserts profiles and updates them in place.

        Two profiles are stored and verified; then the first one is stored
        again with a changed last name, and the test checks that the row
        count stays at two while the row reflects the new name.

        The row queries use ORDER BY because SQL leaves the order of an
        unordered SELECT unspecified, and the assertions index rows by
        position.
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        def count_profiles():
            # Number of rows currently stored in the profile table
            return data_controller.engine.execute(
                "SELECT COUNT(*) FROM profile").fetchone()[0]

        def fetch_profiles():
            # All profile rows in a deterministic order (id1 before id2)
            return data_controller.engine.execute(
                "SELECT id, mendeley_id, cache_profile_id, first_name, last_name, display_name "
                "FROM profile "
                "ORDER BY mendeley_id").fetchall()

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_profiles([])
        self.assertIsNone(r)

        profile1 = Profile("id1", "Hans", "Mustermann", "", "")
        profile2 = Profile("id2", "Max", "Mustermann", "", "")
        data_controller.crawl_data.update_profiles([profile1, profile2])

        # Check data count in the table
        self.assertEqual(count_profiles(), 2)

        # Then query rows
        rows = fetch_profiles()

        # Check first row
        self.assertEqual(rows[0]["mendeley_id"], "id1")
        self.assertEqual(rows[0]["cache_profile_id"],
                         generate_id("hansmustermann"))
        self.assertEqual(rows[0]['first_name'], "Hans")
        self.assertEqual(rows[0]['last_name'], "Mustermann")

        # Check second row
        self.assertEqual(rows[1]["mendeley_id"], "id2")
        self.assertEqual(rows[1]["cache_profile_id"],
                         generate_id("maxmustermann"))
        self.assertEqual(rows[1]['first_name'], "Max")
        self.assertEqual(rows[1]['last_name'], "Mustermann")

        # Store the same Mendeley ids again, the first with a new last name
        profile1 = Profile("id1", "Hans", "Supermann", "", "")
        profile2 = Profile("id2", "Max", "Mustermann", "", "")
        data_controller.crawl_data.update_profiles([profile1, profile2])

        # The update must not have created additional rows
        self.assertEqual(count_profiles(), 2)

        # Then query rows
        rows = fetch_profiles()

        # Check first row
        self.assertEqual(rows[0]["mendeley_id"], "id1")
        self.assertEqual(rows[0]["cache_profile_id"],
                         generate_id("hanssupermann"))
        self.assertEqual(rows[0]['first_name'], "Hans")
        self.assertEqual(rows[0]['last_name'], "Supermann")
示例#8
0
    def test_errors(self):
        """Check that executing invalid SQL raises a DBAPIError."""
        sqlite_in_memory = SQLiteConfiguration("")
        ctrl = DataController(sqlite_in_memory)

        # Try completely dumb sql; assertRaises fails the test by itself
        # when no DBAPIError is raised inside the block.
        with self.assertRaises(DBAPIError):
            with ctrl.engine.connect() as conn:
                conn.execute("COMPLETELY WRONG COMMAND")
示例#9
0
    def test_get_documents_by_profile_ids_and_field_ids(self):
        """Smoke test the document lookup with empty and non-matching id lists.

        The schema holds no data, so the calls only have to return without
        raising for every combination of empty and non-empty arguments.
        """
        controller = DataController(SQLiteConfiguration(""))
        controller.run_schema()

        # (profile ids, field ids) in the order they are tried
        argument_pairs = (
            ([], []),
            ([42, 43], []),
            ([], [42, 43]),
            ([42, 43], [44, 45]),
        )
        for profile_ids, field_ids in argument_pairs:
            controller.api_data.get_documents_by_profile_ids_and_field_ids(
                profile_ids, field_ids)
示例#10
0
 def test_assert_drop(self):
     """Check the interplay of assert_schema, drop_all and is_initialized.

     is_initialized() must report True after assert_schema(), False after
     drop_all() and False after a single table was dropped; assert_schema()
     must bring it back to True each time.
     """
     controller = DataController(SQLiteConfiguration(""))

     def ensure_schema_and_check():
         # assert_schema has to leave the controller initialized
         controller.assert_schema()
         self.assertTrue(controller.is_initialized())

     ensure_schema_and_check()

     # Dropping everything must be detected ...
     controller.drop_all()
     self.assertFalse(controller.is_initialized())
     ensure_schema_and_check()

     # ... and so must the loss of just one table
     with controller.engine.begin() as conn:
         conn.execute("DROP TABLE cache_document")
     self.assertFalse(controller.is_initialized())
     ensure_schema_and_check()
示例#11
0
    def test_update_cache_profiles(self):
        """Check that update_cache_profiles stores one row per unified name.

        Two of the three profiles share a unified name, so two cache_profile
        rows are expected. After update_profiles has run, the cache profile
        must be joinable to a profile row through profile_id.
        """
        data_controller = DataController(SQLiteConfiguration(""))
        data_controller.run_schema()

        # An empty mapping must be handled without raising
        self.assertIsNone(data_controller.crawl_data.update_cache_profiles({}))

        profile1 = Profile("id1", "Hans", "Mustermann", "Longer Real Name", "")
        profile2 = Profile("id2", "Max", "Mustermann", "", "")
        profile3 = Profile("id3", "Hans", "Mustermann", "", "")

        profiles_by_unified_name = {
            "hansmustermann": [profile1, profile3],
            "maxmustermann": [profile2],
        }
        data_controller.crawl_data.update_cache_profiles(
            profiles_by_unified_name)

        # One cache profile per unified name
        count_row = data_controller.engine.execute(
            "SELECT COUNT(*) FROM cache_profile").fetchone()
        self.assertEqual(count_row[0], 2)

        # Inspect the cache profile shared by profile1 and profile3
        cache_profile_id = generate_id('hansmustermann')
        lookup_query = ("SELECT id, profile_id, name "
                        "FROM cache_profile "
                        "WHERE id='%s'" % cache_profile_id)
        cached = data_controller.engine.execute(lookup_query).fetchone()

        self.assertEqual(cached["id"], cache_profile_id)
        self.assertEqual(cached["name"], "Hans Mustermann")

        # Once the profiles exist, the link to the profile table must resolve
        data_controller.crawl_data.update_profiles(
            [profile1, profile2, profile3])
        join_query = ("SELECT cp.name as name "
                      "FROM cache_profile cp, profile p "
                      "WHERE cp.profile_id = p.id "
                      "AND cp.id='%s' " % cache_profile_id)
        linked = data_controller.engine.execute(join_query).fetchone()
        self.assertEqual(linked["name"], "Hans Mustermann")
示例#12
0
    def test_execute(self):
        """Check that the pipeline can be executed three times in a row.

        The first run works on the freshly created schema; the two repeated
        runs on the same database must not raise either.
        """
        data_controller = DataController(SQLiteConfiguration(""))
        data_controller.run_schema()

        pipeline_controller = PipelineController(
            data_controller=data_controller,
            crawl_controller=CrawlController(
                FileCrawler(), "d0b7f41f-ad37-3b47-ab70-9feac35557cc"),
            analysis_controller=AnalysisController())

        # Clean run followed by a second and a third run
        for _ in range(3):
            pipeline_controller.execute()
示例#13
0
    def test_update_cache_documents(self):
        """Check that update_cache_documents stores one row per unified title.

        Two of the three documents share a title, so two cache_document rows
        are expected. After update_documents has run, the cache document must
        be joinable to a document row through document_id.
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        def make_document(core_id, core_profile_id, core_title, core_abstract,
                          core_source, core_year):
            # Build a fixture document; only the arguments differ between
            # the documents used in this test.
            return Document(core_id=core_id,
                            core_profile_id=core_profile_id,
                            core_title=core_title,
                            core_type="conference_proceedings",
                            core_created=datetime.now(),
                            core_last_modified=datetime.now(),
                            core_abstract=core_abstract,
                            core_source=core_source,
                            core_year=core_year,
                            core_authors=[],
                            core_keywords=[],
                            doc_website="",
                            conf_website="",
                            conf_pages="",
                            conf_month=0,
                            conf_city="",
                            tags=[])

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_cache_documents(dict())
        self.assertIsNone(r)

        document1 = make_document("doc1", "id1", "sametitle1",
                                  "Older Abtract", "Older source", 2015)
        document2 = make_document("doc2", "id2", "title2",
                                  "blabla2", "ACM xyz", 2014)
        document3 = make_document("doc3", "id3", "sametitle1",
                                  "Newer abstract", "Newer source", 2015)

        # The key must be the same title that is looked up below
        unified_document_title_to_documents = {
            "sametitle1": [document1, document3],
            "title2": [document2],
        }

        # Trigger cache document update
        data_controller.crawl_data.update_cache_documents(
            unified_document_title_to_documents)

        # Check data count in the table
        cnt = data_controller.engine.execute(
            "SELECT COUNT(*) FROM cache_document").fetchone()
        self.assertEqual(cnt[0], 2)

        # Then query sametitle row
        cache_document_id = generate_id('sametitle1')
        row = data_controller.engine.execute(
            "SELECT id, document_id, title "
            "FROM cache_document "
            "WHERE id='%s'" % cache_document_id).fetchone()
        self.assertEqual(row["id"], cache_document_id)
        self.assertEqual(row["title"], "sametitle1")

        # Now update documents and check if the link is set correctly
        data_controller.crawl_data.update_documents(
            [document1, document2, document3])
        row = data_controller.engine.execute(
            "SELECT cd.title as title "
            "FROM cache_document cd, document d "
            "WHERE cd.document_id = d.id "
            "AND cd.id='%s' " % cache_document_id).fetchone()
        self.assertEqual(row["title"], "sametitle1")
示例#14
0
    def test_update_documents(self):
        """Check that update_documents inserts documents and updates them in place.

        Two documents are stored and verified column by column; then the
        first one is stored again with a new title, abstract and source, and
        the test checks that the row count stays at two while the row
        reflects the new values.

        The row queries use ORDER BY because SQL leaves the order of an
        unordered SELECT unspecified, and the assertions index rows by
        position.
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        def make_document(**overrides):
            # Build a fixture document from shared defaults plus overrides
            fields = dict(core_type="conference_proceedings",
                          core_created=datetime.now(),
                          core_last_modified=datetime.now(),
                          core_authors=[],
                          core_keywords=[],
                          doc_website="",
                          conf_website="",
                          conf_pages="",
                          conf_month=0,
                          conf_city="",
                          tags=[])
            fields.update(overrides)
            return Document(**fields)

        def count_documents():
            # Number of rows currently stored in the document table
            return data_controller.engine.execute(
                "SELECT COUNT(*) FROM document").fetchone()[0]

        def fetch_documents():
            # All document rows in a deterministic order (doc1 before doc2)
            return data_controller.engine.execute(
                "SELECT id, mendeley_id, cache_document_id, owner_mendeley_id,"
                " title, doc_type, created, last_modified, abstract, source, pub_year "
                "FROM document "
                "ORDER BY mendeley_id").fetchall()

        def assert_document_row(row, mendeley_id, owner_mendeley_id, title,
                                abstract, source, pub_year):
            # Compare the stored columns of one row with the expected values
            self.assertEqual(row["mendeley_id"], mendeley_id)
            self.assertEqual(row["cache_document_id"], generate_id(title))
            self.assertEqual(row["owner_mendeley_id"], owner_mendeley_id)
            self.assertEqual(row['title'], title)
            self.assertEqual(row["doc_type"], "conference_proceedings")
            self.assertEqual(row['abstract'], abstract)
            self.assertEqual(row["source"], source)
            self.assertEqual(row["pub_year"], pub_year)

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_documents([])
        self.assertIsNone(r)

        document1 = make_document(core_id="doc1",
                                  core_profile_id="id1",
                                  core_title="title1",
                                  core_abstract="blabla",
                                  core_source="ACM xy",
                                  core_year=2015,
                                  core_authors=[("Hans", "Mustermann"),
                                                ("Nicht", "Existent")],
                                  tags=["t ag- 1"])
        document2 = make_document(core_id="doc2",
                                  core_profile_id="id2",
                                  core_title="title2",
                                  core_abstract="blabla2",
                                  core_source="ACM xyz",
                                  core_year=2014)
        data_controller.crawl_data.update_documents([document1, document2])

        # Check data count in the table
        self.assertEqual(count_documents(), 2)

        # Then query rows and check both of them
        rows = fetch_documents()
        assert_document_row(rows[0], "doc1", "id1", "title1",
                            "blabla", "ACM xy", 2015)
        assert_document_row(rows[1], "doc2", "id2", "title2",
                            "blabla2", "ACM xyz", 2014)

        # Store the same Mendeley ids again, the first with changed content
        document1 = make_document(core_id="doc1",
                                  core_profile_id="id1",
                                  core_title="newtitle1",
                                  core_abstract="blablaNew",
                                  core_source="ACM xyz1",
                                  core_year=2015)
        document2 = make_document(core_id="doc2",
                                  core_profile_id="id2",
                                  core_title="title2",
                                  core_abstract="blabla2",
                                  core_source="ACM xyz",
                                  core_year=2014)

        data_controller.crawl_data.update_documents([document1, document2])

        # The update must not have created additional rows
        self.assertEqual(count_documents(), 2)

        # Then query rows and check the updated first row
        rows = fetch_documents()
        assert_document_row(rows[0], "doc1", "id1", "newtitle1",
                            "blablaNew", "ACM xyz1", 2015)
示例#15
0
def sample_pipeline(app_id=None, app_secret=None):
    """Run the pipeline once and print the contents of the resulting tables.

    A FileCrawler is used when both ``app_id`` and ``app_secret`` are None;
    otherwise an SDKCrawler is created from the two values.

    :param app_id: application id handed to SDKCrawler, or None
    :param app_secret: application secret handed to SDKCrawler, or None
    """
    from mendeleycache.crawler.sdk_crawler import SDKCrawler

    data_controller = DataController(SQLiteConfiguration(""))
    data_controller.run_schema()

    no_credentials = app_id is None and app_secret is None
    crawler = (FileCrawler() if no_credentials
               else SDKCrawler(app_id=app_id, app_secret=app_secret))

    pipeline_controller = PipelineController(
        data_controller=data_controller,
        crawl_controller=CrawlController(
            crawler, "d0b7f41f-ad37-3b47-ab70-9feac35557cc"),
        analysis_controller=AnalysisController())

    # Clean run shall not crash
    pipeline_controller.execute()

    # (heading, query) pairs in the order in which they are printed
    table_dumps = (
        ("Profiles:",
         "SELECT * FROM profile"),
        ("Cache profiles:",
         "SELECT * FROM cache_profile"),
        ("Documents:",
         "SELECT id, owner_mendeley_id, title, authors, tags FROM document"),
        ("Cache documents:",
         "SELECT * FROM cache_document"),
        ("Cache fields:",
         "SELECT * FROM cache_field"),
        ("LINK: Cache document -> Cache field:",
         "SELECT * FROM cache_document_has_cache_field"),
        ("LINK: Cache profile -> Cache document:",
         "SELECT * FROM cache_profile_has_cache_document"),
    )
    for heading, query in table_dumps:
        records = data_controller.engine.execute(query).fetchall()
        print()
        print(heading)
        for record in records:
            print(record)

    print()
示例#16
0
 def test_is_initialized(self):
     """Check that is_initialized() reports True once run_schema() has run."""
     controller = DataController(SQLiteConfiguration(""))
     controller.run_schema()

     initialized = controller.is_initialized()
     self.assertTrue(initialized)