Пример #1
0
    def test_update_profiles(self):
        """
        update_profiles shall accept empty input, insert new profiles and, when called
        again with the same mendeley ids, keep the row count and store the new values
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        def count_profiles():
            # Number of rows currently stored in the profile table
            return data_controller.engine.execute(
                "SELECT COUNT(*) FROM profile").fetchone()[0]

        def fetch_profiles():
            # Without ORDER BY the row order is unspecified, so the positional
            # checks below would depend on how the rows happen to be returned
            return data_controller.engine.execute(
                "SELECT id, mendeley_id, cache_profile_id, first_name, last_name, display_name "
                "FROM profile "
                "ORDER BY mendeley_id").fetchall()

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_profiles([])
        self.assertIsNone(r)

        profile1 = Profile("id1", "Hans", "Mustermann", "", "")
        profile2 = Profile("id2", "Max", "Mustermann", "", "")
        data_controller.crawl_data.update_profiles([profile1, profile2])

        # Check data count in the table
        self.assertEqual(count_profiles(), 2)

        # Then query rows
        rows = fetch_profiles()

        # Check first row
        self.assertEqual(rows[0]["mendeley_id"], "id1")
        self.assertEqual(rows[0]["cache_profile_id"],
                         generate_id("hansmustermann"))
        self.assertEqual(rows[0]['first_name'], "Hans")
        self.assertEqual(rows[0]['last_name'], "Mustermann")

        # Check second row
        self.assertEqual(rows[1]["mendeley_id"], "id2")
        self.assertEqual(rows[1]["cache_profile_id"],
                         generate_id("maxmustermann"))
        self.assertEqual(rows[1]['first_name'], "Max")
        self.assertEqual(rows[1]['last_name'], "Mustermann")

        # Second run: same mendeley ids, but the first profile got a new last name
        profile1 = Profile("id1", "Hans", "Supermann", "", "")
        profile2 = Profile("id2", "Max", "Mustermann", "", "")
        data_controller.crawl_data.update_profiles([profile1, profile2])

        # The row count must not have grown
        self.assertEqual(count_profiles(), 2)

        # Then query rows
        rows = fetch_profiles()

        # Check first row
        self.assertEqual(rows[0]["mendeley_id"], "id1")
        self.assertEqual(rows[0]["cache_profile_id"],
                         generate_id("hanssupermann"))
        self.assertEqual(rows[0]['first_name'], "Hans")
        self.assertEqual(rows[0]['last_name'], "Supermann")
Пример #2
0
    def test_update_profiles(self):
        """
        update_profiles shall accept empty input, insert new profiles and, when called
        again with the same mendeley ids, keep the row count and store the new values
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        def check_profile_table(expected_rows):
            # expected_rows: (mendeley_id, unified_name, first_name, last_name) tuples,
            # listed in mendeley_id order; only the leading rows given are checked
            cnt = data_controller.engine.execute("SELECT COUNT(*) FROM profile").fetchone()
            self.assertEqual(cnt[0], 2)

            # ORDER BY makes the positional comparison deterministic; a plain
            # SELECT gives no guarantee about the order of the returned rows
            rows = data_controller.engine.execute(
                "SELECT id, mendeley_id, cache_profile_id, first_name, last_name, display_name "
                "FROM profile "
                "ORDER BY mendeley_id"
            ).fetchall()

            for index, (mendeley_id, unified_name, first_name, last_name) in enumerate(expected_rows):
                row = rows[index]
                self.assertEqual(row["mendeley_id"], mendeley_id)
                self.assertEqual(row["cache_profile_id"], generate_id(unified_name))
                self.assertEqual(row['first_name'], first_name)
                self.assertEqual(row['last_name'], last_name)

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_profiles([])
        self.assertIsNone(r)

        # First run inserts two fresh profiles
        profile1 = Profile("id1", "Hans", "Mustermann", "", "")
        profile2 = Profile("id2", "Max", "Mustermann", "", "")
        data_controller.crawl_data.update_profiles([profile1, profile2])
        check_profile_table([
            ("id1", "hansmustermann", "Hans", "Mustermann"),
            ("id2", "maxmustermann", "Max", "Mustermann"),
        ])

        # Second run reuses the mendeley ids, the first profile has a new last name
        profile1 = Profile("id1", "Hans", "Supermann", "", "")
        profile2 = Profile("id2", "Max", "Mustermann", "", "")
        data_controller.crawl_data.update_profiles([profile1, profile2])
        check_profile_table([
            ("id1", "hanssupermann", "Hans", "Supermann"),
        ])
Пример #3
0
    def update_cache_documents(self, unified_document_title_to_documents: dict):
        """
        Given a unified_document_title to documents map, picks one reference document per
        entry and executes the first statement held in self._update_cache_documents with the
        generated id and the sanitized title of that document
        :param unified_document_title_to_documents: map whose values are lists of documents;
            the keys are not used, the unified title is recomputed from the reference document
        :return: None
        """

        sql = self._update_cache_documents[0]

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            log.debug("Updating cache documents")
            for doc_list in unified_document_title_to_documents.values():
                # flatten the document list down to the most recently modified document
                # (the first one wins on ties); None for an empty list
                reference_doc = max(
                    doc_list,
                    key=lambda doc: doc.core_last_modified,
                    default=None)
                if reference_doc is None:
                    continue

                u, r = unify_document_title(reference_doc.core_title)
                conn.execute(sql, (generate_id(u), sanitize_text(r)))

        log.info("Cache documents have been updated")
Пример #4
0
    def test_update_cache_fields(self):
        """
        update_cache_fields shall accept an empty map and store one cache_field row per
        entry of a non-empty map, keyed by the generated id of the unified title
        """
        data_controller = DataController(SQLiteConfiguration(""))
        data_controller.run_schema()

        # An empty map must be accepted and must not produce a result
        self.assertIsNone(data_controller.crawl_data.update_cache_fields({}))

        unified_field_title_to_field = {
            "field1": CacheField(title="field 1", unified_title="field1"),
            "field2": CacheField(title="field 2", unified_title="field2"),
        }
        data_controller.crawl_data.update_cache_fields(unified_field_title_to_field)

        # Both fields must have ended up in the table
        count_row = data_controller.engine.execute("SELECT COUNT(*) FROM cache_field").fetchone()
        self.assertEqual(count_row[0], 2)

        # Look the first field up by its generated id
        field_id = generate_id('field1')
        query = (
            "SELECT id, title "
            "FROM cache_field "
            "WHERE id='%s' "
        ) % field_id
        row = data_controller.engine.execute(query).fetchone()

        # The stored row carries the generated id and the original title
        self.assertEqual(row["id"], field_id)
        self.assertEqual(row['title'], "field 1")
Пример #5
0
    def test_update_cache_fields(self):
        """
        Verifies update_cache_fields: no crash on an empty map, two rows in cache_field
        after passing two fields, and the first field retrievable by its generated id
        """
        configuration = SQLiteConfiguration("")
        data_controller = DataController(configuration)
        data_controller.run_schema()

        # Empty input: the call has to return None instead of raising
        empty_result = data_controller.crawl_data.update_cache_fields({})
        self.assertIsNone(empty_result)

        # Two fields, registered under their unified titles
        first_field = CacheField(title="field 1", unified_title="field1")
        second_field = CacheField(title="field 2", unified_title="field2")
        fields_by_unified_title = {"field1": first_field, "field2": second_field}

        data_controller.crawl_data.update_cache_fields(fields_by_unified_title)

        # Row count of the table
        field_count = data_controller.engine.execute(
            "SELECT COUNT(*) FROM cache_field").fetchone()[0]
        self.assertEqual(field_count, 2)

        # Row of the first field
        expected_id = generate_id('field1')
        select_field = ("SELECT id, title "
                        "FROM cache_field "
                        "WHERE id='%s' ") % expected_id
        stored = data_controller.engine.execute(select_field).fetchone()

        self.assertEqual(stored["id"], expected_id)
        self.assertEqual(stored['title'], "field 1")
Пример #6
0
    def update_cache_profiles(self, unified_name_to_profiles: dict):
        """
        Given a unified_profile_name to profiles map, picks one reference profile per entry
        and executes the first statement held in self._update_cache_profiles with the
        generated id and the sanitized real name of that profile
        :param unified_name_to_profiles: map whose values are lists of profiles; the keys
            are not used, the unified name is recomputed from the reference profile
        :return: None
        """

        sql = self._update_cache_profiles[0]

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            log.debug("Updating cache profiles")
            for profile_list in unified_name_to_profiles.values():
                # flatten the profile list down to the profile with the longest
                # display name (the first one wins on ties)
                reference_profile = None
                for profile in profile_list:
                    if (reference_profile is None
                            or len(profile.display_name) > len(reference_profile.display_name)):
                        reference_profile = profile

                # an empty profile list leaves nothing to insert
                if reference_profile is None:
                    continue

                u, r = unify_profile_name(reference_profile.first_name, reference_profile.last_name)
                b64u = generate_id(u)
                # sanitized once and shared by the log line and the statement
                name = sanitize_text(r)
                log.info("inserting %s, %s" % (b64u, name))
                conn.execute(sql, (b64u, name))

        log.info("Cache profiles have been updated")
Пример #7
0
    def link_fields_to_documents(self, unified_field_title_to_documents: {}):
        """
        Rebuilds the field -> document links: executes the delete statement once and then
        the insert statement for every (document, field) pair contained in the given map
        :param unified_field_title_to_documents: unified field title -> unified document titles
        :return:
        """

        # First statement removes the old links, second one inserts a single link
        delete_statement = self._link_fields_to_documents[0]
        insert_statement = self._link_fields_to_documents[1]

        # Both steps share one transaction
        with self._engine.begin() as conn:
            log.debug("Deleting previous field -> document links")
            conn.execute(delete_statement)

            log.debug("Inserting new field -> document links")
            # Lazily yields (document id, field id) pairs in map order
            link_rows = (
                (generate_id(doc_unified), generate_id(unified_field_title))
                for unified_field_title, doc_list in unified_field_title_to_documents.items()
                for doc_unified in doc_list
            )
            for link_row in link_rows:
                conn.execute(insert_statement, link_row)

        log.info("Field -> document links have been updated")
Пример #8
0
    def link_profiles_to_documents(
        self,
        unified_name_to_profiles: dict,
        unified_name_to_authored_documents: dict,
        unified_name_to_participated_documents: dict,
    ):
        """
        Given a unified_profile_name to authored_documents and participated_documents map(s),
        deletes the previous profile -> document links and inserts one link per
        (profile, document) pair, authored documents first, participated documents second
        :param unified_name_to_profiles: only used for membership checks; names that are not
            a key of this map are skipped
        :param unified_name_to_authored_documents: unified name -> unified document titles
        :param unified_name_to_participated_documents: unified name -> unified document titles
        :return: None
        """

        # Get the different statements in the sql file
        delete = self._link_profiles_to_documents[0]
        insert = self._link_profiles_to_documents[1]

        # Fire the sql scripts in a transaction
        with self._engine.begin() as conn:
            log.debug("Deleting previous profile -> document links")
            conn.execute(delete)

            log.debug("Inserting new profile -> document links")

            # Both maps are linked with the same insert statement
            for unified_name_to_documents in (
                unified_name_to_authored_documents,
                unified_name_to_participated_documents,
            ):
                for unified_name, doc_list in unified_name_to_documents.items():
                    # TODO: if author unknown, ignore for now (Foreign key constraints broken otherwise)
                    if unified_name not in unified_name_to_profiles:
                        continue

                    # The profile id is the same for every document of this name
                    profile_id = generate_id(unified_name)
                    for doc_unified in doc_list:
                        conn.execute(insert, (profile_id, generate_id(doc_unified)))

        log.info("Profile -> document links have been updated")
Пример #9
0
 def insert_profile(conn: Connection, insert: str, p: Profile):
     """
     Executes the given insert statement for one profile
     :param conn: connection the statement is executed on
     :param insert: the insert statement
     :param p: profile providing identifier, names and link
     :return:
     """
     # The cache profile id is derived from the unified first/last name
     unified_name, _ = unify_profile_name(p.first_name, p.last_name)
     cache_profile_id = generate_id(unified_name)

     # Every text column is sanitized before it is handed to the statement
     identifier, first_name, last_name, display_name, link = map(
         sanitize_text,
         (p.identifier, p.first_name, p.last_name, p.display_name, p.link),
     )

     row = (identifier, cache_profile_id, first_name, last_name, display_name, link)
     conn.execute(insert, row)
Пример #10
0
    def update_cache_fields(self, unified_field_title_to_field: {}):
        """
        Executes the first statement held in self._update_cache_fields once per field of the
        given map, passing the generated id of the unified title and the sanitized title
        :param unified_field_title_to_field: unified field title -> field
        :return:
        """

        statement = self._update_cache_fields[0]

        # All fields are written within a single transaction
        with self._engine.begin() as conn:
            log.debug("Updating cache fields")
            for field in unified_field_title_to_field.values():
                field_id = generate_id(field.unified_title)
                title = sanitize_text(field.title)
                conn.execute(statement, (field_id, title))

        log.info("Cache fields have been updated")
Пример #11
0
    def test_update_cache_profiles(self):
        """
        update_cache_profiles shall accept an empty map, store one cache_profile row per
        unified name and keep that row reachable through the profile join once the
        profiles themselves have been stored
        """
        data_controller = DataController(SQLiteConfiguration(""))
        data_controller.run_schema()

        # An empty map must be accepted and must not produce a result
        self.assertIsNone(data_controller.crawl_data.update_cache_profiles({}))

        profile1 = Profile("id1", "Hans", "Mustermann", "Longer Real Name", "")
        profile2 = Profile("id2", "Max", "Mustermann", "", "")
        profile3 = Profile("id3", "Hans", "Mustermann", "", "")

        # Two profiles share the unified name "hansmustermann"
        unified_name_to_profiles = {
            "hansmustermann": [profile1, profile3],
            "maxmustermann": [profile2],
        }
        data_controller.crawl_data.update_cache_profiles(
            unified_name_to_profiles)

        # One cache profile per unified name
        count_row = data_controller.engine.execute(
            "SELECT COUNT(*) FROM cache_profile").fetchone()
        self.assertEqual(count_row[0], 2)

        # Look up the cache profile shared by profile1 and profile3
        cache_profile_id = generate_id('hansmustermann')
        cache_profile_query = ("SELECT id, profile_id, name "
                               "FROM cache_profile "
                               "WHERE id='%s'") % cache_profile_id
        row = data_controller.engine.execute(cache_profile_query).fetchone()

        self.assertEqual(row["id"], cache_profile_id)
        self.assertEqual(row["name"], "Hans Mustermann")

        # Now update profiles and check if the link is set correctly
        data_controller.crawl_data.update_profiles(
            [profile1, profile2, profile3])
        linked_query = ("SELECT cp.name as name "
                        "FROM cache_profile cp, profile p "
                        "WHERE cp.profile_id = p.id "
                        "AND cp.id='%s' ") % cache_profile_id
        row = data_controller.engine.execute(linked_query).fetchone()
        self.assertEqual(row["name"], "Hans Mustermann")
Пример #12
0
    def test_update_cache_profiles(self):
        """
        Verifies update_cache_profiles: no crash on an empty map, one cache_profile row per
        unified name, and a cache profile that joins to a profile row after update_profiles
        """
        configuration = SQLiteConfiguration("")
        data_controller = DataController(configuration)
        data_controller.run_schema()

        # Empty input: the call has to return None instead of raising
        empty_result = data_controller.crawl_data.update_cache_profiles({})
        self.assertIsNone(empty_result)

        hans_long = Profile("id1", "Hans", "Mustermann", "Longer Real Name", "")
        max_profile = Profile("id2", "Max", "Mustermann", "", "")
        hans_short = Profile("id3", "Hans", "Mustermann", "", "")

        profiles_by_unified_name = {}
        profiles_by_unified_name["hansmustermann"] = [hans_long, hans_short]
        profiles_by_unified_name["maxmustermann"] = [max_profile]

        data_controller.crawl_data.update_cache_profiles(profiles_by_unified_name)

        # Row count of the table
        cache_profile_count = data_controller.engine.execute(
            "SELECT COUNT(*) FROM cache_profile"
        ).fetchone()[0]
        self.assertEqual(cache_profile_count, 2)

        # Row of the cache profile both "Hans" profiles map to
        expected_id = generate_id('hansmustermann')
        select_cache_profile = (
            "SELECT id, profile_id, name "
            "FROM cache_profile "
            "WHERE id='%s'"
        ) % expected_id
        stored = data_controller.engine.execute(select_cache_profile).fetchone()

        self.assertEqual(stored["id"], expected_id)
        self.assertEqual(stored["name"], "Hans Mustermann")

        # Store the profiles as well, then follow the profile_id link
        data_controller.crawl_data.update_profiles([hans_long, max_profile, hans_short])
        select_linked = (
            "SELECT cp.name as name "
            "FROM cache_profile cp, profile p "
            "WHERE cp.profile_id = p.id "
            "AND cp.id='%s' "
        ) % expected_id
        linked = data_controller.engine.execute(select_linked).fetchone()
        self.assertEqual(linked["name"], "Hans Mustermann")
Пример #13
0
        def insert_doc(conn: Connection, insert: str, doc: Document):
            """
            Executes the given insert statement for one document
            :param conn: connection the statement is executed on
            :param insert: the insert statement
            :param doc: document providing the column values
            :return:
            """
            # The cache document id is derived from the unified title
            unified_title, _ = unify_document_title(doc.core_title)
            cache_document_id = generate_id(unified_title)

            # Authors are (first, last) pairs; all list-like fields become comma separated strings
            authors_string = ", ".join(
                "{first} {last}".format(first=author[0], last=author[1])
                for author in doc.core_authors
            )
            keywords_string = ", ".join(doc.core_keywords)
            tags_string = ", ".join(doc.tags)

            bibtex = generate_bibtex(doc)

            # Values in the order they are handed to the insert statement
            row = (
                sanitize_text(doc.core_id),
                cache_document_id,
                sanitize_text(doc.core_profile_id),
                sanitize_text(doc.core_title),
                sanitize_text(doc.core_type),
                datetime_to_sqltime(doc.core_created),
                datetime_to_sqltime(doc.core_last_modified),
                sanitize_text(doc.core_abstract),
                sanitize_text(doc.core_source),
                doc.core_year,
                sanitize_text(authors_string),
                sanitize_text(keywords_string),
                sanitize_text(tags_string),
                sanitize_text(doc.doc_website),
                sanitize_text(doc.conf_website),
                doc.conf_month,
                sanitize_text(doc.conf_pages),
                sanitize_text(doc.conf_city),
                sanitize_text(bibtex),
            )
            conn.execute(insert, row)
Пример #14
0
    def test_update_documents(self):
        """
        update_documents shall accept empty input, insert new documents and, when called
        again with the same mendeley ids, keep the row count and store the new values
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        def make_document(core_id, core_profile_id, core_title, core_abstract,
                          core_source, core_year, core_authors=(), tags=()):
            # The documents of this test only differ in the values passed in here
            return Document(core_id=core_id,
                            core_profile_id=core_profile_id,
                            core_title=core_title,
                            core_type="conference_proceedings",
                            core_created=datetime.now(),
                            core_last_modified=datetime.now(),
                            core_abstract=core_abstract,
                            core_source=core_source,
                            core_year=core_year,
                            core_authors=list(core_authors),
                            core_keywords=[],
                            doc_website="",
                            conf_website="",
                            conf_pages="",
                            conf_month=0,
                            conf_city="",
                            tags=list(tags))

        def count_documents():
            # Number of rows currently stored in the document table
            return data_controller.engine.execute(
                "SELECT COUNT(*) FROM document").fetchone()[0]

        def fetch_documents():
            # Without ORDER BY the row order is unspecified, so the positional
            # checks below would depend on how the rows happen to be returned
            return data_controller.engine.execute(
                "SELECT id, mendeley_id, cache_document_id, owner_mendeley_id,"
                " title, doc_type, created, last_modified, abstract, source, pub_year "
                "FROM document "
                "ORDER BY mendeley_id").fetchall()

        def assert_document_row(row, mendeley_id, unified_title, owner_mendeley_id,
                                title, abstract, source, pub_year):
            self.assertEqual(row["mendeley_id"], mendeley_id)
            self.assertEqual(row["cache_document_id"], generate_id(unified_title))
            self.assertEqual(row["owner_mendeley_id"], owner_mendeley_id)
            self.assertEqual(row['title'], title)
            self.assertEqual(row["doc_type"], "conference_proceedings")
            self.assertEqual(row['abstract'], abstract)
            self.assertEqual(row["source"], source)
            self.assertEqual(row["pub_year"], pub_year)

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_documents([])
        self.assertIsNone(r)

        document1 = make_document("doc1", "id1", "title1", "blabla", "ACM xy", 2015,
                                  core_authors=[("Hans", "Mustermann"),
                                                ("Nicht", "Existent")],
                                  tags=["t ag- 1"])
        document2 = make_document("doc2", "id2", "title2", "blabla2", "ACM xyz", 2014)
        data_controller.crawl_data.update_documents([document1, document2])

        # Check data count in the table
        self.assertEqual(count_documents(), 2)

        # Then query rows and check both of them
        rows = fetch_documents()
        assert_document_row(rows[0], "doc1", "title1", "id1",
                            "title1", "blabla", "ACM xy", 2015)
        assert_document_row(rows[1], "doc2", "title2", "id2",
                            "title2", "blabla2", "ACM xyz", 2014)

        # Second run: same mendeley ids, but the first document got new contents
        document1 = make_document("doc1", "id1", "newtitle1", "blablaNew", "ACM xyz1", 2015)
        document2 = make_document("doc2", "id2", "title2", "blabla2", "ACM xyz", 2014)

        data_controller.crawl_data.update_documents([document1, document2])

        # The row count must not have grown
        self.assertEqual(count_documents(), 2)

        # Then query rows and check the first one
        rows = fetch_documents()
        assert_document_row(rows[0], "doc1", "newtitle1", "id1",
                            "newtitle1", "blablaNew", "ACM xyz1", 2015)
Пример #15
0
    def test_update_documents(self):
        """
        update_documents shall accept empty input, insert new documents and, when called
        again with the same mendeley ids, keep the row count and store the new values
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        def build_document(**specific):
            # Fields shared by every document of this test; built per call so that
            # no list object is shared between two documents
            fields = dict(
                core_type="conference_proceedings",
                core_created=datetime.now(),
                core_last_modified=datetime.now(),
                core_authors=[],
                core_keywords=[],
                doc_website="",
                conf_website="",
                conf_pages="",
                conf_month=0,
                conf_city="",
                tags=[],
            )
            fields.update(specific)
            return Document(**fields)

        def check_document_table(expected_rows):
            # expected_rows: column -> value dicts, listed in mendeley_id order;
            # only the leading rows given are checked
            cnt = data_controller.engine.execute("SELECT COUNT(*) FROM document").fetchone()
            self.assertEqual(cnt[0], 2)

            # ORDER BY makes the positional comparison deterministic; a plain
            # SELECT gives no guarantee about the order of the returned rows
            rows = data_controller.engine.execute(
                "SELECT id, mendeley_id, cache_document_id, owner_mendeley_id,"
                " title, doc_type, created, last_modified, abstract, source, pub_year "
                "FROM document "
                "ORDER BY mendeley_id"
            ).fetchall()

            for index, expected in enumerate(expected_rows):
                for column, value in expected.items():
                    self.assertEqual(rows[index][column], value)

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_documents([])
        self.assertIsNone(r)

        # First run inserts two fresh documents
        document1 = build_document(
            core_id="doc1",
            core_profile_id="id1",
            core_title="title1",
            core_abstract="blabla",
            core_source="ACM xy",
            core_year=2015,
            core_authors=[("Hans", "Mustermann"), ("Nicht", "Existent")],
            tags=["t ag- 1"],
        )
        document2 = build_document(
            core_id="doc2",
            core_profile_id="id2",
            core_title="title2",
            core_abstract="blabla2",
            core_source="ACM xyz",
            core_year=2014,
        )
        data_controller.crawl_data.update_documents([document1, document2])

        expected_document2 = {
            "mendeley_id": "doc2",
            "cache_document_id": generate_id("title2"),
            "owner_mendeley_id": "id2",
            "title": "title2",
            "doc_type": "conference_proceedings",
            "abstract": "blabla2",
            "source": "ACM xyz",
            "pub_year": 2014,
        }
        check_document_table([
            {
                "mendeley_id": "doc1",
                "cache_document_id": generate_id("title1"),
                "owner_mendeley_id": "id1",
                "title": "title1",
                "doc_type": "conference_proceedings",
                "abstract": "blabla",
                "source": "ACM xy",
                "pub_year": 2015,
            },
            expected_document2,
        ])

        # Second run reuses the mendeley ids, the first document has new contents
        document1 = build_document(
            core_id="doc1",
            core_profile_id="id1",
            core_title="newtitle1",
            core_abstract="blablaNew",
            core_source="ACM xyz1",
            core_year=2015,
        )
        document2 = build_document(
            core_id="doc2",
            core_profile_id="id2",
            core_title="title2",
            core_abstract="blabla2",
            core_source="ACM xyz",
            core_year=2014,
        )

        data_controller.crawl_data.update_documents([document1, document2])

        check_document_table([
            {
                "mendeley_id": "doc1",
                "cache_document_id": generate_id("newtitle1"),
                "owner_mendeley_id": "id1",
                "title": "newtitle1",
                "doc_type": "conference_proceedings",
                "abstract": "blablaNew",
                "source": "ACM xyz1",
                "pub_year": 2015,
            },
        ])
Пример #16
0
    def test_update_cache_documents(self):
        """Verify cache document creation and the link to the document table.

        Three documents are grouped under two titles (two of them share the
        title "sametitle1"). After ``update_cache_documents`` the
        ``cache_document`` table must hold exactly two rows, and the row for
        the shared title must be stored under ``generate_id('sametitle1')``.
        After ``update_documents`` that cache row must be joinable with the
        ``document`` table through ``cache_document.document_id``.
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_cache_documents(dict())
        self.assertIsNone(r)

        # document1 and document3 deliberately share the same title
        document1 = Document(core_id="doc1",
                             core_profile_id="id1",
                             core_title="sametitle1",
                             core_type="conference_proceedings",
                             core_created=datetime.now(),
                             core_last_modified=datetime.now(),
                             core_abstract="Older Abtract",
                             core_source="Older source",
                             core_year=2015,
                             core_authors=[],
                             core_keywords=[],
                             doc_website="",
                             conf_website="",
                             conf_pages="",
                             conf_month=0,
                             conf_city="",
                             tags=[])
        document2 = Document(core_id="doc2",
                             core_profile_id="id2",
                             core_title="title2",
                             core_type="conference_proceedings",
                             core_created=datetime.now(),
                             core_last_modified=datetime.now(),
                             core_abstract="blabla2",
                             core_source="ACM xyz",
                             core_year=2014,
                             core_authors=[],
                             core_keywords=[],
                             doc_website="",
                             conf_website="",
                             conf_pages="",
                             conf_month=0,
                             conf_city="",
                             tags=[])
        document3 = Document(core_id="doc3",
                             core_profile_id="id3",
                             core_title="sametitle1",
                             core_type="conference_proceedings",
                             core_created=datetime.now(),
                             core_last_modified=datetime.now(),
                             core_abstract="Newer abstract",
                             core_source="Newer source",
                             core_year=2015,
                             core_authors=[],
                             core_keywords=[],
                             doc_website="",
                             conf_website="",
                             conf_pages="",
                             conf_month=0,
                             conf_city="",
                             tags=[])

        # Each key equals the title of the documents grouped under it
        unified_document_title_to_documents = {
            "sametitle1": [document1, document3],
            "title2": [document2],
        }

        # Trigger cache document update
        data_controller.crawl_data.update_cache_documents(
            unified_document_title_to_documents)

        # Check data count in the table
        cnt = data_controller.engine.execute(
            "SELECT COUNT(*) FROM cache_document").fetchone()
        self.assertEqual(cnt[0], 2)

        # Then query sametitle row
        cache_document_id = generate_id('sametitle1')
        row = data_controller.engine.execute(
            "SELECT id, document_id, title "
            "FROM cache_document "
            "WHERE id='%s'" % cache_document_id).fetchone()
        # Fail with a readable message instead of a TypeError on None
        self.assertIsNotNone(row, "no cache_document row for 'sametitle1'")
        self.assertEqual(row["id"], cache_document_id)
        self.assertEqual(row["title"], "sametitle1")

        # Now update documents and check if the link is set correctly
        data_controller.crawl_data.update_documents(
            [document1, document2, document3])
        row = data_controller.engine.execute(
            "SELECT cd.title as title "
            "FROM cache_document cd, document d "
            "WHERE cd.document_id = d.id "
            "AND cd.id='%s' " % cache_document_id).fetchone()
        self.assertIsNotNone(
            row, "cache_document row is not linked to a document row")
        self.assertEqual(row["title"], "sametitle1")
Пример #17
0
    def test_update_cache_documents(self):
        """Verify cache document creation and the link to the document table.

        Builds three documents, two of which share the title "sametitle1",
        and passes them grouped by title to ``update_cache_documents``. The
        test then expects:

        * exactly two rows in ``cache_document``;
        * the shared-title row to be stored under
          ``generate_id('sametitle1')`` with title "sametitle1";
        * after ``update_documents``, that row to join with ``document``
          via ``cache_document.document_id = document.id``.
        """
        sqlite_in_memory = SQLiteConfiguration("")
        data_controller = DataController(sqlite_in_memory)
        data_controller.run_schema()

        # The call shall not crash with empty input
        r = data_controller.crawl_data.update_cache_documents(dict())
        self.assertIsNone(r)

        # document1 and document3 deliberately share the same title
        document1 = Document(
            core_id="doc1",
            core_profile_id="id1",
            core_title="sametitle1",
            core_type="conference_proceedings",
            core_created=datetime.now(),
            core_last_modified=datetime.now(),
            core_abstract="Older Abtract",
            core_source="Older source",
            core_year=2015,
            core_authors=[],
            core_keywords=[],
            doc_website="",
            conf_website="",
            conf_pages="",
            conf_month=0,
            conf_city="",
            tags=[]
        )
        document2 = Document(
            core_id="doc2",
            core_profile_id="id2",
            core_title="title2",
            core_type="conference_proceedings",
            core_created=datetime.now(),
            core_last_modified=datetime.now(),
            core_abstract="blabla2",
            core_source="ACM xyz",
            core_year=2014,
            core_authors=[],
            core_keywords=[],
            doc_website="",
            conf_website="",
            conf_pages="",
            conf_month=0,
            conf_city="",
            tags=[]
        )
        document3 = Document(
            core_id="doc3",
            core_profile_id="id3",
            core_title="sametitle1",
            core_type="conference_proceedings",
            core_created=datetime.now(),
            core_last_modified=datetime.now(),
            core_abstract="Newer abstract",
            core_source="Newer source",
            core_year=2015,
            core_authors=[],
            core_keywords=[],
            doc_website="",
            conf_website="",
            conf_pages="",
            conf_month=0,
            conf_city="",
            tags=[]
        )

        # Each key equals the title of the documents grouped under it
        unified_document_title_to_documents = dict()
        unified_document_title_to_documents["sametitle1"] = [document1, document3]
        unified_document_title_to_documents["title2"] = [document2]

        # Trigger cache document update
        data_controller.crawl_data.update_cache_documents(unified_document_title_to_documents)

        # Check data count in the table
        cnt = data_controller.engine.execute("SELECT COUNT(*) FROM cache_document").fetchone()
        self.assertEqual(cnt[0], 2)

        # Then query sametitle row
        expected_id = generate_id('sametitle1')
        row = data_controller.engine.execute(
            "SELECT id, document_id, title "
            "FROM cache_document "
            "WHERE id='%s'" % expected_id
        ).fetchone()
        # Fail with a readable message instead of a TypeError on None
        self.assertIsNotNone(row, "no cache_document row for 'sametitle1'")
        self.assertEqual(row["id"], expected_id)
        self.assertEqual(row["title"], "sametitle1")

        # Now update documents and check if the link is set correctly
        data_controller.crawl_data.update_documents([document1, document2, document3])
        row = data_controller.engine.execute(
            "SELECT cd.title as title "
            "FROM cache_document cd, document d "
            "WHERE cd.document_id = d.id "
            "AND cd.id='%s' " % expected_id
        ).fetchone()
        self.assertIsNotNone(row, "cache_document row is not linked to a document row")
        self.assertEqual(row["title"], "sametitle1")