def test_update_profiles(self):
    """update_profiles: empty input is a no-op; rows are inserted, then
    upserted in place (same mendeley_id must overwrite, not duplicate)."""
    data_controller = DataController(SQLiteConfiguration(""))
    data_controller.run_schema()

    # The call shall not crash with empty input
    self.assertIsNone(data_controller.crawl_data.update_profiles([]))

    data_controller.crawl_data.update_profiles([
        Profile("id1", "Hans", "Mustermann", "", ""),
        Profile("id2", "Max", "Mustermann", "", ""),
    ])

    select_profiles = (
        "SELECT id, mendeley_id, cache_profile_id, first_name, last_name, display_name "
        "FROM profile "
    )

    # Check data count in the table
    count_row = data_controller.engine.execute("SELECT COUNT(*) FROM profile").fetchone()
    self.assertEqual(count_row[0], 2)

    # Then verify both rows against the expected field values
    rows = data_controller.engine.execute(select_profiles).fetchall()
    expected = [
        ("id1", generate_id("hansmustermann"), "Hans", "Mustermann"),
        ("id2", generate_id("maxmustermann"), "Max", "Mustermann"),
    ]
    for row, (mendeley_id, cache_id, first, last) in zip(rows, expected):
        self.assertEqual(row["mendeley_id"], mendeley_id)
        self.assertEqual(row["cache_profile_id"], cache_id)
        self.assertEqual(row["first_name"], first)
        self.assertEqual(row["last_name"], last)

    # Re-submit id1 with a changed last name: still 2 rows, data updated
    data_controller.crawl_data.update_profiles([
        Profile("id1", "Hans", "Supermann", "", ""),
        Profile("id2", "Max", "Mustermann", "", ""),
    ])

    count_row = data_controller.engine.execute("SELECT COUNT(*) FROM profile").fetchone()
    self.assertEqual(count_row[0], 2)

    rows = data_controller.engine.execute(select_profiles).fetchall()
    self.assertEqual(rows[0]["mendeley_id"], "id1")
    self.assertEqual(rows[0]["cache_profile_id"], generate_id("hanssupermann"))
    self.assertEqual(rows[0]["first_name"], "Hans")
    self.assertEqual(rows[0]["last_name"], "Supermann")
def test_update_profiles(self):
    """Exercise update_profiles: tolerates empty input, inserts new rows,
    and overwrites existing rows on a second call with the same ids."""
    config = SQLiteConfiguration("")
    controller = DataController(config)
    controller.run_schema()

    # Empty input must be a safe no-op returning None
    self.assertIsNone(controller.crawl_data.update_profiles([]))

    hans = Profile("id1", "Hans", "Mustermann", "", "")
    max_p = Profile("id2", "Max", "Mustermann", "", "")
    controller.crawl_data.update_profiles([hans, max_p])

    # Exactly two rows after the first insert
    self.assertEqual(
        controller.engine.execute("SELECT COUNT(*) FROM profile").fetchone()[0], 2)

    query = ("SELECT id, mendeley_id, cache_profile_id, first_name, last_name, display_name "
             "FROM profile ")
    first, second = controller.engine.execute(query).fetchall()

    self.assertEqual(first["mendeley_id"], "id1")
    self.assertEqual(first["cache_profile_id"], generate_id("hansmustermann"))
    self.assertEqual(first["first_name"], "Hans")
    self.assertEqual(first["last_name"], "Mustermann")

    self.assertEqual(second["mendeley_id"], "id2")
    self.assertEqual(second["cache_profile_id"], generate_id("maxmustermann"))
    self.assertEqual(second["first_name"], "Max")
    self.assertEqual(second["last_name"], "Mustermann")

    # Second pass: id1 changes last name, row count must stay at two
    hans = Profile("id1", "Hans", "Supermann", "", "")
    max_p = Profile("id2", "Max", "Mustermann", "", "")
    controller.crawl_data.update_profiles([hans, max_p])

    self.assertEqual(
        controller.engine.execute("SELECT COUNT(*) FROM profile").fetchone()[0], 2)

    first, second = controller.engine.execute(query).fetchall()
    self.assertEqual(first["mendeley_id"], "id1")
    self.assertEqual(first["cache_profile_id"], generate_id("hanssupermann"))
    self.assertEqual(first["first_name"], "Hans")
    self.assertEqual(first["last_name"], "Supermann")
def update_cache_documents(self, unified_document_title_to_documents: {}):
    """
    Given a unified_document_title to documents map, merges the documents
    and creates the FK references.

    :param unified_document_title_to_documents: maps a unified title to the
        list of Document objects sharing that title
    :return: None
    """
    sql = self._update_cache_documents[0]

    # Fire the sql script in a transaction
    with self._engine.begin() as conn:
        log.debug("Updating cache documents")
        for doc_list in unified_document_title_to_documents.values():
            if not doc_list:
                continue
            # The most recently modified document is the reference document
            # (ties keep the first occurrence, matching the original scan).
            reference_doc = max(doc_list, key=lambda d: d.core_last_modified)
            u, r = unify_document_title(reference_doc.core_title)
            conn.execute(sql, (generate_id(u), sanitize_text(r)))
    log.info("Cache documents have been updated")
def test_update_cache_fields(self):
    """update_cache_fields: empty input is a no-op; fields are inserted
    keyed by generate_id(unified_title)."""
    data_controller = DataController(SQLiteConfiguration(""))
    data_controller.run_schema()

    # The call shall not crash with empty input
    self.assertIsNone(data_controller.crawl_data.update_cache_fields(dict()))

    unified_field_title_to_field = {
        "field1": CacheField(title="field 1", unified_title="field1"),
        "field2": CacheField(title="field 2", unified_title="field2"),
    }
    data_controller.crawl_data.update_cache_fields(unified_field_title_to_field)

    # Check data count in the table
    cnt = data_controller.engine.execute("SELECT COUNT(*) FROM cache_field").fetchone()
    self.assertEqual(cnt[0], 2)

    # Parameterized query instead of %-interpolation (no quoting pitfalls);
    # renamed local so the builtin `id` is not shadowed.
    field_id = generate_id('field1')
    row = data_controller.engine.execute(
        "SELECT id, title "
        "FROM cache_field "
        "WHERE id=? ", (field_id,)
    ).fetchone()

    self.assertEqual(row["id"], field_id)
    self.assertEqual(row["title"], "field 1")
def test_update_cache_fields(self):
    """update_cache_fields tolerates empty input and stores each field
    under its generated unified-title id."""
    controller = DataController(SQLiteConfiguration(""))
    controller.run_schema()

    # Empty input must be a safe no-op returning None
    self.assertIsNone(controller.crawl_data.update_cache_fields(dict()))

    controller.crawl_data.update_cache_fields({
        "field1": CacheField(title="field 1", unified_title="field1"),
        "field2": CacheField(title="field 2", unified_title="field2"),
    })

    # Exactly two rows in the cache_field table
    self.assertEqual(
        controller.engine.execute("SELECT COUNT(*) FROM cache_field").fetchone()[0], 2)

    # Use a bound parameter rather than %-interpolating the id into the SQL,
    # and avoid shadowing the builtin `id`.
    cache_field_id = generate_id('field1')
    row = controller.engine.execute(
        "SELECT id, title FROM cache_field WHERE id=? ",
        (cache_field_id,)
    ).fetchone()

    self.assertEqual(row["id"], cache_field_id)
    self.assertEqual(row["title"], "field 1")
def update_cache_profiles(self, unified_name_to_profiles: {}):
    """
    Given a unified_profile_name to profiles map, merges the profiles
    and creates the FK references.

    :param unified_name_to_profiles: maps a unified profile name to the
        list of Profile objects sharing that name
    :return: None
    """
    # Note: the docstring previously documented a nonexistent
    # `unified_name_to_real_name` parameter; removed.
    sql = self._update_cache_profiles[0]

    # Fire the sql script in a transaction
    with self._engine.begin() as conn:
        log.debug("Updating cache profiles")
        for profile_list in unified_name_to_profiles.values():
            if not profile_list:
                continue
            # The profile with the longest display_name wins as the
            # reference (ties keep the first occurrence, as before).
            reference_profile = max(profile_list,
                                    key=lambda p: len(p.display_name))
            u, r = unify_profile_name(reference_profile.first_name,
                                      reference_profile.last_name)
            b64u = generate_id(u)
            log.info("inserting %s, %s" % (b64u, sanitize_text(r)))
            conn.execute(sql, (b64u, sanitize_text(r)))
    log.info("Cache profiles have been updated")
def link_fields_to_documents(self, unified_field_title_to_documents: {}):
    """
    Given a unified_field_title to documents map, creates the N:M relations
    in the database.

    :param unified_field_title_to_documents: maps a unified field title to
        the unified document titles it links to
    :return: None
    """
    # The sql file holds a delete statement followed by the insert statement
    delete = self._link_fields_to_documents[0]
    insert = self._link_fields_to_documents[1]

    # Fire the sql scripts in a transaction
    with self._engine.begin() as conn:
        log.debug("Deleting previous field -> document links")
        conn.execute(delete)

        log.debug("Inserting new field -> document links")
        for field_title, doc_titles in unified_field_title_to_documents.items():
            field_id = generate_id(field_title)  # invariant per field: hoisted
            for doc_unified in doc_titles:
                conn.execute(insert, (generate_id(doc_unified), field_id))
    log.info("Field -> document links have been updated")
def link_profiles_to_documents(
    self,
    unified_name_to_profiles: {},
    unified_name_to_authored_documents: {},
    unified_name_to_participated_documents: {},
):
    """
    Given a unified_profile_name to authored_documents and
    participated_documents map(s), creates the N:M relations in the database.

    :param unified_name_to_profiles: known profiles, used to skip links
        whose author has no profile row (would break FK constraints)
    :param unified_name_to_authored_documents: unified name -> authored docs
    :param unified_name_to_participated_documents: unified name -> participated docs
    :return: None
    """
    # Get the different statements in the sql file
    delete = self._link_profiles_to_documents[0]
    insert = self._link_profiles_to_documents[1]

    # Fire the sql scripts in a transaction
    with self._engine.begin() as conn:
        log.debug("Deleting previous profile -> document links")
        conn.execute(delete)

        log.debug("Inserting new profile -> document links")
        # Authored and participated links are inserted identically, so the
        # previously duplicated loop bodies are merged into one.
        for link_map in (unified_name_to_authored_documents,
                         unified_name_to_participated_documents):
            for unified_name, doc_list in link_map.items():
                # TODO: if author unknown, ignore for now
                # (Foreign key constraints broken otherwise)
                if unified_name not in unified_name_to_profiles:
                    continue
                for doc_unified in doc_list:
                    conn.execute(insert, (generate_id(unified_name),
                                          generate_id(doc_unified)))
    log.info("Profile -> document links have been updated")
def insert_profile(conn: Connection, insert: str, p: Profile):
    """Insert one profile row; the cache_profile FK id is derived from the
    unified form of the profile's name."""
    unified, _ = unify_profile_name(p.first_name, p.last_name)
    cache_profile_id = generate_id(unified)

    row = (
        sanitize_text(p.identifier),
        cache_profile_id,
        sanitize_text(p.first_name),
        sanitize_text(p.last_name),
        sanitize_text(p.display_name),
        sanitize_text(p.link),
    )
    conn.execute(insert, row)
def update_cache_fields(self, unified_field_title_to_field: {}):
    """
    Given a unified_field_title to field map, updates the fields.

    :param unified_field_title_to_field: maps a unified field title to its
        CacheField (only the values are used)
    :return: None
    """
    sql = self._update_cache_fields[0]

    # Fire the sql script in a transaction
    with self._engine.begin() as conn:
        log.debug("Updating cache fields")
        for field in unified_field_title_to_field.values():
            conn.execute(sql, (generate_id(field.unified_title),
                               sanitize_text(field.title)))
    log.info("Cache fields have been updated")
def test_update_cache_profiles(self):
    """update_cache_profiles: empty input is a no-op; profiles sharing a
    unified name are merged into one cache row; the FK link to profile is
    established once profiles are inserted."""
    data_controller = DataController(SQLiteConfiguration(""))
    data_controller.run_schema()

    # The call shall not crash with empty input
    self.assertIsNone(data_controller.crawl_data.update_cache_profiles(dict()))

    profile1 = Profile("id1", "Hans", "Mustermann", "Longer Real Name", "")
    profile2 = Profile("id2", "Max", "Mustermann", "", "")
    profile3 = Profile("id3", "Hans", "Mustermann", "", "")

    unified_name_to_profiles = {
        "hansmustermann": [profile1, profile3],
        "maxmustermann": [profile2],
    }
    data_controller.crawl_data.update_cache_profiles(unified_name_to_profiles)

    # Check data count in the table
    cnt = data_controller.engine.execute(
        "SELECT COUNT(*) FROM cache_profile").fetchone()
    self.assertEqual(cnt[0], 2)

    # Parameterized query instead of %-interpolation; local renamed so the
    # builtin `id` is not shadowed.
    cache_id = generate_id('hansmustermann')
    row = data_controller.engine.execute(
        "SELECT id, profile_id, name "
        "FROM cache_profile "
        "WHERE id=?", (cache_id,)
    ).fetchone()
    self.assertEqual(row["id"], cache_id)
    self.assertEqual(row["name"], "Hans Mustermann")

    # Now update profiles and check if the link is set correctly
    data_controller.crawl_data.update_profiles([profile1, profile2, profile3])
    row = data_controller.engine.execute(
        "SELECT cp.name as name "
        "FROM cache_profile cp, profile p "
        "WHERE cp.profile_id = p.id "
        "AND cp.id=? ", (cache_id,)
    ).fetchone()
    self.assertEqual(row["name"], "Hans Mustermann")
def test_update_cache_profiles(self):
    """Cache profiles are merged per unified name (longest display name
    wins) and linked to profile rows after update_profiles runs."""
    controller = DataController(SQLiteConfiguration(""))
    controller.run_schema()

    # Empty input must be a safe no-op returning None
    self.assertIsNone(controller.crawl_data.update_cache_profiles(dict()))

    hans_long = Profile("id1", "Hans", "Mustermann", "Longer Real Name", "")
    max_p = Profile("id2", "Max", "Mustermann", "", "")
    hans_short = Profile("id3", "Hans", "Mustermann", "", "")

    controller.crawl_data.update_cache_profiles({
        "hansmustermann": [hans_long, hans_short],
        "maxmustermann": [max_p],
    })

    # Two distinct unified names -> two cache rows
    self.assertEqual(
        controller.engine.execute("SELECT COUNT(*) FROM cache_profile").fetchone()[0], 2)

    # Bound parameter instead of %-interpolating the id into the SQL string;
    # avoids shadowing the builtin `id` as well.
    cache_profile_id = generate_id('hansmustermann')
    row = controller.engine.execute(
        "SELECT id, profile_id, name FROM cache_profile WHERE id=?",
        (cache_profile_id,)
    ).fetchone()
    self.assertEqual(row["id"], cache_profile_id)
    self.assertEqual(row["name"], "Hans Mustermann")

    # After inserting the profiles the FK join must resolve
    controller.crawl_data.update_profiles([hans_long, max_p, hans_short])
    row = controller.engine.execute(
        "SELECT cp.name as name "
        "FROM cache_profile cp, profile p "
        "WHERE cp.profile_id = p.id "
        "AND cp.id=? ", (cache_profile_id,)
    ).fetchone()
    self.assertEqual(row["name"], "Hans Mustermann")
def insert_doc(conn: Connection, insert: str, doc: Document):
    """Insert one document row; the cache_document FK id is derived from the
    unified form of the title, and composite fields (authors, keywords, tags)
    are flattened to comma-separated strings."""
    unified, _ = unify_document_title(doc.core_title)
    cache_document_id = generate_id(unified)

    # Flatten composite fields
    authors_string = ", ".join(
        "{first} {last}".format(first=first, last=last)
        for first, last in doc.core_authors
    )
    keywords_string = ", ".join(doc.core_keywords)
    tags_string = ", ".join(doc.tags)

    # Create bibtex
    bibtex = generate_bibtex(doc)

    # Insert tuple
    conn.execute(insert, (
        sanitize_text(doc.core_id),
        cache_document_id,
        sanitize_text(doc.core_profile_id),
        sanitize_text(doc.core_title),
        sanitize_text(doc.core_type),
        datetime_to_sqltime(doc.core_created),
        datetime_to_sqltime(doc.core_last_modified),
        sanitize_text(doc.core_abstract),
        sanitize_text(doc.core_source),
        doc.core_year,
        sanitize_text(authors_string),
        sanitize_text(keywords_string),
        sanitize_text(tags_string),
        sanitize_text(doc.doc_website),
        sanitize_text(doc.conf_website),
        doc.conf_month,
        sanitize_text(doc.conf_pages),
        sanitize_text(doc.conf_city),
        sanitize_text(bibtex),
    ))
def test_update_documents(self):
    """update_documents: empty input is a no-op; documents are inserted and
    then upserted in place on a second call with the same mendeley ids."""
    data_controller = DataController(SQLiteConfiguration(""))
    data_controller.run_schema()

    # The call shall not crash with empty input
    self.assertIsNone(data_controller.crawl_data.update_documents([]))

    # Fields that are identical for every test document
    common = dict(core_type="conference_proceedings", doc_website="",
                  conf_website="", conf_pages="", conf_month=0, conf_city="")

    document1 = Document(core_id="doc1", core_profile_id="id1",
                         core_title="title1", core_created=datetime.now(),
                         core_last_modified=datetime.now(),
                         core_abstract="blabla", core_source="ACM xy",
                         core_year=2015,
                         core_authors=[("Hans", "Mustermann"),
                                       ("Nicht", "Existent")],
                         core_keywords=[], tags=["t ag- 1"], **common)
    document2 = Document(core_id="doc2", core_profile_id="id2",
                         core_title="title2", core_created=datetime.now(),
                         core_last_modified=datetime.now(),
                         core_abstract="blabla2", core_source="ACM xyz",
                         core_year=2014, core_authors=[],
                         core_keywords=[], tags=[], **common)
    data_controller.crawl_data.update_documents([document1, document2])

    select_docs = (
        "SELECT id, mendeley_id, cache_document_id, owner_mendeley_id,"
        " title, doc_type, created, last_modified, abstract, source, pub_year "
        "FROM document "
    )

    # Check data count in the table
    cnt = data_controller.engine.execute("SELECT COUNT(*) FROM document").fetchone()
    self.assertEqual(cnt[0], 2)

    rows = data_controller.engine.execute(select_docs).fetchall()

    # Check first row
    self.assertEqual(rows[0]["mendeley_id"], "doc1")
    self.assertEqual(rows[0]["cache_document_id"], generate_id("title1"))
    self.assertEqual(rows[0]["owner_mendeley_id"], "id1")
    self.assertEqual(rows[0]["title"], "title1")
    self.assertEqual(rows[0]["doc_type"], "conference_proceedings")
    self.assertEqual(rows[0]["abstract"], "blabla")
    self.assertEqual(rows[0]["source"], "ACM xy")
    self.assertEqual(rows[0]["pub_year"], 2015)

    # Check second row
    self.assertEqual(rows[1]["mendeley_id"], "doc2")
    self.assertEqual(rows[1]["cache_document_id"], generate_id("title2"))
    self.assertEqual(rows[1]["owner_mendeley_id"], "id2")
    self.assertEqual(rows[1]["title"], "title2")
    self.assertEqual(rows[1]["doc_type"], "conference_proceedings")
    self.assertEqual(rows[1]["abstract"], "blabla2")
    self.assertEqual(rows[1]["source"], "ACM xyz")
    self.assertEqual(rows[1]["pub_year"], 2014)

    # Re-submit doc1 with new content: still 2 rows, data overwritten
    document1 = Document(core_id="doc1", core_profile_id="id1",
                         core_title="newtitle1", core_created=datetime.now(),
                         core_last_modified=datetime.now(),
                         core_abstract="blablaNew", core_source="ACM xyz1",
                         core_year=2015, core_authors=[],
                         core_keywords=[], tags=[], **common)
    document2 = Document(core_id="doc2", core_profile_id="id2",
                         core_title="title2", core_created=datetime.now(),
                         core_last_modified=datetime.now(),
                         core_abstract="blabla2", core_source="ACM xyz",
                         core_year=2014, core_authors=[],
                         core_keywords=[], tags=[], **common)
    data_controller.crawl_data.update_documents([document1, document2])

    cnt = data_controller.engine.execute("SELECT COUNT(*) FROM document").fetchone()
    self.assertEqual(cnt[0], 2)

    rows = data_controller.engine.execute(select_docs).fetchall()

    # Check first row after the upsert
    self.assertEqual(rows[0]["mendeley_id"], "doc1")
    self.assertEqual(rows[0]["cache_document_id"], generate_id("newtitle1"))
    self.assertEqual(rows[0]["owner_mendeley_id"], "id1")
    self.assertEqual(rows[0]["title"], "newtitle1")
    self.assertEqual(rows[0]["doc_type"], "conference_proceedings")
    self.assertEqual(rows[0]["abstract"], "blablaNew")
    self.assertEqual(rows[0]["source"], "ACM xyz1")
    self.assertEqual(rows[0]["pub_year"], 2015)
def test_update_documents(self):
    """Documents survive a round trip through update_documents, and a
    second call with the same mendeley ids overwrites instead of duplicating."""
    controller = DataController(SQLiteConfiguration(""))
    controller.run_schema()

    # Empty input must be a safe no-op returning None
    self.assertIsNone(controller.crawl_data.update_documents([]))

    def make_doc(doc_id, profile_id, title, abstract, source, year, authors, tags):
        # Local factory: all test documents share the remaining fields
        return Document(core_id=doc_id, core_profile_id=profile_id,
                        core_title=title, core_type="conference_proceedings",
                        core_created=datetime.now(),
                        core_last_modified=datetime.now(),
                        core_abstract=abstract, core_source=source,
                        core_year=year, core_authors=authors,
                        core_keywords=[], doc_website="", conf_website="",
                        conf_pages="", conf_month=0, conf_city="", tags=tags)

    doc1 = make_doc("doc1", "id1", "title1", "blabla", "ACM xy", 2015,
                    [("Hans", "Mustermann"), ("Nicht", "Existent")], ["t ag- 1"])
    doc2 = make_doc("doc2", "id2", "title2", "blabla2", "ACM xyz", 2014, [], [])
    controller.crawl_data.update_documents([doc1, doc2])

    query = ("SELECT id, mendeley_id, cache_document_id, owner_mendeley_id,"
             " title, doc_type, created, last_modified, abstract, source, pub_year "
             "FROM document ")

    self.assertEqual(
        controller.engine.execute("SELECT COUNT(*) FROM document").fetchone()[0], 2)

    first, second = controller.engine.execute(query).fetchall()

    self.assertEqual(first["mendeley_id"], "doc1")
    self.assertEqual(first["cache_document_id"], generate_id("title1"))
    self.assertEqual(first["owner_mendeley_id"], "id1")
    self.assertEqual(first["title"], "title1")
    self.assertEqual(first["doc_type"], "conference_proceedings")
    self.assertEqual(first["abstract"], "blabla")
    self.assertEqual(first["source"], "ACM xy")
    self.assertEqual(first["pub_year"], 2015)

    self.assertEqual(second["mendeley_id"], "doc2")
    self.assertEqual(second["cache_document_id"], generate_id("title2"))
    self.assertEqual(second["owner_mendeley_id"], "id2")
    self.assertEqual(second["title"], "title2")
    self.assertEqual(second["doc_type"], "conference_proceedings")
    self.assertEqual(second["abstract"], "blabla2")
    self.assertEqual(second["source"], "ACM xyz")
    self.assertEqual(second["pub_year"], 2014)

    # Second pass: doc1 gets new content, row count must stay at two
    doc1 = make_doc("doc1", "id1", "newtitle1", "blablaNew", "ACM xyz1", 2015, [], [])
    doc2 = make_doc("doc2", "id2", "title2", "blabla2", "ACM xyz", 2014, [], [])
    controller.crawl_data.update_documents([doc1, doc2])

    self.assertEqual(
        controller.engine.execute("SELECT COUNT(*) FROM document").fetchone()[0], 2)

    first, second = controller.engine.execute(query).fetchall()

    self.assertEqual(first["mendeley_id"], "doc1")
    self.assertEqual(first["cache_document_id"], generate_id("newtitle1"))
    self.assertEqual(first["owner_mendeley_id"], "id1")
    self.assertEqual(first["title"], "newtitle1")
    self.assertEqual(first["doc_type"], "conference_proceedings")
    self.assertEqual(first["abstract"], "blablaNew")
    self.assertEqual(first["source"], "ACM xyz1")
    self.assertEqual(first["pub_year"], 2015)
def test_update_cache_documents(self):
    """update_cache_documents: empty input is a no-op; documents sharing a
    unified title are merged into one cache row; the FK link resolves once
    the documents themselves are inserted."""
    data_controller = DataController(SQLiteConfiguration(""))
    data_controller.run_schema()

    # The call shall not crash with empty input
    self.assertIsNone(data_controller.crawl_data.update_cache_documents(dict()))

    document1 = Document(core_id="doc1", core_profile_id="id1",
                         core_title="sametitle1",
                         core_type="conference_proceedings",
                         core_created=datetime.now(),
                         core_last_modified=datetime.now(),
                         core_abstract="Older Abtract",
                         core_source="Older source", core_year=2015,
                         core_authors=[], core_keywords=[], doc_website="",
                         conf_website="", conf_pages="", conf_month=0,
                         conf_city="", tags=[])
    document2 = Document(core_id="doc2", core_profile_id="id2",
                         core_title="title2",
                         core_type="conference_proceedings",
                         core_created=datetime.now(),
                         core_last_modified=datetime.now(),
                         core_abstract="blabla2", core_source="ACM xyz",
                         core_year=2014, core_authors=[], core_keywords=[],
                         doc_website="", conf_website="", conf_pages="",
                         conf_month=0, conf_city="", tags=[])
    document3 = Document(core_id="doc3", core_profile_id="id3",
                         core_title="sametitle1",
                         core_type="conference_proceedings",
                         core_created=datetime.now(),
                         core_last_modified=datetime.now(),
                         core_abstract="Newer abstract",
                         core_source="Newer source", core_year=2015,
                         core_authors=[], core_keywords=[], doc_website="",
                         conf_website="", conf_pages="", conf_month=0,
                         conf_city="", tags=[])

    # Fixed key typo: was "samtetitle1" (harmless at runtime since only the
    # values are read, but misleading test data).
    unified_document_title_to_documents = {
        "sametitle1": [document1, document3],
        "title2": [document2],
    }

    # Trigger cache document update
    data_controller.crawl_data.update_cache_documents(
        unified_document_title_to_documents)

    # Check data count in the table
    cnt = data_controller.engine.execute(
        "SELECT COUNT(*) FROM cache_document").fetchone()
    self.assertEqual(cnt[0], 2)

    # Parameterized query instead of %-interpolation; local renamed so the
    # builtin `id` is not shadowed.
    cache_id = generate_id('sametitle1')
    row = data_controller.engine.execute(
        "SELECT id, document_id, title "
        "FROM cache_document "
        "WHERE id=?", (cache_id,)
    ).fetchone()
    self.assertEqual(row["id"], cache_id)
    self.assertEqual(row["title"], "sametitle1")

    # Now update documents and check if the link is set correctly
    data_controller.crawl_data.update_documents(
        [document1, document2, document3])
    row = data_controller.engine.execute(
        "SELECT cd.title as title "
        "FROM cache_document cd, document d "
        "WHERE cd.document_id = d.id "
        "AND cd.id=? ", (cache_id,)
    ).fetchone()
    self.assertEqual(row["title"], "sametitle1")
def test_update_cache_documents(self):
    """Cache documents merge per unified title (the newer document wins)
    and link to document rows after update_documents runs."""
    controller = DataController(SQLiteConfiguration(""))
    controller.run_schema()

    # Empty input must be a safe no-op returning None
    self.assertIsNone(controller.crawl_data.update_cache_documents(dict()))

    def make_doc(doc_id, profile_id, title, abstract, source, year):
        # Local factory: all test documents share the remaining fields
        return Document(core_id=doc_id, core_profile_id=profile_id,
                        core_title=title, core_type="conference_proceedings",
                        core_created=datetime.now(),
                        core_last_modified=datetime.now(),
                        core_abstract=abstract, core_source=source,
                        core_year=year, core_authors=[], core_keywords=[],
                        doc_website="", conf_website="", conf_pages="",
                        conf_month=0, conf_city="", tags=[])

    older = make_doc("doc1", "id1", "sametitle1", "Older Abtract", "Older source", 2015)
    other = make_doc("doc2", "id2", "title2", "blabla2", "ACM xyz", 2014)
    newer = make_doc("doc3", "id3", "sametitle1", "Newer abstract", "Newer source", 2015)

    # Key typo fixed: was "samtetitle1" (only the values are consumed, but
    # the test data should not mislead).
    controller.crawl_data.update_cache_documents({
        "sametitle1": [older, newer],
        "title2": [other],
    })

    # Two distinct unified titles -> two cache rows
    self.assertEqual(
        controller.engine.execute("SELECT COUNT(*) FROM cache_document").fetchone()[0], 2)

    # Bound parameter instead of %-interpolating the id into the SQL string;
    # avoids shadowing the builtin `id` as well.
    cache_document_id = generate_id('sametitle1')
    row = controller.engine.execute(
        "SELECT id, document_id, title FROM cache_document WHERE id=?",
        (cache_document_id,)
    ).fetchone()
    self.assertEqual(row["id"], cache_document_id)
    self.assertEqual(row["title"], "sametitle1")

    # After inserting the documents the FK join must resolve
    controller.crawl_data.update_documents([older, other, newer])
    row = controller.engine.execute(
        "SELECT cd.title as title "
        "FROM cache_document cd, document d "
        "WHERE cd.document_id = d.id "
        "AND cd.id=? ", (cache_document_id,)
    ).fetchone()
    self.assertEqual(row["title"], "sametitle1")