def setup_tables(filename, table_query, insert_query):
    """Create the test database/table and bulk-load records from a CSV file.

    :param filename: path to a ';'-delimited CSV file with a header row
    :param table_query: CREATE TABLE statement for the source table
    :param insert_query: parameterized INSERT statement matching the CSV columns
    """
    # load testconfig
    credentials = dict(get_config("MARIADBX"))
    # setup database
    connector = MariaDb(credentials)
    connector.create_db(TESTDB)
    connector.connector.database = TESTDB
    connector.createTable("test dblp table", table_query)
    # import records from csv
    with open(filename, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=';', quotechar='"')
        # skip the header row (replaces the old `do_once` flag dance)
        next(reader, None)
        for row in reader:
            # remove last updated and harvest date columns
            del row[-2:]
            # empty CSV cells become SQL NULLs
            tup = tuple(x if x != "" else None for x in row)
            connector.execute_ex(insert_query, tup)
    connector.close_connection()
def setup():
    """Create the Authors database/table and its fulltext name indexes."""
    NORMAL_TITLES = ("CREATE TABLE `Authors` ("
                     " `Id` INT NOT NULL,"
                     " `main_name` TEXT,"
                     " `normal_name` TEXT,"
                     " `metaphone_name` TEXT,"
                     " PRIMARY KEY (`Id`)"
                     ") ENGINE= {} CHARSET=utf8mb4")
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    connector.create_database(DB_NAME)
    connector.createTable("dvfds", NORMAL_TITLES.format(storage_engine))
    try:
        connector.execute_ex(
            "CREATE FULLTEXT INDEX main_name_idx ON Authors (main_name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX normal_name_idx ON Authors (normal_name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX metaphone_name_idx ON Authors (metaphone_name)", ())
    except Exception:
        # CREATE INDEX fails when the index already exists; catch Exception
        # instead of a bare `except:` so KeyboardInterrupt/SystemExit propagate
        print("Index already exists")
    connector.close_connection()
def test_create_table(self):
    """createTable must accept a DDL statement without raising."""
    db = MariaDb()
    db.create_database(DB_NAME)
    ddl = ("CREATE TABLE `muhz` ("
           " `dblp_key` varchar(100) NOT NULL,"
           " PRIMARY KEY (`dblp_key`)"
           ") ENGINE={} CHARSET=utf8mb4")
    db.createTable("test", ddl)
    db.close_connection()
def setup(TABLE_NAME):
    """Create the target database and the named table from the TITLES template.

    :param TABLE_NAME: name substituted into the TITLES DDL template
    :return: None
    """
    db = MariaDb()
    engine = get_config("MISC")["storage_engine"]
    db.create_database(DB_NAME)
    db.createTable(TABLE_NAME, TITLES.format(TABLE_NAME, engine))
    db.close_connection()
def test_execute_ex(self):
    """execute_ex must return the AUTO_INCREMENT id of the inserted row."""
    x = MariaDb()
    x.create_database(DB_NAME)
    x.createTable("test", ("CREATE TABLE `muhz` ("
                           " `ID` int NOT NULL AUTO_INCREMENT,"
                           " `dblp_key` varchar(100) NOT NULL,"
                           " PRIMARY KEY (`ID`)"
                           ") ENGINE={} CHARSET=utf8mb4"))
    # BUG FIX: ('mi') is just the string 'mi' — a one-element tuple needs
    # a trailing comma, otherwise the params are iterated character-wise
    idx = x.execute_ex("INSERT INTO muhz (dblp_key) VALUES (%s)", ('mi',))
    self.assertEqual(idx, 1)
    x.close_connection()
class TestIngesterMulti2(TransactionTestCase):
    """Ingest one arXiv record then one DBLP record for the same publication
    and verify the merged result, including the serialized difference tree.
    """

    # Base reference data the ingester expects (global urls, publication types, ...).
    fixtures = [os.path.join(ingester_path, "fixtures", "initial_data.json")]

    @classmethod
    def setUpClass(cls):
        # Fulltext indexes used by the ingester's cluster/author name matching;
        # created once for the whole class, dropped in tearDownClass.
        connector = MariaDb(db="test_storage")
        connector.execute_ex((
            "CREATE FULLTEXT INDEX cluster_ft_idx ON test_storage.ingester_cluster (name)"
        ), ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX authors_model_ft_idx ON test_storage.ingester_authors_model (block_name)",
            ())
        connector.close_connection()

    @classmethod
    def tearDownClass(cls):
        connector = MariaDb(db="test_storage")
        connector.execute_ex(
            "ALTER TABLE test_storage.ingester_cluster DROP INDEX cluster_ft_idx",
            ())
        connector.execute_ex(
            "ALTER TABLE test_storage.ingester_authors_model DROP INDEX authors_model_ft_idx",
            ())
        connector.close_connection()

    def setUp(self):
        # Fresh source tables and one seed record per source for every test.
        self.connector = MariaDb(db="test_storage")
        storage_engine = get_config("MISC")["storage_engine"]
        # create tables for both sources arxiv and dblp
        self.connector.createTable("dblparticle", DBLP_ARTICLE.format(storage_engine))
        self.connector.createTable("arxivarticle", ARXIV_ARTICLE.format(storage_engine))
        # insert data
        dblp_article = (
            "dblpkey",  # key
            "2011-11-11",  # mdate
            "Andreas Anders;Bertha Theresa Balte;",  # authors
            "The Ultimate Title",  # title
            "10-14",  # pages
            datetime.date(2005, 1, 1),  # pub year
            "2",  # volume
            "journal of stuff",  # journal
            "3",  # journal number
            "http://google.de",  # doi
            "http://unused.com",  # unused url
            None,  # cite
            None,  # crossref
            None,  # booktitle
            None,  # school
            None,  # address
            None,  # publisher
            None,  # isbn
            None,  # series
            "article"  # type
        )
        arxiv_article = (
            "arxivkey",  # identifier
            "2007-07-07",  # created
            "2008-08-08",  # updated
            "Andreas Anders;Bertha Theresa Balte;Carim Chass Jr.;",  # authors
            "The Ultimate Title!",  # title
            None,  # mscclass
            None,  # acmclass
            None,  # reportno
            None,  # journalref
            None,  # comments
            "this is a test",  # description
            "category",  # categories
            "http://google.com",  # doi
            "2009-09-09"  # mdate
        )
        self.connector.execute_ex(ADD_DBLP_ARTICLE, dblp_article)
        self.connector.execute_ex(ADD_ARXIV, arxiv_article)

    def tearDown(self):
        self.connector.execute_ex("DROP TABLE test_storage.arxiv_articles")
        self.connector.execute_ex("DROP TABLE test_storage.dblp_article")
        self.connector.close_connection()

    def test_success_reversed(self):
        """Ingest arXiv first, DBLP second; DBLP data merges into the existing record."""
        dblpingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        arxivingester = ArxivIngester("arxiv.ingester", harvester_db="test_storage")
        # arxiv first then dblp
        result2 = ingest_data(arxivingester)
        self.assertEqual(result2, 1)
        result = ingest_data(dblpingester)
        self.assertEqual(result, 1)
        # check all tables
        self.assertEqual(cluster.objects.count(), 1)
        self.assertEqual(publication.objects.count(), 1)
        self.assertEqual(local_url.objects.count(), 3)
        self.assertEqual(global_url.objects.count(), 4)
        self.assertEqual(limbo_authors.objects.count(), 0)
        self.assertEqual(limbo_pub.objects.count(), 0)
        self.assertEqual(pub_medium.objects.count(), 1)
        # check local url
        dblp_url = local_url.objects.get(id=3)
        pub_url = local_url.objects.get(id=2)
        arxiv_url = local_url.objects.get(id=1)
        self.assertEqual(dblp_url.test(), [
            3, "dblpkey", 1,
            publication_type.objects.get(name="article").id, None
        ])
        self.assertEqual(arxiv_url.test(), [
            4, "arxivkey", None,
            publication_type.objects.get(name="misc").id, None
        ])
        self.assertEqual(pub_url.test(), [
            1, "TODO PLATZHALTER", 1,
            publication_type.objects.get(name="misc").id, None
        ])
        # check authors
        self.assertEqual(authors_model.objects.count(), 3)
        self.assertEqual(author_aliases.objects.count(), 3)
        self.assertEqual(author_alias_source.objects.count(), 5)
        # publication authors
        self.assertEqual(publication_author.objects.count(), 8)
        # check publication — each field keeps the source noted alongside
        publ = publication.objects.first()
        self.assertEqual(publ.title, "The Ultimate Title!")  # from Arxiv
        self.assertEqual(publ.pages, "10-14")  # DBLP
        self.assertEqual(publ.note, None)
        self.assertEqual(publ.doi, "http://google.com")  # Arxiv
        self.assertEqual(publ.abstract, "this is a test")  # arxiv
        self.assertEqual(publ.copyright, None)
        self.assertEqual(publ.date_added, None)
        self.assertEqual(publ.date_published, datetime.date(2007, 1, 1))  # DBLP
        self.assertEqual(publ.volume, "2")  # DBLP
        self.assertEqual(publ.number, "3")  # DBLP
        # check diff tree — bitvector 1 presumably marks arXiv, 2 DBLP;
        # NOTE(review): confirm against deserialize_diff_store's encoding
        diff = deserialize_diff_store(publ.differences)
        self.assertEqual(diff["url_id"], [1, 3])
        self.assertEqual(diff["doi"], [{
            "bitvector": 1,
            "votes": 0,
            "value": "http://google.com"
        }, {
            "bitvector": 2,
            "votes": 0,
            "value": "http://google.de"
        }])
        self.assertEqual(diff["copyright"], [])
        self.assertEqual(diff["type_ids"], [{
            "bitvector": 1,
            "votes": 0,
            "value": 2
        }, {
            "bitvector": 2,
            "votes": 0,
            "value": 1
        }])
        self.assertEqual(diff["pages"], [{
            "bitvector": 2,
            "votes": 0,
            "value": "10-14"
        }])
        self.assertEqual(OpenReferences.objects.first().test(), [1, 'arxivkey', None])
class TestIngester(TransactionTestCase):
    """End-to-end tests for the DBLP ingester against the test_storage database."""

    # Base reference data (global urls, publication types, ...) for the ingester.
    fixtures = [os.path.join(ingester_path, "fixtures", "initial_data.json")]

    @classmethod
    def setUpClass(cls):
        # Fulltext indexes required by the ingester's name/cluster matching.
        connector = MariaDb(db="test_storage")
        connector.execute_ex("CREATE FULLTEXT INDEX cluster_ft_idx ON test_storage.ingester_cluster (name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX authors_model_ft_idx ON test_storage.ingester_authors_model (block_name)", ())
        connector.close_connection()

    @classmethod
    def tearDownClass(cls):
        connector = MariaDb(db="test_storage")
        connector.execute_ex("ALTER TABLE test_storage.ingester_cluster DROP INDEX cluster_ft_idx", ())
        connector.execute_ex("ALTER TABLE test_storage.ingester_authors_model DROP INDEX authors_model_ft_idx", ())
        connector.close_connection()

    def setUp(self):
        # Fresh dblp_article source table for every test; rows are loaded
        # per-test via setup_tables with a CSV fixture.
        self.connector = MariaDb(db="test_storage")
        storage_engine = get_config("MISC")["storage_engine"]
        self.connector.createTable("dblparticle", DBLP_ARTICLE.format(storage_engine))

    def tearDown(self):
        self.connector.execute_ex("DROP TABLE test_storage.dblp_article")
        self.connector.close_connection()

    def test_invalid_ingester(self):
        """Passing a non-ingester object to ingest_data must raise IIngester_Exception."""
        setup_tables(os.path.join(test_path, "dblp_test1.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        self.assertRaises(IIngester_Exception, ingest_data, datetime.datetime(1990,1,1,1,1,1))

    def test_success(self):
        """Full ingest of dblp_test1.csv: urls, authors, aliases, clusters, publications."""
        setup_tables(os.path.join(test_path, "dblp_test1.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        self.assertEqual(ingester.get_global_url().id, 3)
        result = ingest_data(ingester)
        self.assertEqual(result, 2)
        # check local url
        self.assertEqual(local_url.objects.get(id=1).test(), [3, 'journals/acta/AkyildizB89', 1, 1,None])
        self.assertEqual(local_url.objects.get(id=2).test(), [1, 'TODO PLATZHALTER', 1, 1,None])
        self.assertEqual(local_url.objects.get(id=3).test(), [3, 'journals/acta/VoglerS014', 1, 1,None])
        self.assertEqual(local_url.objects.get(id=4).test(), [1, 'TODO PLATZHALTER', 1, 1,None])
        # check authors_model
        self.assertEqual(authors_model.objects.get(id=1).test(),["Ian F. Akyildiz", "ian f akyildiz"])
        self.assertEqual(authors_model.objects.get(id=2).test(), ["Horst von Brand", "horst von brand"])
        self.assertEqual(authors_model.objects.get(id=3).test(), ["Walter Vogler", "walter vogler"])
        self.assertEqual(authors_model.objects.get(id=4).test(), ["Christian Stahl", "christian stahl"])
        self.assertEqual(authors_model.objects.get(id=5).test(), ["Richard Müller", "richard muller"])
        # check author alias — "Richard Müller 0001" and "Richard Müller"
        # both resolve to author 5
        self.assertEqual(author_aliases.objects.get(id=1).test(), [1, "Ian F. Akyildiz"])
        self.assertEqual(author_aliases.objects.get(id=2).test(), [2, "Horst von Brand"])
        self.assertEqual(author_aliases.objects.get(id=3).test(), [3, "Walter Vogler"])
        self.assertEqual(author_aliases.objects.get(id=4).test(), [4, "Christian Stahl"])
        self.assertEqual(author_aliases.objects.get(id=5).test(), [5, "Richard Müller 0001"])
        self.assertEqual(author_aliases.objects.get(id=6).test(), [5, "Richard Müller"])
        # cluster
        self.assertEqual(cluster.objects.get(id=1).name, "bla bla bla")
        self.assertEqual(cluster.objects.get(id=2).name, "kam kim kum")
        # author alias source
        self.assertEqual(author_alias_source.objects.get(id=1).test(), [1, 1])
        self.assertEqual(author_alias_source.objects.get(id=2).test(), [2, 1])
        self.assertEqual(author_alias_source.objects.get(id=3).test(), [3, 3])
        self.assertEqual(author_alias_source.objects.get(id=4).test(), [4, 3])
        self.assertEqual(author_alias_source.objects.get(id=5).test(), [5, 3])
        self.assertEqual(author_alias_source.objects.get(id=6).test(), [6, 3])
        # publication authors
        self.assertEqual(publication_author.objects.get(id=1).test(), [1, 1, 0])
        self.assertEqual(publication_author.objects.get(id=2).test(), [1, 2, 1])
        self.assertEqual(publication_author.objects.get(id=3).test(), [2, 1, 0])
        self.assertEqual(publication_author.objects.get(id=4).test(), [2, 2, 1])
        self.assertEqual(publication_author.objects.get(id=5).test(), [3, 3, 0])
        self.assertEqual(publication_author.objects.get(id=6).test(), [3, 4, 1])
        self.assertEqual(publication_author.objects.get(id=7).test(), [3, 5, 2])
        self.assertEqual(publication_author.objects.get(id=8).test(), [4, 3, 0])
        self.assertEqual(publication_author.objects.get(id=9).test(), [4, 4, 1])
        self.assertEqual(publication_author.objects.get(id=10).test(), [4, 5, 2])
        # limbo
        self.assertEqual(limbo_authors.objects.count(),0)
        self.assertEqual(limbo_pub.objects.count(),0)
        # publication
        self.assertEqual(publication.objects.get(id=1).test(), [1, "Bla Bla Bla"])
        self.assertEqual(publication.objects.get(id=2).test(), [2, "Kam? Kim! Kum."])
        # check if last harvested is set (date-only comparison to avoid
        # flakiness from sub-second timing)
        tmp = list(get_table_data("dblp_article", null_dates=False))
        self.assertEqual(tmp[0][-1].strftime("%Y-%m-%d"), datetime.datetime.now().strftime("%Y-%m-%d"))
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_success_limit(self):
        """A limit of 1 must ingest exactly one record."""
        setup_tables(os.path.join(test_path, "dblp_test1.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingester.set_limit(1)
        result = ingest_data(ingester)
        self.assertEqual(result, 1)
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_complete_publication(self):
        # for this test a dataset with ALL ROWS filled, will be created to check if all values are
        # successfully transferred
        setup_tables(os.path.join(test_path, "dblp_test2.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingest_data(ingester)
        publ = publication.objects.first()
        self.assertEqual(publ.title,"title")
        self.assertEqual(publ.pages, "1-5")
        self.assertEqual(publ.doi, "doi")
        self.assertEqual(publ.abstract, None)
        self.assertEqual(publ.copyright, None)
        self.assertEqual(publ.volume, "1")
        self.assertEqual(publ.number, "2")
        self.assertEqual(publ.note, None)
        self.assertEqual(publ.date_added, None)
        self.assertEqual(publ.date_published, datetime.date(1990,1,1))
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_limbo_multi_cluster(self):
        """Two pre-existing clusters with the same name send the record to limbo (AMB_CLUSTER)."""
        setup_tables(os.path.join(test_path, "dblp_test2.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        cluster.objects.bulk_create([
            cluster(id=1, name="title"),
            cluster(id=2, name="title"),
        ])
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingest_data(ingester)
        self.assertEqual(limbo_authors.objects.get(id=1).test(), [1, 'None', "An Author", 0])
        self.assertEqual(limbo_authors.objects.get(id=2).test(), [1, 'None', "Another Author", 1])
        self.assertEqual(local_url.objects.count(),0)
        limbo = limbo_pub.objects.get(id=1).test_extended()
        print(limbo)
        compare = ['Reason.AMB_CLUSTER','key',"title","1-5",None,"doi",None,None,
                   None,datetime.date(1990,1,1),"1","2","series",
                   None,"publisher",None,"school","address",
                   "isbn",None,"booktitle","journal"]
        self.assertEqual(limbo,compare)

    def test_limbo_multi_pubs(self):
        """Two existing publications in the target cluster send the record to limbo (AMB_PUB)."""
        setup_tables(os.path.join(test_path, "dblp_test2.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        cl = cluster.objects.create(id=1, name="title")
        gurl = global_url.objects.create(id=5,domain ="http://dummy.de", url="http://dummy.de")
        lurl = local_url.objects.create(id=1,url="jlkjöl", global_url=gurl)
        publication.objects.bulk_create([
            publication(local_url=lurl,cluster=cl,title="Title"),
            publication(local_url=lurl, cluster=cl, title="Title")
        ])
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingest_data(ingester)
        limbo = limbo_pub.objects.get(id=1).test_extended()
        self.assertEqual(limbo[0],'Reason.AMB_PUB')
        self.assertEqual(limbo_authors.objects.get(id=1).test(), [1, 'None', "An Author", 0])
        self.assertEqual(limbo_authors.objects.get(id=2).test(), [1, 'None', "Another Author", 1])
        self.assertEqual(local_url.objects.count(),1)
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_limbo_alias(self):
        """Records from dblp_test3.csv ingest cleanly without hitting limbo."""
        setup_tables(os.path.join(test_path, "dblp_test3.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingest_data(ingester)
        self.assertEqual(limbo_pub.objects.count(), 0)
        self.assertEqual(cluster.objects.count(), 3)
        self.assertEqual(authors_model.objects.count(), 5)
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_set_last_harvested(self):
        """A second run only picks up records not yet marked as harvested."""
        setup_tables(os.path.join(test_path, "dblp_test3.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingester.set_limit(1)
        result1 = ingest_data(ingester)
        self.assertEqual(result1, 1)
        ingester.set_limit(3)
        result2 = ingest_data(ingester)
        self.assertEqual(result2, 2)
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)
from oai.queries import OAI_DATASET
from mysqlWrapper.mariadb import MariaDb

# Target database for the OAI-PMH harvest table.
DB_NAME = 'oaimph'

# Connection credentials — placeholders, fill in real values before running.
credentials = {
    'user': '******',
    'password': '******',
    'host': '127.0.0.1',
}

try:
    database = MariaDb(credentials)
except Exception as err:
    # connection failure is reported but not fatal to the interpreter
    print(err)
else:
    # only create the schema when the connection succeeded
    database.create_db(DB_NAME)
    database.createTable("oaimph", OAI_DATASET)
    database.close_connection()
def setup():
    """
    create database and table structure
    :return:
    """
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    # create database
    connector.create_database(DB_NAME)
    connector.createTable("dates", DATE_TABLE.format(storage_engine))
    connector.createTable("publication year", PUB_YEAR_TABLE.format(storage_engine))
    connector.createTable("popular_words", POPULAR.format(storage_engine))
    connector.createTable("title_length", TITLE_LENGTH.format(storage_engine))
    connector.createTable("popular names", N_POPULAR.format(storage_engine))
    connector.createTable("number authors", NUM_AUTHOR.format(storage_engine))
    connector.createTable("Authors", AUTHORS.format(storage_engine))
    connector.createTable("Normal Titles", NORMAL_TITLES.format(storage_engine))
    connector.createTable("Career", CAREER.format(storage_engine))
    # create index
    try:
        connector.execute_ex("CREATE FULLTEXT INDEX title_idx ON normal_title (titles)", ())
    except Exception:
        # CREATE INDEX fails when the index already exists; catch Exception
        # instead of a bare `except:` so KeyboardInterrupt/SystemExit propagate
        print("Index already exists")
    connector.close_connection()