def setup():
    NORMAL_TITLES = ("CREATE TABLE `Authors` ("
                     " `Id` INT NOT NULL,"
                     " `main_name` TEXT,"
                     " `normal_name` TEXT,"
                     " `metaphone_name` TEXT,"
                     " PRIMARY KEY (`Id`)"
                     ") ENGINE={} CHARSET=utf8mb4")
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    connector.create_database(DB_NAME)
    connector.createTable("Authors", NORMAL_TITLES.format(storage_engine))
    try:
        connector.execute_ex(
            "CREATE FULLTEXT INDEX main_name_idx ON Authors (main_name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX normal_name_idx ON Authors (normal_name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX metaphone_name_idx ON Authors (metaphone_name)", ())
    except Exception:
        print("Index already exists")
    connector.close_connection()
def setup():
    """
    create database and table structure
    :return:
    """
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    # create database
    connector.create_database(DB_NAME)
    connector.createTable("dates", DATE_TABLE.format(storage_engine))
    connector.createTable("publication year", PUB_YEAR_TABLE.format(storage_engine))
    connector.createTable("popular_words", POPULAR.format(storage_engine))
    connector.createTable("title_length", TITLE_LENGTH.format(storage_engine))
    connector.createTable("popular names", N_POPULAR.format(storage_engine))
    connector.createTable("number authors", NUM_AUTHOR.format(storage_engine))
    connector.createTable("Authors", AUTHORS.format(storage_engine))
    connector.createTable("Normal Titles", NORMAL_TITLES.format(storage_engine))
    connector.createTable("Career", CAREER.format(storage_engine))
    # create index
    try:
        connector.execute_ex("CREATE FULLTEXT INDEX title_idx ON normal_title (titles)", ())
    except Exception:
        print("Index already exists")
    connector.close_connection()
def setup_tables(filename, table_query, insert_query):
    # load testconfig
    credentials = dict(get_config("MARIADBX"))
    # setup database
    connector = MariaDb(credentials)
    connector.create_db(TESTDB)
    connector.connector.database = TESTDB
    connector.createTable("test dblp table", table_query)
    # setup test ingester database
    # setup_database(TESTDB)
    # import records from csv
    with open(filename, newline='', encoding='utf-8') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=';', quotechar='"')
        do_once = False
        for row in spamreader:
            # remove last updated and harvest date
            del row[-2:]
            # skip first line
            if do_once is True:
                tup = tuple(map(lambda x: x if x != "" else None, row))
                connector.execute_ex(insert_query, tup)
            else:
                do_once = True
    connector.close_connection()
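# Hedged usage sketch (not part of the original repository): one way setup_tables might be
# invoked from a test. The CSV path, CREATE TABLE statement and INSERT statement below are
# hypothetical placeholders. Note that setup_tables drops the last two CSV columns
# (last-updated and harvest date), so the INSERT only covers the remaining fields.
EXAMPLE_TABLE = ("CREATE TABLE `example_dblp` ("
                 " `dblp_key` varchar(100) NOT NULL,"
                 " `title` TEXT,"
                 " PRIMARY KEY (`dblp_key`)"
                 ") ENGINE=InnoDB CHARSET=utf8mb4")
EXAMPLE_INSERT = "INSERT INTO example_dblp (dblp_key, title) VALUES (%s, %s)"

setup_tables("test_data/example_dblp.csv", EXAMPLE_TABLE, EXAMPLE_INSERT)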
def tearDown(self):
    x = MariaDb(db=DB_NAME)
    try:
        x.execute_ex("DROP TABLE IF EXISTS `muhz`")
    except Exception as e:
        print(e)
def test_create_table(self):
    x = MariaDb()
    x.create_database(DB_NAME)
    # fill in the configured storage engine for the ENGINE placeholder
    storage_engine = get_config("MISC")["storage_engine"]
    x.createTable("test", ("CREATE TABLE `muhz` ("
                           " `dblp_key` varchar(100) NOT NULL,"
                           " PRIMARY KEY (`dblp_key`)"
                           ") ENGINE={} CHARSET=utf8mb4").format(storage_engine))
    x.close_connection()
def tearDownClass(cls):
    connector = MariaDb(db="test_storage")
    connector.execute_ex(
        "ALTER TABLE test_storage.ingester_cluster DROP INDEX cluster_ft_idx", ())
    connector.execute_ex(
        "ALTER TABLE test_storage.ingester_authors_model DROP INDEX authors_model_ft_idx", ())
    connector.close_connection()
def setUpClass(cls):
    connector = MariaDb(db="test_storage")
    connector.execute_ex(
        "CREATE FULLTEXT INDEX cluster_ft_idx ON test_storage.ingester_cluster (name)", ())
    connector.execute_ex(
        "CREATE FULLTEXT INDEX authors_model_ft_idx ON test_storage.ingester_authors_model (block_name)", ())
    connector.close_connection()
def setup(TABLE_NAME):
    """
    create database and table structure
    :return:
    """
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    # create database
    connector.create_database(DB_NAME)
    connector.createTable(TABLE_NAME, TITLES.format(TABLE_NAME, storage_engine))
    connector.close_connection()
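# Illustrative sketch only: the real TITLES constant is defined elsewhere in the project and
# is not shown here. Based on the NORMAL_TITLES pattern above, it presumably carries two
# format placeholders -- the table name and the storage engine -- roughly along these lines:
TITLES_EXAMPLE = ("CREATE TABLE `{}` ("
                  " `Id` INT NOT NULL AUTO_INCREMENT,"
                  " `title` TEXT,"
                  " PRIMARY KEY (`Id`)"
                  ") ENGINE={} CHARSET=utf8mb4")

# With such a template, setup("my_titles") would create DB_NAME (if missing) plus a `my_titles` table.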
def test_execute_ex(self):
    x = MariaDb()
    x.create_database(DB_NAME)
    # fill in the configured storage engine for the ENGINE placeholder
    storage_engine = get_config("MISC")["storage_engine"]
    x.createTable("test", ("CREATE TABLE `muhz` ("
                           " `ID` int NOT NULL AUTO_INCREMENT,"
                           " `dblp_key` varchar(100) NOT NULL,"
                           " PRIMARY KEY (`ID`)"
                           ") ENGINE={} CHARSET=utf8mb4").format(storage_engine))
    # query parameters must be passed as a tuple, hence the trailing comma
    idx = x.execute_ex("INSERT INTO muhz (dblp_key) VALUES (%s)", ('mi',))
    self.assertEqual(idx, 1)
    x.close_connection()
def setUp(self):
    self.connector = MariaDb(db="test_storage")
    storage_engine = get_config("MISC")["storage_engine"]
    # create tables for both sources arxiv and dblp
    self.connector.createTable("dblparticle", DBLP_ARTICLE.format(storage_engine))
    self.connector.createTable("arxivarticle", ARXIV_ARTICLE.format(storage_engine))
    # insert data
    dblp_article = (
        "dblpkey",                               # key
        "2011-11-11",                            # mdate
        "Andreas Anders;Bertha Theresa Balte;",  # authors
        "The Ultimate Title",                    # title
        "10-14",                                 # pages
        datetime.date(2005, 1, 1),               # pub year
        "2",                                     # volume
        "journal of stuff",                      # journal
        "3",                                     # journal number
        "http://google.de",                      # doi
        "http://unused.com",                     # unused url
        None,                                    # cite
        None,                                    # crossref
        None,                                    # booktitle
        None,                                    # school
        None,                                    # address
        None,                                    # publisher
        None,                                    # isbn
        None,                                    # series
        "article"                                # type
    )
    arxiv_article = (
        "arxivkey",                              # identifier
        "2007-07-07",                            # created
        "2008-08-08",                            # updated
        "Andreas Anders;Bertha Theresa Balte;Carim Chass Jr.;",  # authors
        "The Ultimate Title!",                   # title
        None,                                    # mscclass
        None,                                    # acmclass
        None,                                    # reportno
        None,                                    # journalref
        None,                                    # comments
        "this is a test",                        # description
        "category",                              # categories
        "http://google.com",                     # doi
        "2009-09-09"                             # mdate
    )
    self.connector.execute_ex(ADD_DBLP_ARTICLE, dblp_article)
    self.connector.execute_ex(ADD_ARXIV, arxiv_article)
def get_table_data(table, null_dates=True):
    credentials = dict(get_config("MARIADBX"))
    # connect to database
    connector = MariaDb(credentials)
    connector.connector.database = TESTDB
    # fetch everything
    query = "SELECT * FROM test_storage.{}".format(table)
    connector.cursor.execute(query)
    print(query)
    result = set()
    for dataset in connector.cursor:
        print(dataset)
        tmp = ()
        for element in dataset:
            # overwrite timestamps with generic date for easier testing
            if null_dates and isinstance(element, datetime.datetime):
                tmp += (datetime.datetime(1990, 1, 1, 1, 1, 1), )
            else:
                tmp += (element, )
        result.add(tmp)
    connector.close_connection()
    return result
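# Hedged usage sketch (hypothetical test, not from the original repository): get_table_data
# returns a set of row tuples, so a test can compare it directly against an expected set.
# Timestamps are rewritten to 1990-01-01 01:01:01 when null_dates is True, which keeps the
# expected values stable across runs. The table name and expected row below are made up.
def example_get_table_data_assertion(self):
    expected = {
        ("dblpkey", "The Ultimate Title", datetime.datetime(1990, 1, 1, 1, 1, 1)),
    }
    self.assertEqual(get_table_data("ingester_example_table"), expected)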
def setUp(self):
    self.connector = MariaDb(db="test_storage")
    storage_engine = get_config("MISC")["storage_engine"]
    self.connector.createTable("dblparticle", DBLP_ARTICLE.format(storage_engine))
from oai.queries import OAI_DATASET
from mysqlWrapper.mariadb import MariaDb

DB_NAME = 'oaimph'

credentials = {
    'user': '******',
    'password': '******',
    'host': '127.0.0.1',
}

try:
    database = MariaDb(credentials)
except Exception as err:
    print(err)
else:
    database.create_db(DB_NAME)
    database.createTable("oaimph", OAI_DATASET)
    database.close_connection()
def ingest_data(ingester_obj):
    if isinstance(ingester_obj, Iingester) is False:
        raise IIngester_Exception("Object is not of type IIngester")

    pub_added = 0
    pub_limbo = 0
    pub_duplicate = 0
    logger = logging.getLogger("ingester.{}".format(ingester_obj.get_name()))
    # mysql connector to read from harvester
    read_connector = MariaDb()
    write_connector = MariaDb()
    try:
        read_connector.cursor.execute(ingester_obj.get_query())
    except Exception as e:
        raise IIngester_Exception(e)

    globl_url_obj = global_url.objects.get(id=1)
    ingester_obj_global_url = ingester_obj.get_global_url()
    start_track = time()
    for query_dataset in read_connector.cursor:
        mapping = ingester_obj.mapping_function(query_dataset)
        write_connector.execute_ex(ingester_obj.update_harvested(), (mapping["local_url"],))
        try:
            # 1. get Harvester specific record and parse to common-form dict
            # ------------------------- LOCAL_URL -------------------------
            # check for duplicates by looking up the local URL
            try:
                local_url.objects.get(url=mapping["local_url"],
                                      global_url=ingester_obj_global_url)
                logger.info("%s: skip duplicate", mapping["local_url"])
                pub_duplicate += 1
                continue
            except ObjectDoesNotExist:
                pass
            # 2. create local url entry for record
            type_obj = match_type(mapping["publication"]["type_ids"])
            source_lurl_obj = local_url.objects.create(url=mapping["local_url"],
                                                       global_url=ingester_obj.get_global_url(),
                                                       type=type_obj)
            # ------------------------- MATCHINGS -------------------------
            # 3. find matching cluster for title and matching existing authors
            title_match = match_title2(mapping["publication"]["title"])
            author_matches = advanced_author_match(mapping["authors"])
            author_valid = True
            for author in author_matches:
                if author["status"] == Status.LIMBO:
                    author_valid = False
                    break
            # 4. ambiguous matching, push into limbo and delete local url record
            if title_match["status"] == Status.LIMBO or author_valid is False:
                logger.info("%s: Ambiguous title/authors", mapping["local_url"])
                source_lurl_obj.delete()
                push_limbo(mapping, author_matches, str(title_match["reason"]))
                pub_limbo += 1
                write_connector.execute_ex(ingester_obj.update_harvested(), (mapping["local_url"],))
                continue
            # ------------------------ CREATION ---------------------------
            cluster_name = normalize_title(mapping["publication"]["title"])
            pub_medium_obj = match_pub_medium(mapping["pub_release"], source_lurl_obj)
            author_ids = create_authors(author_matches, mapping["authors"], source_lurl_obj)
            keyword_obj = match_keywords(mapping["keywords"], source_lurl_obj)
            cluster_obj = create_title(title_match, cluster_name)
            # 5. create default publication / or find existing one and link with authors and cluster
            def_pub_obj, def_url_obj = create_publication(globl_url_obj, cluster_obj, author_ids,
                                                          type_obj, pub_medium_obj, keyword_obj)
            # update local url with pub_medium_obj and study field
            source_lurl_obj.medium = pub_medium_obj
            source_lurl_obj.save()
            # 6. get / create diff tree
            mapping['publication']['url_id'] = source_lurl_obj.id
            # handle if there is no pub_medium
            if pub_medium_obj is not None:
                mapping['publication']['pub_source_ids'] = pub_medium_obj.id
            if len(keyword_obj) > 0:
                key_id_list = []
                for keyword in keyword_obj:
                    key_id_list.append(keyword.id)
                mapping['publication']['keyword_ids'] = key_id_list or None
            mapping['publication']['type_ids'] = type_obj.id
            diff_tree = update_diff_tree(def_pub_obj, mapping['publication'], author_ids)
            # 7. get default values from diff tree and re-serialize tree
            publication_values = get_default_values(diff_tree)
            get_default_ids(diff_tree, def_url_obj)
            serialized_tree = serialize_diff_store(diff_tree)
            # set missing values that are not default
            publication_values["differences"] = serialized_tree
            publication_values["cluster"] = cluster_obj
            publication_values["url"] = def_url_obj
            publication_values["date_published"] = datetime.date(publication_values["date_published"], 1, 1)
            # 8. store publication
            for key, value in publication_values.items():
                setattr(def_pub_obj, key, value)
            def_pub_obj.save()
            logger.debug("%s: Publication added %s", mapping["local_url"], def_pub_obj)
            # 9. set references for publication
            ingester_obj.set_reference(source_lurl_obj, mapping['local_url'])
            pub_added += 1
        except Exception as e:
            print(e)
            logger.error("%s: %s", mapping["local_url"], e)
            continue

    end_track = time()
    logger.error("TRACK:{}".format(end_track - start_track))
    logger.debug("Terminate ingester %s", ingester_obj.get_name())
    logger.info("publications added %s / limbo %s / skipped %s", pub_added, pub_limbo, pub_duplicate)
    read_connector.close_connection()
    write_connector.close_connection()
    return pub_added
def test_create_db(self):
    x = MariaDb()
    x.create_database(DB_NAME)
    # creating the same database a second time should not raise
    x.create_database(DB_NAME)
    x.close_connection()
def test_init_success(self):
    MariaDb()