Example #1
def setup():
    NORMAL_TITLES = ("CREATE TABLE `Authors` ("
                     " `Id` INT NOT NULL,"
                     " `main_name` TEXT,"
                     " `normal_name` TEXT,"
                     " `metaphone_name` TEXT,"
                     "  PRIMARY KEY (`Id`)"
                     ") ENGINE= {} CHARSET=utf8mb4")
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    connector.create_database(DB_NAME)
    connector.createTable("dvfds", NORMAL_TITLES.format(storage_engine))
    try:
        connector.execute_ex(
            "CREATE FULLTEXT INDEX main_name_idx  ON Authors (main_name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX normal_name_idx  ON Authors (normal_name)",
            ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX metaphone_name_idx  ON Authors (metaphone_name)",
            ())
    except Exception:
        print("Index already exists")

    connector.close_connection()
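All of these examples revolve around the same small MariaDb wrapper (imported in Example #13 from mysqlWrapper.mariadb). The class itself is not part of this listing, so the following is only a minimal sketch, inferred from how the examples call it, of what such a wrapper might look like on top of mysql-connector-python; method names, defaults, and behavior here are assumptions reconstructed from usage, not the real implementation:

# Minimal sketch of the wrapper interface, inferred from usage in the
# examples on this page; the actual mysqlWrapper.mariadb.MariaDb may differ.
import mysql.connector

class MariaDb:
    def __init__(self, credentials=None, db=None):
        # hypothetical defaults; the real wrapper presumably reads its config
        credentials = dict(credentials or {"user": "root", "password": "", "host": "127.0.0.1"})
        if db is not None:
            credentials["database"] = db
        self.connector = mysql.connector.connect(**credentials)
        self.cursor = self.connector.cursor()

    def create_database(self, db_name):
        # IF NOT EXISTS keeps repeated setup calls harmless (see Example #15)
        self.cursor.execute("CREATE DATABASE IF NOT EXISTS `{}`".format(db_name))
        self.connector.database = db_name

    create_db = create_database  # both names appear in the examples

    def createTable(self, description, query):
        # description is only a human-readable label for logging
        self.cursor.execute(query)

    def execute_ex(self, query, params=()):
        self.cursor.execute(query, params)
        self.connector.commit()
        return self.cursor.lastrowid

    def close_connection(self):
        self.cursor.close()
        self.connector.close()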
Example #2
def setup():
    """
    create database and table structure
    :return:
    """
    connector = MariaDb()

    storage_engine = get_config("MISC")["storage_engine"]

    # create database
    connector.create_database(DB_NAME)
    connector.createTable("dates", DATE_TABLE.format(storage_engine))
    connector.createTable("publication year", PUB_YEAR_TABLE.format(storage_engine))
    connector.createTable("popular_words", POPULAR.format(storage_engine))
    connector.createTable("title_length", TITLE_LENGTH.format(storage_engine))
    connector.createTable("popular names", N_POPULAR.format(storage_engine))
    connector.createTable("number authors", NUM_AUTHOR.format(storage_engine))
    connector.createTable("Authors", AUTHORS.format(storage_engine))
    connector.createTable("Normal Titles", NORMAL_TITLES.format(storage_engine))
    connector.createTable("Career",CAREER.format(storage_engine))
    # create index
    try:
        connector.execute_ex("CREATE FULLTEXT INDEX title_idx  ON normal_title (titles)", ())
    except Exception:
        print("Index already exists")

    connector.close_connection()
Example #3
def setup_tables(filename, table_query, insert_query):
    # load testconfig
    credentials = dict(get_config("MARIADBX"))
    # setup database
    connector = MariaDb(credentials)
    connector.create_db(TESTDB)
    connector.connector.database = TESTDB
    connector.createTable("test dblp table", table_query)

    # setup test ingester database
    # setup_database(TESTDB)
    # import records from csv
    with open(filename, newline='', encoding='utf-8') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=';', quotechar='"')
        do_once = False
        for row in spamreader:
            # remove last updated and harvest date
            del row[-2:]
            # skip first line
            if do_once:
                tup = tuple(map(lambda x: x if x != "" else None, row))
                connector.execute_ex(insert_query, tup)
            else:
                do_once = True
    connector.close_connection()
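A side note on the header-skipping loop in Example #3: the do_once flag works, but consuming the header row with next() before the loop reads more directly. An equivalent sketch of the import loop (assuming, as above, that insert_query has one placeholder per remaining CSV column):

    import csv

    with open(filename, newline='', encoding='utf-8') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=';', quotechar='"')
        next(spamreader, None)  # skip the header row instead of tracking a flag
        for row in spamreader:
            del row[-2:]  # drop the last-updated and harvest-date columns
            # map empty strings to None so they are stored as SQL NULLs
            tup = tuple(x if x != "" else None for x in row)
            connector.execute_ex(insert_query, tup)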
Example #4
 def tearDown(self):
     x = MariaDb(db=DB_NAME)
     try:
         x.execute_ex("DROP TABLE IF EXISTS `muhz`")
     except Exception as e:
         print(e)
Example #5
 def test_create_table(self):
     x = MariaDb()
     x.create_database(DB_NAME)
     x.createTable("test", ("CREATE TABLE `muhz` ("
                            "  `dblp_key` varchar(100) NOT NULL,"
                            "  PRIMARY KEY (`dblp_key`)"
                            ") ENGINE={} CHARSET=utf8mb4"))
     x.close_connection()
Example #6
 def tearDownClass(cls):
     connector = MariaDb(db="test_storage")
     connector.execute_ex(
         "ALTER TABLE test_storage.ingester_cluster DROP INDEX cluster_ft_idx",
         ())
     connector.execute_ex(
         "ALTER TABLE test_storage.ingester_authors_model DROP INDEX authors_model_ft_idx",
         ())
     connector.close_connection()
Example #7
 def setUpClass(cls):
     connector = MariaDb(db="test_storage")
     connector.execute_ex((
         "CREATE FULLTEXT INDEX cluster_ft_idx  ON test_storage.ingester_cluster (name)"
     ), ())
     connector.execute_ex(
         "CREATE FULLTEXT INDEX authors_model_ft_idx ON test_storage.ingester_authors_model (block_name)",
         ())
     connector.close_connection()
Example #8
def setup(TABLE_NAME):
    """
    create database and table structure
    :return:
    """
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    # create database
    connector.create_database(DB_NAME)
    connector.createTable(TABLE_NAME, TITLES.format(TABLE_NAME, storage_engine))
    connector.close_connection()
Example #9
 def test_execute_ex(self):
     x = MariaDb()
     x.create_database(DB_NAME)
     x.createTable("test", ("CREATE TABLE `muhz` ("
                            "  `ID` int NOT NULL AUTO_INCREMENT,"
                            "  `dblp_key` varchar(100) NOT NULL,"
                            "  PRIMARY KEY (`ID`)"
                            ") ENGINE={} CHARSET=utf8mb4"))
     # trailing comma makes ('mi',) a one-element tuple; ('mi') is just a string
     idx = x.execute_ex("INSERT INTO muhz (dblp_key) VALUES (%s)", ('mi',))
     self.assertEqual(idx, 1)
     x.close_connection()
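The parameter in Example #9 is a classic gotcha: parentheses alone do not create a tuple, so ('mi') is just the string 'mi', which the connector would reject or iterate character by character; only the trailing comma makes a one-element tuple. A quick check in the REPL:

>>> type(('mi'))
<class 'str'>
>>> type(('mi',))
<class 'tuple'>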
Example #10
    def setUp(self):
        self.connector = MariaDb(db="test_storage")
        storage_engine = get_config("MISC")["storage_engine"]
        # create tables for both sources arxiv and dblp
        self.connector.createTable("dblparticle",
                                   DBLP_ARTICLE.format(storage_engine))
        self.connector.createTable("arxivarticle",
                                   ARXIV_ARTICLE.format(storage_engine))
        # insert data
        dblp_article = (
            "dblpkey",  # key
            "2011-11-11",  # mdate
            "Andreas Anders;Bertha Theresa Balte;",  # authors
            "The Ultimate Title",  # title
            "10-14",  # pages
            datetime.date(2005, 1, 1),  # pub year
            "2",  # volume
            "journal of stuff",  # journal
            "3",  # journal number
            "http://google.de",  # doi
            "http://unused.com",  # unused url
            None,  # cite
            None,  # crossref
            None,  # booktitle
            None,  # school
            None,  # address
            None,  # publisher
            None,  # isbn
            None,  # series
            "article"  # type
        )

        arxiv_article = (
            "arxivkey",  # identifier
            "2007-07-07",  # created
            "2008-08-08",  # updated
            "Andreas Anders;Bertha Theresa Balte;Carim Chass Jr.;",  # authors
            "The Ultimate Title!",  # title
            None,  # mscclass
            None,  # acmclass
            None,  # reportno
            None,  # journalref
            None,  # comments
            "this is a test",  # description
            "category",  # categories
            "http://google.com",  # doi
            "2009-09-09"  # mdate
        )

        self.connector.execute_ex(ADD_DBLP_ARTICLE, dblp_article)
        self.connector.execute_ex(ADD_ARXIV, arxiv_article)
Example #11
def get_table_data(table, null_dates=True):
    credentials = dict(get_config("MARIADBX"))
    # connect to database
    connector = MariaDb(credentials)
    connector.connector.database = TESTDB
    # fetch everything
    query = "SELECT * FROM test_storage.{}".format(table)
    connector.cursor.execute(query)
    print(query)
    result = set()
    for dataset in connector.cursor:
        print(dataset)
        tmp = ()
        for element in dataset:
            # overwrite timestamps with generic date for easier testing
            if null_dates and isinstance(element, datetime.datetime):
                tmp += (datetime.datetime(1990, 1, 1, 1, 1, 1), )
            else:
                tmp += (element, )
        result.add(tmp)
    connector.close_connection()
    return result
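Because Example #11 returns a set of tuples and rewrites every timestamp to a fixed date, results can be compared order-independently against expected rows. A hypothetical usage in a test (table name and expected row are invented for illustration):

    def test_table_contents(self):
        result = get_table_data("ingester_cluster")
        expected = {
            (1, "the ultimate title", datetime.datetime(1990, 1, 1, 1, 1, 1)),
        }
        self.assertEqual(result, expected)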
Example #12
 def setUp(self):
     self.connector = MariaDb(db="test_storage")
     storage_engine = get_config("MISC")["storage_engine"]
     self.connector.createTable("dblparticle", DBLP_ARTICLE.format(storage_engine))
Example #13
from oai.queries import OAI_DATASET
from mysqlWrapper.mariadb import MariaDb

DB_NAME = 'oaimph'
credentials = {
    'user': '******',
    'password': '******',
    'host': '127.0.0.1',
}

try:
    database = MariaDb(credentials)
except Exception as err:
    print(err)
else:
    database.create_db(DB_NAME)
    database.createTable("oaimph", OAI_DATASET)
    database.close_connection()
Example #14
def ingest_data(ingester_obj):
    if not isinstance(ingester_obj, Iingester):
        raise IIngester_Exception("Object is not of type IIngester")

    pub_added = 0
    pub_limbo = 0
    pub_duplicate = 0
    logger = logging.getLogger("ingester.{}".format(ingester_obj.get_name()))
    # mysql connector to read from harvester
    read_connector = MariaDb()
    write_connector = MariaDb()
    try:
        read_connector.cursor.execute(ingester_obj.get_query())
    except Exception as e:
        raise IIngester_Exception(e)

    globl_url_obj = global_url.objects.get(id=1)
    ingester_obj_global_url = ingester_obj.get_global_url()
    start_track = time()
    for query_dataset in read_connector.cursor:
        mapping = ingester_obj.mapping_function(query_dataset)
        write_connector.execute_ex(ingester_obj.update_harvested(), (mapping["local_url"],))
        try:
            # 1. get Harvester specific record and parse to common-form dict
            # ------------------------- LOCAL_URL ----------------------------------------------------------------------
            # check for duplicates by looking up the local URL
            try:
                local_url.objects.get(url=mapping["local_url"], global_url=ingester_obj_global_url)
                logger.info("%s: skip duplicate", mapping["local_url"])
                pub_duplicate += 1
                continue
            except ObjectDoesNotExist:
                pass

            # 2. create local url entry for record
            type_obj = match_type(mapping["publication"]["type_ids"])
            source_lurl_obj = local_url.objects.create(url=mapping["local_url"],
                                                       global_url=ingester_obj.get_global_url(),
                                                       type=type_obj)
            # ------------------------- MATCHINGS ----------------------------------------------------------------------
            # 3. find matching cluster for title and matching existing authors
            title_match = match_title2(mapping["publication"]["title"])
            author_matches = advanced_author_match(mapping["authors"])

            author_valid = True
            for author in author_matches:
                if author["status"] == Status.LIMBO:
                    author_valid = False
                    break

            # 4. ambiguous matching, push into limbo and delete local url record
            if title_match["status"] == Status.LIMBO or author_valid is False:
                logger.info("%s: Ambiguous title/authors", mapping["local_url"])
                source_lurl_obj.delete()
                push_limbo(mapping, author_matches, str(title_match["reason"]))
                pub_limbo += 1
                write_connector.execute_ex(ingester_obj.update_harvested(), (mapping["local_url"],))
                continue

            # ------------------------ CREATION ------------------------------------------------------------------------
            cluster_name = normalize_title(mapping["publication"]["title"])

            pub_medium_obj = match_pub_medium(mapping["pub_release"], source_lurl_obj)
            author_ids = create_authors(author_matches, mapping["authors"], source_lurl_obj)
            keyword_obj = match_keywords(mapping["keywords"], source_lurl_obj)
            cluster_obj = create_title(title_match, cluster_name)
            # 5.create default publication / or find existing one and link with authors and cluster
            def_pub_obj, def_url_obj = create_publication(globl_url_obj,
                                                          cluster_obj,
                                                          author_ids,
                                                          type_obj,
                                                          pub_medium_obj,
                                                          keyword_obj)
            # update local url with pub_medium_obj and study field
            source_lurl_obj.medium = pub_medium_obj
            source_lurl_obj.save()
            # 6.get /create diff tree
            mapping['publication']['url_id'] = source_lurl_obj.id
            # handle if there is no pub_medium
            if pub_medium_obj is not None:
                mapping['publication']['pub_source_ids'] = pub_medium_obj.id
            if len(keyword_obj) > 0:
                key_id_list = []
                for keyword in keyword_obj:
                    key_id_list.append(keyword.id)
                mapping['publication']['keyword_ids'] = key_id_list or None
            mapping['publication']['type_ids'] = type_obj.id
            diff_tree = update_diff_tree(def_pub_obj, mapping['publication'], author_ids)
            # 7.get default values from diff tree and re-serialize tree
            publication_values = get_default_values(diff_tree)
            get_default_ids(diff_tree, def_url_obj)

            serialized_tree = serialize_diff_store(diff_tree)
            # set missing values that are not default
            publication_values["differences"] = serialized_tree
            publication_values["cluster"] = cluster_obj
            publication_values["url"] = def_url_obj
            publication_values["date_published"] = datetime.date(publication_values["date_published"],1,1)
            # 8.store publication
            for key, value in publication_values.items():
                setattr(def_pub_obj, key, value)
            def_pub_obj.save()
            logger.debug("%s: Publication added %s", mapping["local_url"], def_pub_obj)
            # 9.set references for publication
            ingester_obj.set_reference(source_lurl_obj, mapping['local_url'])
            pub_added += 1

        except Exception as e:
            logger.error("%s: %s", mapping["local_url"], e)
            continue
    end_track = time()
    logger.error("TRACK:{}".format(end_track - start_track))
    logger.debug("Terminate ingester %s", ingester_obj.get_name())
    logger.info("publications added %s / limbo %s / skipped %s", pub_added,pub_limbo,pub_duplicate)
    read_connector.close_connection()
    write_connector.close_connection()
    return pub_added
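Example #14 drives the whole pipeline through the ingester_obj argument, so the Iingester interface can be inferred from the calls made above. The following is a reconstruction from usage, not the actual class definition:

# Sketch of the Iingester contract, inferred from how ingest_data()
# uses its argument; the real base class may differ.
from abc import ABC, abstractmethod

class Iingester(ABC):

    @abstractmethod
    def get_name(self):
        """Short name used for the per-ingester logger."""

    @abstractmethod
    def get_query(self):
        """SELECT statement yielding the harvested records to ingest."""

    @abstractmethod
    def get_global_url(self):
        """global_url model instance identifying this source."""

    @abstractmethod
    def mapping_function(self, query_dataset):
        """Map one result row to the common dict with keys such as
        'local_url', 'publication', 'authors', 'keywords', 'pub_release'."""

    @abstractmethod
    def update_harvested(self):
        """UPDATE statement (parameterized by local_url) marking a
        harvested record as processed."""

    @abstractmethod
    def set_reference(self, source_lurl_obj, local_url):
        """Create reference entries linking the stored publication."""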
Example #15
 def test_create_db(self):
     x = MariaDb()
     # creating the same database twice must not raise; setup is idempotent
     x.create_database(DB_NAME)
     x.create_database(DB_NAME)
     x.close_connection()
Example #16
 def test_init_success(self):
     MariaDb()