Example #1
def setup():
    AUTHORS_TABLE = ("CREATE TABLE `Authors` ("
                     " `Id` INT NOT NULL,"
                     " `main_name` TEXT,"
                     " `normal_name` TEXT,"
                     " `metaphone_name` TEXT,"
                     "  PRIMARY KEY (`Id`)"
                     ") ENGINE={} CHARSET=utf8mb4")
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    connector.create_database(DB_NAME)
    connector.createTable("authors", AUTHORS_TABLE.format(storage_engine))
    try:
        connector.execute_ex(
            "CREATE FULLTEXT INDEX main_name_idx ON Authors (main_name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX normal_name_idx ON Authors (normal_name)",
            ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX metaphone_name_idx ON Authors (metaphone_name)",
            ())
    except Exception:
        print("Index already exists")

    connector.close_connection()
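All of these snippets go through the project's MariaDb wrapper (Example #12 imports it from mysqlWrapper.mariadb). The wrapper's implementation is not shown on this page; purely as orientation, a minimal sketch of the interface the examples exercise might look like the following. The method bodies are assumptions, not the project's actual code:

import mysql.connector


class MariaDb:
    """Sketch of the wrapper interface used throughout these examples."""

    def __init__(self, credentials=None, db=None):
        # connect with explicit credentials or fall back to driver defaults
        self.connector = mysql.connector.connect(database=db,
                                                 **(credentials or {}))
        self.cursor = self.connector.cursor()

    def create_database(self, name):
        self.cursor.execute("CREATE DATABASE IF NOT EXISTS {}".format(name))
        self.connector.database = name

    create_db = create_database  # both spellings appear in the examples

    def createTable(self, label, query):
        # 'label' is only a human-readable name, e.g. for logging
        self.cursor.execute(query)

    def execute_ex(self, query, args=()):
        self.cursor.execute(query, args)
        self.connector.commit()
        return self.cursor.lastrowid  # Example #8 relies on this return value

    def close_connection(self):
        self.cursor.close()
        self.connector.close()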
Example #2
def setup():
    """
    create database and table structure
    :return:
    """
    connector = MariaDb()

    storage_engine = get_config("MISC")["storage_engine"]

    # create database
    connector.create_database(DB_NAME)
    connector.createTable("dates", DATE_TABLE.format(storage_engine))
    connector.createTable("publication year", PUB_YEAR_TABLE.format(storage_engine))
    connector.createTable("popular_words", POPULAR.format(storage_engine))
    connector.createTable("title_length", TITLE_LENGTH.format(storage_engine))
    connector.createTable("popular names", N_POPULAR.format(storage_engine))
    connector.createTable("number authors", NUM_AUTHOR.format(storage_engine))
    connector.createTable("Authors", AUTHORS.format(storage_engine))
    connector.createTable("Normal Titles", NORMAL_TITLES.format(storage_engine))
    connector.createTable("Career",CAREER.format(storage_engine))
    # create index
    try:
        connector.execute_ex("CREATE FULLTEXT INDEX title_idx  ON normal_title (titles)", ())
    except:
        print("Index already exists")

    connector.close_connection()
Example #3
def setup_tables(filename, table_query, insert_query):
    # load the test configuration
    credentials = dict(get_config("MARIADBX"))
    # setup database
    connector = MariaDb(credentials)
    connector.create_db(TESTDB)
    connector.connector.database = TESTDB
    connector.createTable("test dblp table", table_query)

    # setup test ingester database
    # setup_database(TESTDB)
    # import records from csv
    with open(filename, newline='', encoding='utf-8') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=';', quotechar='"')
        do_once = False
        for row in spamreader:
            # drop the last two columns (last updated and harvest date)
            del row[-2:]
            # the first iteration only sets the flag, skipping the header line
            if do_once:
                # empty CSV fields are stored as NULL
                tup = tuple(x if x != "" else None for x in row)
                connector.execute_ex(insert_query, tup)
            else:
                do_once = True
    connector.close_connection()
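Since csv.reader returns a plain iterator, the do_once flag can also be avoided by consuming the header row up front. An equivalent, slightly shorter version of the import loop, under the same assumptions about the file layout:

    with open(filename, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=';', quotechar='"')
        next(reader, None)  # discard the header row
        for row in reader:
            del row[-2:]  # drop the last-updated and harvest-date columns
            connector.execute_ex(insert_query,
                                 tuple(x if x != "" else None for x in row))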
Example #4
 def test_create_table(self):
     x = MariaDb()
     x.create_database(DB_NAME)
     storage_engine = get_config("MISC")["storage_engine"]
     x.createTable("test", ("CREATE TABLE `muhz` ("
                            "  `dblp_key` varchar(100) NOT NULL,"
                            "  PRIMARY KEY (`dblp_key`)"
                            ") ENGINE={} CHARSET=utf8mb4").format(storage_engine))
     x.close_connection()
Example #5
 def tearDownClass(cls):
     connector = MariaDb(db="test_storage")
     connector.execute_ex(
         "ALTER TABLE test_storage.ingester_cluster DROP INDEX cluster_ft_idx",
         ())
     connector.execute_ex(
         "ALTER TABLE test_storage.ingester_authors_model DROP INDEX authors_model_ft_idx",
         ())
     connector.close_connection()
Example #6
 def setUpClass(cls):
     connector = MariaDb(db="test_storage")
     connector.execute_ex(
         "CREATE FULLTEXT INDEX cluster_ft_idx ON test_storage.ingester_cluster (name)",
         ())
     connector.execute_ex(
         "CREATE FULLTEXT INDEX authors_model_ft_idx ON test_storage.ingester_authors_model (block_name)",
         ())
     connector.close_connection()
Example #7
def setup(TABLE_NAME):
    """
    create database and table structure
    :return:
    """
    connector = MariaDb()
    storage_engine = get_config("MISC")["storage_engine"]
    # create database
    connector.create_database(DB_NAME)
    connector.createTable(TABLE_NAME, TITLES.format(TABLE_NAME, storage_engine))
    connector.close_connection()
Example #8
 def test_execute_ex(self):
     x = MariaDb()
     x.create_database(DB_NAME)
     storage_engine = get_config("MISC")["storage_engine"]
     x.createTable("test", ("CREATE TABLE `muhz` ("
                            "  `ID` int NOT NULL AUTO_INCREMENT,"
                            "  `dblp_key` varchar(100) NOT NULL,"
                            "  PRIMARY KEY (`ID`)"
                            ") ENGINE={} CHARSET=utf8mb4").format(storage_engine))
     idx = x.execute_ex("INSERT INTO muhz (dblp_key) VALUES (%s)", ('mi',))
     self.assertEqual(idx, 1)
     x.close_connection()
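Note the trailing comma in ('mi',): parentheses alone do not create a tuple, so ('mi') is just the string 'mi', and MySQL drivers that expect a sequence of query parameters will reject it or try to bind each character separately. A quick check:

>>> type(('mi'))
<class 'str'>
>>> type(('mi',))
<class 'tuple'>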
Example #9
def get_table_data(table, null_dates=True):
    credentials = dict(get_config("MARIADBX"))
    # connect to database
    connector = MariaDb(credentials)
    connector.connector.database = TESTDB
    # fetch everything
    query = "SELECT * FROM test_storage.{}".format(table)
    connector.cursor.execute(query)
    print(query)
    result = set()
    for dataset in connector.cursor:
        print(dataset)
        tmp = ()
        for element in dataset:
            # overwrite timestamps with generic date for easier testing
            if null_dates and isinstance(element, datetime.datetime):
                tmp += (datetime.datetime(1990, 1, 1, 1, 1, 1), )
            else:
                tmp += (element, )
        result.add(tmp)
    connector.close_connection()
    return result
Example #10
class TestIngesterMulti2(TransactionTestCase):
    fixtures = [os.path.join(ingester_path, "fixtures", "initial_data.json")]

    @classmethod
    def setUpClass(cls):
        connector = MariaDb(db="test_storage")
        connector.execute_ex(
            "CREATE FULLTEXT INDEX cluster_ft_idx ON test_storage.ingester_cluster (name)",
            ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX authors_model_ft_idx ON test_storage.ingester_authors_model (block_name)",
            ())
        connector.close_connection()

    @classmethod
    def tearDownClass(cls):
        connector = MariaDb(db="test_storage")
        connector.execute_ex(
            "ALTER TABLE test_storage.ingester_cluster DROP INDEX cluster_ft_idx",
            ())
        connector.execute_ex(
            "ALTER TABLE test_storage.ingester_authors_model DROP INDEX authors_model_ft_idx",
            ())
        connector.close_connection()

    def setUp(self):
        self.connector = MariaDb(db="test_storage")
        storage_engine = get_config("MISC")["storage_engine"]
        # create tables for both sources arxiv and dblp
        self.connector.createTable("dblparticle",
                                   DBLP_ARTICLE.format(storage_engine))
        self.connector.createTable("arxivarticle",
                                   ARXIV_ARTICLE.format(storage_engine))
        # insert data
        dblp_article = (
            "dblpkey",  # key
            "2011-11-11",  # mdate
            "Andreas Anders;Bertha Theresa Balte;",  # authors
            "The Ultimate Title",  # title
            "10-14",  # pages
            datetime.date(2005, 1, 1),  # pub year
            "2",  # volume
            "journal of stuff",  # journal
            "3",  # journal number
            "http://google.de",  # doi
            "http://unused.com",  # unused url
            None,  # cite
            None,  # crossref
            None,  # booktitle
            None,  # school
            None,  # address
            None,  # publisher
            None,  # isbn
            None,  # series
            "article"  # type
        )

        arxiv_article = (
            "arxivkey",  # identifier
            "2007-07-07",  # created
            "2008-08-08",  # updated
            "Andreas Anders;Bertha Theresa Balte;Carim Chass Jr.;",  # authors
            "The Ultimate Title!",  # title
            None,  # mscclass
            None,  # acmclass
            None,  # reportno
            None,  # journalref
            None,  # comments
            "this is a test",  # description
            "category",  # categories
            "http://google.com",  # doi
            "2009-09-09"  # mdate
        )

        self.connector.execute_ex(ADD_DBLP_ARTICLE, dblp_article)
        self.connector.execute_ex(ADD_ARXIV, arxiv_article)

    def tearDown(self):
        self.connector.execute_ex("DROP TABLE test_storage.arxiv_articles")
        self.connector.execute_ex("DROP TABLE test_storage.dblp_article")
        self.connector.close_connection()

    def test_success_reversed(self):
        dblpingester = DblpIngester("dblp.ingester",
                                    harvesterdb="test_storage")
        arxivingester = ArxivIngester("arxiv.ingester",
                                      harvester_db="test_storage")

        # arxiv first then dblp
        result2 = ingest_data(arxivingester)
        self.assertEqual(result2, 1)
        result = ingest_data(dblpingester)
        self.assertEqual(result, 1)

        # check all tables
        self.assertEqual(cluster.objects.count(), 1)
        self.assertEqual(publication.objects.count(), 1)
        self.assertEqual(local_url.objects.count(), 3)
        self.assertEqual(global_url.objects.count(), 4)
        self.assertEqual(limbo_authors.objects.count(), 0)
        self.assertEqual(limbo_pub.objects.count(), 0)
        self.assertEqual(pub_medium.objects.count(), 1)
        # check local url
        dblp_url = local_url.objects.get(id=3)
        pub_url = local_url.objects.get(id=2)
        arxiv_url = local_url.objects.get(id=1)
        self.assertEqual(dblp_url.test(), [
            3, "dblpkey", 1,
            publication_type.objects.get(name="article").id, None
        ])
        self.assertEqual(arxiv_url.test(), [
            4, "arxivkey", None,
            publication_type.objects.get(name="misc").id, None
        ])
        self.assertEqual(pub_url.test(), [
            1, "TODO PLATZHALTER", 1,
            publication_type.objects.get(name="misc").id, None
        ])
        # check authors
        self.assertEqual(authors_model.objects.count(), 3)
        self.assertEqual(author_aliases.objects.count(), 3)
        self.assertEqual(author_alias_source.objects.count(), 5)
        # publication authors
        self.assertEqual(publication_author.objects.count(), 8)
        # check publication
        publ = publication.objects.first()
        self.assertEqual(publ.title, "The Ultimate Title!")  # from Arxiv
        self.assertEqual(publ.pages, "10-14")  # DBLP
        self.assertEqual(publ.note, None)
        self.assertEqual(publ.doi, "http://google.com")  # Arxiv
        self.assertEqual(publ.abstract, "this is a test")  # arxiv
        self.assertEqual(publ.copyright, None)
        self.assertEqual(publ.date_added, None)
        self.assertEqual(publ.date_published,
                         datetime.date(2007, 1, 1))  # Arxiv created date
        self.assertEqual(publ.volume, "2")  # DBLP
        self.assertEqual(publ.number, "3")  # DBLP
        # check diff tree
        diff = deserialize_diff_store(publ.differences)
        self.assertEqual(diff["url_id"], [1, 3])
        self.assertEqual(diff["doi"], [{
            "bitvector": 1,
            "votes": 0,
            "value": "http://google.com"
        }, {
            "bitvector": 2,
            "votes": 0,
            "value": "http://google.de"
        }])
        self.assertEqual(diff["copyright"], [])
        self.assertEqual(diff["type_ids"], [{
            "bitvector": 1,
            "votes": 0,
            "value": 2
        }, {
            "bitvector": 2,
            "votes": 0,
            "value": 1
        }])
        self.assertEqual(diff["pages"], [{
            "bitvector": 2,
            "votes": 0,
            "value": "10-14"
        }])

        self.assertEqual(OpenReferences.objects.first().test(),
                         [1, 'arxivkey', None])
Example #11
class TestIngester(TransactionTestCase):
    fixtures = [os.path.join(ingester_path, "fixtures", "initial_data.json")]

    @classmethod
    def setUpClass(cls):
        connector = MariaDb(db="test_storage")
        connector.execute_ex("CREATE FULLTEXT INDEX cluster_ft_idx  ON test_storage.ingester_cluster (name)", ())
        connector.execute_ex(
            "CREATE FULLTEXT INDEX authors_model_ft_idx ON test_storage.ingester_authors_model (block_name)", ())
        connector.close_connection()

    @classmethod
    def tearDownClass(cls):
        connector = MariaDb(db="test_storage")
        connector.execute_ex("ALTER TABLE test_storage.ingester_cluster DROP INDEX cluster_ft_idx", ())
        connector.execute_ex("ALTER TABLE test_storage.ingester_authors_model DROP INDEX authors_model_ft_idx", ())
        connector.close_connection()

    def setUp(self):
        self.connector = MariaDb(db="test_storage")
        storage_engine = get_config("MISC")["storage_engine"]
        self.connector.createTable("dblparticle", DBLP_ARTICLE.format(storage_engine))

    def tearDown(self):
        self.connector.execute_ex("DROP TABLE test_storage.dblp_article")
        self.connector.close_connection()

    def test_invalid_ingester(self):
        setup_tables(os.path.join(test_path, "dblp_test1.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        self.assertRaises(IIngester_Exception, ingest_data, datetime.datetime(1990, 1, 1, 1, 1, 1))

    def test_success(self):
        setup_tables(os.path.join(test_path, "dblp_test1.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        self.assertEqual(ingester.get_global_url().id, 3)
        result = ingest_data(ingester)
        self.assertEqual(result, 2)
        # check local url
        self.assertEqual(local_url.objects.get(id=1).test(), [3, 'journals/acta/AkyildizB89', 1, 1, None])
        self.assertEqual(local_url.objects.get(id=2).test(), [1, 'TODO PLATZHALTER', 1, 1, None])
        self.assertEqual(local_url.objects.get(id=3).test(), [3, 'journals/acta/VoglerS014', 1, 1, None])
        self.assertEqual(local_url.objects.get(id=4).test(), [1, 'TODO PLATZHALTER', 1, 1, None])
        # check authors_model
        self.assertEqual(authors_model.objects.get(id=1).test(), ["Ian F. Akyildiz", "ian f akyildiz"])
        self.assertEqual(authors_model.objects.get(id=2).test(), ["Horst von Brand", "horst von brand"])
        self.assertEqual(authors_model.objects.get(id=3).test(), ["Walter Vogler", "walter vogler"])
        self.assertEqual(authors_model.objects.get(id=4).test(), ["Christian Stahl", "christian stahl"])
        self.assertEqual(authors_model.objects.get(id=5).test(), ["Richard Müller", "richard muller"])
        # check author alias
        self.assertEqual(author_aliases.objects.get(id=1).test(), [1, "Ian F. Akyildiz"])
        self.assertEqual(author_aliases.objects.get(id=2).test(), [2, "Horst von Brand"])
        self.assertEqual(author_aliases.objects.get(id=3).test(), [3, "Walter Vogler"])
        self.assertEqual(author_aliases.objects.get(id=4).test(), [4, "Christian Stahl"])
        self.assertEqual(author_aliases.objects.get(id=5).test(), [5, "Richard Müller 0001"])
        self.assertEqual(author_aliases.objects.get(id=6).test(), [5, "Richard Müller"])
        # cluster
        self.assertEqual(cluster.objects.get(id=1).name, "bla bla bla")
        self.assertEqual(cluster.objects.get(id=2).name, "kam kim kum")
        # author alias source
        self.assertEqual(author_alias_source.objects.get(id=1).test(), [1, 1])
        self.assertEqual(author_alias_source.objects.get(id=2).test(), [2, 1])
        self.assertEqual(author_alias_source.objects.get(id=3).test(), [3, 3])
        self.assertEqual(author_alias_source.objects.get(id=4).test(), [4, 3])
        self.assertEqual(author_alias_source.objects.get(id=5).test(), [5, 3])
        self.assertEqual(author_alias_source.objects.get(id=6).test(), [6, 3])
        # publication authors
        self.assertEqual(publication_author.objects.get(id=1).test(), [1, 1, 0])
        self.assertEqual(publication_author.objects.get(id=2).test(), [1, 2, 1])
        self.assertEqual(publication_author.objects.get(id=3).test(), [2, 1, 0])
        self.assertEqual(publication_author.objects.get(id=4).test(), [2, 2, 1])
        self.assertEqual(publication_author.objects.get(id=5).test(), [3, 3, 0])
        self.assertEqual(publication_author.objects.get(id=6).test(), [3, 4, 1])
        self.assertEqual(publication_author.objects.get(id=7).test(), [3, 5, 2])
        self.assertEqual(publication_author.objects.get(id=8).test(), [4, 3, 0])
        self.assertEqual(publication_author.objects.get(id=9).test(), [4, 4, 1])
        self.assertEqual(publication_author.objects.get(id=10).test(), [4, 5, 2])

        # limbo
        self.assertEqual(limbo_authors.objects.count(), 0)
        self.assertEqual(limbo_pub.objects.count(), 0)

        # publication
        self.assertEqual(publication.objects.get(id=1).test(), [1, "Bla Bla Bla"])
        self.assertEqual(publication.objects.get(id=2).test(), [2, "Kam? Kim! Kum."])
        # check if last harvested is set
        tmp = list(get_table_data("dblp_article", null_dates=False))
        self.assertEqual(tmp[0][-1].strftime("%Y-%m-%d"), datetime.datetime.now().strftime("%Y-%m-%d"))

        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_success_limit(self):
        setup_tables(os.path.join(test_path, "dblp_test1.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingester.set_limit(1)
        result = ingest_data(ingester)
        self.assertEqual(result, 1)
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_complete_publication(self):
        # this test uses a dataset in which ALL columns are filled, to check
        # that every value is transferred successfully
        setup_tables(os.path.join(test_path, "dblp_test2.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingest_data(ingester)
        publ = publication.objects.first()
        self.assertEqual(publ.title,"title")
        self.assertEqual(publ.pages, "1-5")
        self.assertEqual(publ.doi, "doi")
        self.assertEqual(publ.abstract, None)
        self.assertEqual(publ.copyright, None)
        self.assertEqual(publ.volume, "1")
        self.assertEqual(publ.number, "2")
        self.assertEqual(publ.note, None)
        self.assertEqual(publ.date_added, None)
        self.assertEqual(publ.date_published, datetime.date(1990, 1, 1))
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_limbo_multi_cluster(self):
        setup_tables(os.path.join(test_path, "dblp_test2.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        cluster.objects.bulk_create([
            cluster(id=1, name="title"),
            cluster(id=2, name="title"),
        ])
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingest_data(ingester)
        self.assertEqual(limbo_authors.objects.get(id=1).test(), [1, 'None', "An Author", 0])
        self.assertEqual(limbo_authors.objects.get(id=2).test(), [1, 'None', "Another Author", 1])
        self.assertEqual(local_url.objects.count(), 0)
        limbo = limbo_pub.objects.get(id=1).test_extended()
        print(limbo)
        compare = ['Reason.AMB_CLUSTER', 'key', "title", "1-5", None, "doi", None, None,
                   None, datetime.date(1990, 1, 1), "1", "2", "series",
                   None, "publisher", None, "school", "address",
                   "isbn", None, "booktitle", "journal"]
        self.assertEqual(limbo, compare)

    def test_limbo_multi_pubs(self):
        setup_tables(os.path.join(test_path, "dblp_test2.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        cl = cluster.objects.create(id=1, name="title")
        gurl = global_url.objects.create(id=5, domain="http://dummy.de", url="http://dummy.de")
        lurl = local_url.objects.create(id=1, url="jlkjöl", global_url=gurl)
        publication.objects.bulk_create([
            publication(local_url=lurl, cluster=cl, title="Title"),
            publication(local_url=lurl, cluster=cl, title="Title")
        ])
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingest_data(ingester)
        limbo = limbo_pub.objects.get(id=1).test_extended()
        self.assertEqual(limbo[0], 'Reason.AMB_PUB')
        self.assertEqual(limbo_authors.objects.get(id=1).test(), [1, 'None', "An Author", 0])
        self.assertEqual(limbo_authors.objects.get(id=2).test(), [1, 'None', "Another Author", 1])
        self.assertEqual(local_url.objects.count(), 1)
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_limbo_alias(self):
        setup_tables(os.path.join(test_path, "dblp_test3.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingest_data(ingester)

        self.assertEqual(limbo_pub.objects.count(), 0)
        self.assertEqual(cluster.objects.count(), 3)
        self.assertEqual(authors_model.objects.count(), 5)
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)

    def test_set_last_harvested(self):
        setup_tables(os.path.join(test_path, "dblp_test3.csv"), DBLP_ARTICLE, ADD_DBLP_ARTICLE)
        ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
        ingester.set_limit(1)
        result1 = ingest_data(ingester)
        self.assertEqual(result1, 1)
        ingester.set_limit(3)
        result2 = ingest_data(ingester)
        self.assertEqual(result2, 2)
        # check open references
        self.assertEqual(OpenReferences.objects.count(), 0)
Example #12
from oai.queries import OAI_DATASET
from mysqlWrapper.mariadb import MariaDb

DB_NAME = 'oaimph'
credentials = {
    'user': '******',
    'password': '******',
    'host': '127.0.0.1',
}

try:
    database = MariaDb(credentials)
except Exception as err:
    print(err)
else:
    database.create_db(DB_NAME)
    database.createTable("oaimph", OAI_DATASET)
    database.close_connection()
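One detail this snippet shows well: the else branch of a try statement runs only when the try block raised no exception, so the database and table are created only after the connection attempt succeeded. The same idiom in isolation (open_resource and use are placeholder names, not part of this project):

try:
    resource = open_resource()  # placeholder call that may raise
except Exception as err:
    print(err)  # setup failed: report and stop
else:
    use(resource)  # reached only if open_resource() raised nothing
    resource.close()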
Example #13
def ingest_data(ingester_obj):
    if not isinstance(ingester_obj, Iingester):
        raise IIngester_Exception("Object is not of type IIngester")

    pub_added = 0
    pub_limbo = 0
    pub_duplicate = 0
    logger = logging.getLogger("ingester.{}".format(ingester_obj.get_name()))
    # mysql connector to read from harvester
    read_connector = MariaDb()
    write_connector = MariaDb()
    try:
        read_connector.cursor.execute(ingester_obj.get_query())
    except Exception as e:
        raise IIngester_Exception(e)

    global_url_obj = global_url.objects.get(id=1)
    ingester_obj_global_url = ingester_obj.get_global_url()
    start_track = time()
    for query_dataset in read_connector.cursor:
        mapping = ingester_obj.mapping_function(query_dataset)
        write_connector.execute_ex(ingester_obj.update_harvested(), (mapping["local_url"],))
        try:
            # 1. get Harvester specific record and parse to common-form dict
            # ------------------------- LOCAL_URL ----------------------------------------------------------------------
            # check for duplicates by looking up the local URL
            try:
                local_url.objects.get(url=mapping["local_url"], global_url=ingester_obj_global_url)
                logger.info("%s: skip duplicate", mapping["local_url"])
                pub_duplicate += 1
                continue
            except ObjectDoesNotExist:
                pass

            # 2. create local url entry for record
            type_obj = match_type(mapping["publication"]["type_ids"])
            source_lurl_obj = local_url.objects.create(url=mapping["local_url"],
                                                       global_url=ingester_obj.get_global_url(),
                                                       type=type_obj)
            # ------------------------- MATCHINGS ----------------------------------------------------------------------
            # 3. find matching cluster for title and matching existing authors
            title_match = match_title2(mapping["publication"]["title"])
            author_matches = advanced_author_match(mapping["authors"])

            author_valid = True
            for author in author_matches:
                if author["status"] == Status.LIMBO:
                    author_valid = False
                    break

            # 4. ambiguous matching, push into limbo and delete local url record
            if title_match["status"] == Status.LIMBO or author_valid is False:
                logger.info("%s: Ambiguous title/authors", mapping["local_url"])
                source_lurl_obj.delete()
                push_limbo(mapping, author_matches, str(title_match["reason"]))
                pub_limbo += 1
                write_connector.execute_ex(ingester_obj.update_harvested(), (mapping["local_url"],))
                continue

            # ------------------------ CREATION ------------------------------------------------------------------------
            cluster_name = normalize_title(mapping["publication"]["title"])

            pub_medium_obj = match_pub_medium(mapping["pub_release"], source_lurl_obj)
            author_ids = create_authors(author_matches, mapping["authors"], source_lurl_obj)
            keyword_obj = match_keywords(mapping["keywords"], source_lurl_obj)
            cluster_obj = create_title(title_match, cluster_name)
            # 5.create default publication / or find existing one and link with authors and cluster
            def_pub_obj, def_url_obj = create_publication(global_url_obj,
                                                          cluster_obj,
                                                          author_ids,
                                                          type_obj,
                                                          pub_medium_obj,
                                                          keyword_obj)
            # update local url with pub_medium_obj and study field
            source_lurl_obj.medium = pub_medium_obj
            source_lurl_obj.save()
            # 6.get /create diff tree
            mapping['publication']['url_id'] = source_lurl_obj.id
            # handle if there is no pub_medium
            if pub_medium_obj is not None:
                mapping['publication']['pub_source_ids'] = pub_medium_obj.id
            if len(keyword_obj) > 0:
                key_id_list = []
                for keyword in keyword_obj:
                    key_id_list.append(keyword.id)
                mapping['publication']['keyword_ids'] = key_id_list or None
            mapping['publication']['type_ids'] = type_obj.id
            diff_tree = update_diff_tree(def_pub_obj, mapping['publication'], author_ids)
            # 7.get default values from diff tree and re-serialize tree
            publication_values = get_default_values(diff_tree)
            get_default_ids(diff_tree, def_url_obj)

            serialized_tree = serialize_diff_store(diff_tree)
            # set missing values that are not default
            publication_values["differences"] = serialized_tree
            publication_values["cluster"] = cluster_obj
            publication_values["url"] = def_url_obj
            publication_values["date_published"] = datetime.date(publication_values["date_published"],1,1)
            # 8.store publication
            for key, value in publication_values.items():
                setattr(def_pub_obj, key, value)
            def_pub_obj.save()
            logger.debug("%s: Publication added %s", mapping["local_url"], def_pub_obj)
            # 9.set references for publication
            ingester_obj.set_reference(source_lurl_obj, mapping['local_url'])
            pub_added += 1

        except Exception as e:
            print(e)
            logger.error("%s: %s", mapping["local_url"], e)
            continue
    end_track = time()
    logger.error("TRACK:{}".format(end_track - start_track))
    logger.debug("Terminate ingester %s", ingester_obj.get_name())
    logger.info("publications added %s / limbo %s / skipped %s", pub_added,pub_limbo,pub_duplicate)
    read_connector.close_connection()
    write_connector.close_connection()
    return pub_added
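Tying this back to the tests above, a typical driver for ingest_data is only a few lines (constructor arguments as used in Example #11):

ingester = DblpIngester("dblp.ingester", harvesterdb="test_storage")
ingester.set_limit(100)        # optional: cap the number of records per run
added = ingest_data(ingester)  # returns the number of publications added
print("publications added:", added)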
Example #14
 def test_create_db(self):
     x = MariaDb()
     x.create_database(DB_NAME)
     # creating the same database a second time must not raise
     x.create_database(DB_NAME)
     x.close_connection()