def dblp_simple_test():
    """Parse the small DBLP fixture and check the rows written to the test DB.

    The fixture is parsed into the ``aip_test`` database, after which the
    row counts of ``publications``, ``authors`` and ``author_paper_pairs``
    and the stored author positions are verified.

    The database is cleaned up even when one of the assertions fails, so a
    failing run does not leave rows behind for subsequent tests.
    """
    path = "test_files/dblp1_test.xml"
    parse_dblp.parse(path, database_path="aip_test")

    try:
        with database.db:
            with database.db.cursor() as cursor:
                # Only the first two records should be added, with a total
                # of 8 authors.
                cursor.execute('''SELECT COUNT(*) FROM publications''')
                res = cursor.fetchone()[0]
                assert res == 2

                cursor.execute('''SELECT COUNT(*) FROM authors''')
                res = cursor.fetchone()[0]
                assert res == 8

                cursor.execute('''SELECT COUNT(*) FROM author_paper_pairs''')
                res = cursor.fetchone()[0]
                assert res == 8

                # NOTE(review): there is no ORDER BY, so the comparison below
                # relies on the order in which the database returns the rows.
                cursor.execute('''
                    SELECT author_position FROM author_paper_pairs''')

                # flatmap from:
                # https://stackoverflow.com/questions/1077015/python-list-comprehensions-compressing-a-list-of-lists
                res = [pos for app in cursor.fetchall() for pos in app]

                # Check if the author positions have been correctly added.
                assert res == [1, 2, 3, 4, 1, 2, 3, 4]
    finally:
        # Run the cleanup on failure as well as on success.
        db_cleanup()
def process_file(path):
    """Parse one corpus file into its own temporary database file.

    The parser is selected from substrings of ``path`` (``"dblp.xml"``,
    ``"s2-corpus"`` or ``"aminer_papers"``). The whole operation runs under
    a module-level lock that is created lazily on first use.

    Args:
        path: Path of the corpus file to process.

    Returns:
        A one-element list holding the parser's return value, or
        ``[False]`` when ``path`` matches none of the known corpora.
    """
    global local_lock

    print(path)

    # Create the lock lazily on first use (or when it was reset to None).
    # NOTE(review): this check-then-create step is itself not synchronized.
    if globals().get('local_lock') is None:
        local_lock = threading.Lock()

    ret_value = [False]

    # ``with`` guarantees the lock is released even if a parser or
    # ``os.makedirs`` raises; a bare acquire()/release() pair would leave the
    # lock held forever and block every later call.
    with local_lock:
        # Filename can't start with a number, hence the textual prefix.
        # Use the hash of the node name as database file.
        # NOTE(review): hash() of a str is randomized per interpreter process
        # unless PYTHONHASHSEED is fixed, so the name differs between runs.
        h = 'aipdb' + str(abs(hash(path)))
        print(h)

        tmp_path = os.path.join(tmp_folder, "{}.db".format(h))
        os.makedirs(tmp_folder, exist_ok=True)

        if "dblp.xml" in path:
            ret_value = [parse_dblp.parse(path, tmp_path)]
        elif "s2-corpus" in path:
            ret_value = [
                parse_semantic_scholar.parse_semantic_scholar_corpus_file(
                    path, tmp_path)
            ]
        elif "aminer_papers" in path:
            ret_value = [parse_aminer.parse_aminer_corpus_file(path, tmp_path)]

    return ret_value
def process_file(path, db_file="aip"):
    """Dispatch a corpus file to the parser that matches its path.

    Args:
        path: Path of the corpus file to process.
        db_file: Database name/path handed to the selected parser.

    Returns:
        Whatever the selected parser returns, or ``True`` when ``path``
        matches none of the known corpora (nothing to do).
    """
    # Raw string: ``\w`` and ``\.`` are regex escapes, not string escapes.
    # The pattern requires at least one word character or dash between
    # "dblp" and ".xml", so a plain "dblp.xml" does not match.
    if re.match(r".*dblp[\w-]+\.xml", path):
        return parse_dblp.parse(path, db_file)
    elif "s2-corpus" in path:
        return parse_semantic_scholar.parse_semantic_scholar_corpus_file(
            path, db_file)
    elif "aminer_papers" in path:
        return parse_aminer.parse_aminer_corpus_file(path,
                                                     db_file,
                                                     logger_disabled=True)
    return True  # Nothing that should be done.
def process_file(path, db_file=aip_name):
    """Dispatch a corpus file to the parser that matches its path.

    The Aminer and MAG parsers are timed and the elapsed wall-clock time is
    printed after they finish.

    Args:
        path: Path of the corpus file to process.
        db_file: Database name/path handed to the selected parser.

    Returns:
        Whatever the selected parser returns, or ``True`` when ``path``
        matches none of the known corpora (nothing to do).
    """
    # Raw string: ``\w`` and ``\.`` are regex escapes, not string escapes.
    if re.match(r".*dblp[\w-]+\.xml", path):
        return parse_dblp.parse(path, db_file)
    elif "aminer_papers" in path:
        start = time.time()
        ret = parse_aminer.parse_aminer_corpus_file(path,
                                                    db_file,
                                                    logger_disabled=True)
        print("Aminer parse time:", time.time() - start)
        return ret
    elif "mag_papers" in path:
        start = time.time()
        ret = parse_mag.parse_mag_corpus_file(path,
                                              db_file,
                                              logger_disabled=True)
        print("MAG parse time:", time.time() - start)
        return ret
    elif "s2-corpus" in path:
        return parse_semantic_scholar.parse_semantic_scholar_corpus_file(
            path, db_file)
    return True  # Nothing that should be done.