def flush_data(args):
    """Bulk-insert rows into the `aidPERSONIDDATA` table in one statement.

    @param args: flat sequence of column values, five per row, in the order
        (personid, tag, data, opt1, opt2)

    Nothing is sent to the database when args is empty.
    """
    if not args:
        # An INSERT with an empty VALUES list is not valid SQL.
        return

    # Floor division keeps the row count an integer regardless of whether
    # "/" means integer or true division in the running interpreter.
    row_count = len(args) // 5
    placeholders = " , ".join(["(%s, %s, %s, %s, %s)"] * row_count)

    run_sql("INSERT INTO `aidPERSONIDDATA` "
            "(`personid`, "
            " `tag`, "
            " `data`, "
            " `opt1`, "
            " `opt2`) "
            "VALUES " + placeholders,
            tuple(args))
def flush_papers(args):
    """Bulk-insert rows into the `aidPERSONIDPAPERS` table in one statement.

    @param args: flat sequence of column values, seven per row, in the order
        (personid, bibref_table, bibref_value, bibrec, name, flag, lcul)

    Nothing is sent to the database when args is empty.
    """
    if not args:
        # An INSERT with an empty VALUES list is not valid SQL.
        return

    # Floor division keeps the row count an integer regardless of whether
    # "/" means integer or true division in the running interpreter.
    row_count = len(args) // 7
    placeholders = " , ".join(["(%s, %s, %s, %s, %s, %s, %s)"] * row_count)

    run_sql("INSERT INTO `aidPERSONIDPAPERS` "
            "(`personid`, "
            " `bibref_table`, "
            " `bibref_value`, "
            " `bibrec`, "
            " `name`, "
            " `flag`, "
            " `lcul`) "
            "VALUES " + placeholders,
            tuple(args))
def _get_ontology(ontology): """Returns the (name, path, url) to the short ontology name. @var ontology: name of the ontology or path to the file or url""" onto_name = onto_path = onto_url = None # first assume we got the path to the file if os.access(ontology, os.R_OK): onto_name = os.path.split(os.path.abspath(ontology))[1] onto_path = os.path.abspath(ontology) onto_url = "" else: # if not, try to find it in a known locations discovered_file = _discover_ontology(ontology) if discovered_file: onto_name = os.path.split(discovered_file)[1] onto_path = discovered_file # i know, this sucks x = ontology.lower() if "http:" in x or "https:" in x or "ftp:" in x or "file:" in x: onto_url = ontology else: onto_url = "" else: # not found, look into a database (it is last because when bibclassify # runs in a standalone mode, it has no database - [rca, old-heritage] if not bconfig.STANDALONE: result = dbquery.run_sql("SELECT name, location from clsMETHOD WHERE name LIKE %s", ('%'+ontology+'%',)) for onto_short_name, url in result: onto_name = onto_short_name onto_path = _get_remote_ontology(url) onto_url = url return (onto_name, onto_path, onto_url)
def convert_personid():
    """Copy the rows of `aidPERSONID` into the two newer personid tables.

    Rows are handled according to their tag:
      - 'paper': the data field ("<table>:<ref>,<rec>") is split up, the
        normalized name for that reference is looked up, and the row is
        written to `aidPERSONIDPAPERS`;
      - 'gathered_name': the row is dropped;
      - anything else: the row is copied unchanged to `aidPERSONIDDATA`.

    Inserts are batched and flushed whenever a buffer grows past `chunk`
    values.
    """
    from dbquery import run_sql  # oh come on, the whole function will be removed soon
    from itertools import repeat

    # Flush threshold, counted in buffered column values (not rows).
    chunk = 1000

    old_personid = run_sql("SELECT `personid`, `tag`, `data`, `flag`, `lcul` FROM `aidPERSONID`")

    def flush_papers(args):
        """Insert the buffered paper rows (seven values per row)."""
        # Floor division: repeat() needs an integer count.
        run_sql("INSERT INTO `aidPERSONIDPAPERS` "
                "(`personid`, "
                " `bibref_table`, "
                " `bibref_value`, "
                " `bibrec`, "
                " `name`, "
                " `flag`, "
                " `lcul`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s, %s, %s)", len(args) // 7)),
                tuple(args))

    def flush_data(args):
        """Insert the buffered data rows (five values per row)."""
        # Floor division: repeat() needs an integer count.
        run_sql("INSERT INTO `aidPERSONIDDATA` "
                "(`personid`, "
                " `tag`, "
                " `data`, "
                " `opt1`, "
                " `opt2`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s)", len(args) // 5)),
                tuple(args))

    paper_args = []
    data_args = []
    for row in old_personid:
        if row[1] == 'paper':
            bibref, rec = row[2].split(',')
            tab, ref = bibref.split(':')
            try:
                name = get_name_by_bibrecref((int(tab), int(ref), int(rec)))
            except Exception:
                # Best effort: skip papers whose name cannot be resolved.
                # Unlike a bare "except:", this lets KeyboardInterrupt and
                # SystemExit propagate so the conversion can be aborted.
                continue
            name = split_name_parts(name)
            name = create_normalized_name(name)
            paper_args += [row[0], tab, ref, rec, name, row[3], row[4]]
            if len(paper_args) > chunk:
                flush_papers(paper_args)
                paper_args = []

        elif row[1] == 'gathered_name':
            continue

        else:
            data_args += list(row)
            if len(data_args) > chunk:
                flush_data(data_args)
                data_args = []

    # Write whatever is left in the buffers.
    if paper_args:
        flush_papers(paper_args)

    if data_args:
        flush_data(data_args)
def test_extract_using_recid(self):
    """bibclassify - extracting data from database (using recID to find fulltext)"""
    # Without a database there is nothing to check.
    if bconfig.STANDALONE:
        return

    bibtask = bibclassify_daemon.bibtask

    # The check only makes sense when record 94 is present in the database.
    record = dbquery.run_sql("SELECT * FROM bibrec WHERE id=94")
    if len(record) == 0:
        return

    for param, value in (('verbose', 0), ('task_id', 1)):
        bibtask.task_set_task_param(param, value)

    results = bibclassify_daemon._analyze_documents(
        [94], self.taxonomy_name, "XXX", output_limit=100)

    res, msg = check_pdf3(results)
    if res:
        return
    self.fail(msg)
def _get_ontology(ontology): """Returns the (name, path, url) to the short ontology name. @var ontology: name of the ontology or path to the file or url""" onto_name = onto_path = onto_url = None # first assume we got the path to the file if os.access(ontology, os.R_OK): onto_name = os.path.split(os.path.abspath(ontology))[1] onto_path = os.path.abspath(ontology) onto_url = "" else: # if not, try to find it in a known locations discovered_file = _discover_ontology(ontology) if discovered_file: onto_name = os.path.split(discovered_file)[1] onto_path = discovered_file # i know, this sucks x = ontology.lower() if "http:" in x or "https:" in x or "ftp:" in x or "file:" in x: onto_url = ontology else: onto_url = "" else: # not found, look into a database (it is last because when bibclassify # runs in a standalone mode, it has no database - [rca, old-heritage] if not bconfig.STANDALONE: result = dbquery.run_sql( "SELECT name, location from clsMETHOD WHERE name LIKE %s", ('%' + ontology + '%', )) for onto_short_name, url in result: onto_name = onto_short_name onto_path = _get_remote_ontology(url) onto_url = url return (onto_name, onto_path, onto_url)
def _update_date_of_last_run(runtime):
    """Record runtime as the last_updated value in the clsMETHOD table.

    The statement has no WHERE clause, so it applies to every row.

    @param runtime: value written to the last_updated column
    """
    statement = "UPDATE clsMETHOD SET last_updated=%s"
    parameters = (runtime, )
    run_sql(statement, parameters)
def _collection_exists(collection_name):
    """Tell whether a collection with this name is stored in the database.

    @param collection_name: value matched against collection.name
    @return: True when the lookup yields a non-empty result, else False
    """
    matches = run_sql("SELECT name FROM collection WHERE name=%s",
                      (collection_name,))
    return bool(matches)
def _recid_exists(recid):
    """Tell whether a record with this ID is stored in the database.

    @param recid: value matched against bibrec.id
    @return: True when the lookup yields a non-empty result, else False
    """
    matches = run_sql("SELECT id FROM bibrec WHERE id=%s", (recid,))
    return bool(matches)
def _update_date_of_last_run(runtime):
    """Write the time of the latest bibclassify daemon run to clsMETHOD.

    No WHERE clause is used: last_updated is set on all rows of the table.

    @param runtime: value stored in the last_updated column
    """
    query_args = (runtime,)
    run_sql("UPDATE clsMETHOD SET last_updated=%s", query_args)
def _ontology_exists(ontology_name):
    """Tell whether an ontology with this name is stored in the database.

    @param ontology_name: value matched against clsMETHOD.name
    @return: True when the lookup yields a non-empty result, else False
    """
    matches = run_sql("SELECT name FROM clsMETHOD WHERE name=%s",
                      (ontology_name,))
    return bool(matches)
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None): """Returns an array containing hash objects containing the collection, its corresponding ontology and the records belonging to the given collection.""" rec_onts = [] # User specified record IDs. if recids: rec_onts.append({ 'ontology': taxonomy, 'collection': None, 'recIDs': recids, }) return rec_onts # User specified collections. if collections: for collection in collections: records = get_collection_reclist(collection) if records: rec_onts.append({ 'ontology': taxonomy, 'collection': collection, 'recIDs': records }) return rec_onts # Use rules found in collection_clsMETHOD. result = run_sql("SELECT clsMETHOD.name, clsMETHOD.last_updated, " "collection.name FROM clsMETHOD JOIN collection_clsMETHOD ON " "clsMETHOD.id=id_clsMETHOD JOIN collection ON " "id_collection=collection.id") for ontology, date_last_run, collection in result: records = get_collection_reclist(collection) if records: if not date_last_run: bibtask.write_message("INFO: Collection %s has not been previously " "analyzed." % collection, stream=sys.stderr, verbose=3) modified_records = intbitset(run_sql("SELECT id FROM bibrec")) elif bibtask.task_get_option('force'): bibtask.write_message("INFO: Analysis is forced for collection %s." % collection, stream=sys.stderr, verbose=3) modified_records = intbitset(run_sql("SELECT id FROM bibrec")) else: modified_records = intbitset(run_sql("SELECT id FROM bibrec " "WHERE modification_date >= %s", (date_last_run, ))) records &= modified_records if records: rec_onts.append({ 'ontology': ontology, 'collection': collection, 'recIDs': records }) else: bibtask.write_message("WARNING: All records from collection '%s' have " "already been analyzed for keywords with ontology '%s' " "on %s." % (collection, ontology, date_last_run), stream=sys.stderr, verbose=2) else: bibtask.write_message("ERROR: Collection '%s' doesn't contain any record. " "Cannot analyse keywords." 
% (collection,), stream=sys.stderr, verbose=0) return rec_onts
def _ontology_exists(ontology_name):
    """Report whether the ontology name is registered in the database.

    @param ontology_name: name looked up in the clsMETHOD table
    @return: True if the query result is non-empty, False otherwise
    """
    query = "SELECT name FROM clsMETHOD WHERE name=%s"
    return True if run_sql(query, (ontology_name, )) else False
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None): """Returns an array containing hash objects containing the collection, its corresponding ontology and the records belonging to the given collection.""" rec_onts = [] # User specified record IDs. if recids: rec_onts.append({ 'ontology': taxonomy, 'collection': None, 'recIDs': recids, }) return rec_onts # User specified collections. if collections: for collection in collections: records = get_collection_reclist(collection) if records: rec_onts.append({ 'ontology': taxonomy, 'collection': collection, 'recIDs': records }) return rec_onts # Use rules found in collection_clsMETHOD. result = run_sql( "SELECT clsMETHOD.name, clsMETHOD.last_updated, " "collection.name FROM clsMETHOD JOIN collection_clsMETHOD ON " "clsMETHOD.id=id_clsMETHOD JOIN collection ON " "id_collection=collection.id") for ontology, date_last_run, collection in result: records = get_collection_reclist(collection) if records: if not date_last_run: bibtask.write_message( "INFO: Collection %s has not been previously " "analyzed." % collection, stream=sys.stderr, verbose=3) modified_records = intbitset(run_sql("SELECT id FROM bibrec")) elif bibtask.task_get_option('force'): bibtask.write_message( "INFO: Analysis is forced for collection %s." % collection, stream=sys.stderr, verbose=3) modified_records = intbitset(run_sql("SELECT id FROM bibrec")) else: modified_records = intbitset( run_sql( "SELECT id FROM bibrec " "WHERE modification_date >= %s", (date_last_run, ))) records &= modified_records if records: rec_onts.append({ 'ontology': ontology, 'collection': collection, 'recIDs': records }) else: bibtask.write_message( "WARNING: All records from collection '%s' have " "already been analyzed for keywords with ontology '%s' " "on %s." % (collection, ontology, date_last_run), stream=sys.stderr, verbose=2) else: bibtask.write_message( "ERROR: Collection '%s' doesn't contain any record. " "Cannot analyse keywords." 
% (collection, ), stream=sys.stderr, verbose=0) return rec_onts
def _recid_exists(recid):
    """Report whether the recid number is registered in the database.

    @param recid: record ID looked up in the bibrec table
    @return: True if the query result is non-empty, False otherwise
    """
    query = "SELECT id FROM bibrec WHERE id=%s"
    return True if run_sql(query, (recid, )) else False
def _collection_exists(collection_name):
    """Report whether the collection name is registered in the database.

    @param collection_name: name looked up in the collection table
    @return: True if the query result is non-empty, False otherwise
    """
    query = "SELECT name FROM collection WHERE name=%s"
    return True if run_sql(query, (collection_name, )) else False