def flush_data(args):
    """Bulk-insert rows into `aidPERSONIDDATA` with one INSERT statement.

    @param args: flat sequence of column values, five per row, in the order
        (personid, tag, data, opt1, opt2)
    """
    # Nothing to insert: an INSERT with an empty VALUES list is not valid SQL.
    if not args:
        return

    # Floor division keeps the row count an integer. True division (Python 3,
    # or `from __future__ import division`) yields a float, which repeat()
    # does not accept as a repetition count.
    row_count = len(args) // 5
    placeholders = " , ".join(repeat("(%s, %s, %s, %s, %s)", row_count))

    run_sql("INSERT INTO `aidPERSONIDDATA` "
            "(`personid`, "
            " `tag`, "
            " `data`, "
            " `opt1`, "
            " `opt2`) "
            "VALUES " + placeholders,
            tuple(args))
Exemplo n.º 2
0
 def flush_data(args):
     """Insert all rows held in `args` into `aidPERSONIDDATA` at once.

     @param args: flat sequence of values, five consecutive items per row:
         personid, tag, data, opt1, opt2
     """
     if not args:
         # An empty VALUES list would be a SQL syntax error; skip the query.
         return

     # `//` guarantees an integer row count; `/` gives a float under
     # Python 3 and the placeholder list could not be built from it.
     rows = len(args) // 5
     values_clause = " , ".join(["(%s, %s, %s, %s, %s)"] * rows)

     run_sql("INSERT INTO `aidPERSONIDDATA` "
             "(`personid`, "
             " `tag`, "
             " `data`, "
             " `opt1`, "
             " `opt2`) "
             "VALUES " + values_clause,
             tuple(args))
 def flush_papers(args):
     """Bulk-insert rows into `aidPERSONIDPAPERS` with one INSERT statement.

     @param args: flat sequence of column values, seven per row, in the order
         (personid, bibref_table, bibref_value, bibrec, name, flag, lcul)
     """
     # Nothing to insert: an INSERT with an empty VALUES list is not valid SQL.
     if not args:
         return

     # Floor division keeps the row count an integer. True division (Python 3,
     # or `from __future__ import division`) yields a float, which repeat()
     # does not accept as a repetition count.
     row_count = len(args) // 7
     placeholders = " , ".join(
         repeat("(%s, %s, %s, %s, %s, %s, %s)", row_count))

     run_sql("INSERT INTO `aidPERSONIDPAPERS` "
             "(`personid`, "
             " `bibref_table`, "
             " `bibref_value`, "
             " `bibrec`, "
             " `name`, "
             " `flag`, "
             " `lcul`) "
             "VALUES " + placeholders,
             tuple(args))
Exemplo n.º 4
0
 def flush_papers(args):
     """Insert all rows held in `args` into `aidPERSONIDPAPERS` at once.

     @param args: flat sequence of values, seven consecutive items per row:
         personid, bibref_table, bibref_value, bibrec, name, flag, lcul
     """
     if not args:
         # An empty VALUES list would be a SQL syntax error; skip the query.
         return

     # `//` guarantees an integer row count; `/` gives a float under
     # Python 3 and the placeholder list could not be built from it.
     rows = len(args) // 7
     values_clause = " , ".join(["(%s, %s, %s, %s, %s, %s, %s)"] * rows)

     run_sql("INSERT INTO `aidPERSONIDPAPERS` "
             "(`personid`, "
             " `bibref_table`, "
             " `bibref_value`, "
             " `bibrec`, "
             " `name`, "
             " `flag`, "
             " `lcul`) "
             "VALUES " + values_clause,
             tuple(args))
def _get_ontology(ontology):
    """Resolve an ontology reference into a (name, path, url) tuple.

    Three lookups are tried in order: a readable file at the given path, a
    file found by _discover_ontology(), and finally (unless running in
    standalone mode) the clsMETHOD database table.

    @var ontology: name of the ontology or path to the file or url
    @return: tuple (name, path, url); all three are None when nothing matched
    """
    # A readable local file wins; there is no url in that case.
    if os.access(ontology, os.R_OK):
        full_path = os.path.abspath(ontology)
        return (os.path.basename(full_path), full_path, "")

    # Otherwise look for it in the known locations.
    located = _discover_ontology(ontology)
    if located:
        # Crude url detection: one of these scheme markers anywhere in the
        # (lower-cased) input is enough.
        lowered = ontology.lower()
        remote_url = ""
        if ("http:" in lowered or "https:" in lowered
                or "ftp:" in lowered or "file:" in lowered):
            remote_url = ontology
        return (os.path.basename(located), located, remote_url)

    # Last resort is the database. It comes last because bibclassify has no
    # database when it runs in standalone mode.
    found_name = found_path = found_url = None
    if not bconfig.STANDALONE:
        rows = dbquery.run_sql("SELECT name, location from clsMETHOD WHERE name LIKE %s", ('%' + ontology + '%',))
        # Every matching row is fetched; the values of the last one are kept.
        for short_name, location in rows:
            found_name = short_name
            found_path = _get_remote_ontology(location)
            found_url = location

    return (found_name, found_path, found_url)
Exemplo n.º 6
0
def convert_personid():
    """Copy the rows of `aidPERSONID` into the two newer tables.

    Rows tagged 'paper' have their `data` column parsed as
    "<table>:<ref>,<rec>" and are written to `aidPERSONIDPAPERS` together
    with a normalized author name. Rows tagged 'gathered_name' are dropped.
    Every other row is copied unchanged into `aidPERSONIDDATA`
    (flag -> opt1, lcul -> opt2). Inserts are batched.
    """
    from dbquery import run_sql # oh come on, the whole function will be removed soon
    from itertools import repeat
    # Flush threshold, counted in individual SQL arguments (not rows).
    chunk = 1000

    old_personid = run_sql("SELECT `personid`, `tag`, `data`, `flag`, `lcul` FROM `aidPERSONID`")

    def flush_papers(args):
        # Floor division: repeat() needs an integer count, and `/` returns a
        # float under Python 3 / `from __future__ import division`.
        run_sql("INSERT INTO `aidPERSONIDPAPERS` "
                "(`personid`, "
                " `bibref_table`, "
                " `bibref_value`, "
                " `bibrec`, "
                " `name`, "
                " `flag`, "
                " `lcul`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s, %s, %s)", len(args) // 7))
                , tuple(args))

    def flush_data(args):
        # Same integer-count requirement as in flush_papers.
        run_sql("INSERT INTO `aidPERSONIDDATA` "
                "(`personid`, "
                " `tag`, "
                " `data`, "
                " `opt1`, "
                " `opt2`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s)", len(args) // 5))
               , tuple(args))

    paper_args = []
    data_args = []
    for row in old_personid:
        tag = row[1]
        if tag == 'paper':
            bibref, rec = row[2].split(',')
            tab, ref = bibref.split(':')
            try:
                name = get_name_by_bibrecref((int(tab), int(ref), int(rec)))
            except Exception:
                # Best effort: a paper whose name cannot be resolved is
                # skipped. Only Exception is caught so that Ctrl-C and
                # SystemExit still stop the conversion.
                continue
            name = split_name_parts(name)
            name = create_normalized_name(name)
            paper_args.extend([row[0], tab, ref, rec, name, row[3], row[4]])
            if len(paper_args) > chunk:
                flush_papers(paper_args)
                paper_args = []

        elif tag == 'gathered_name':
            continue
        else:
            data_args.extend(row)
            if len(data_args) > chunk:
                flush_data(data_args)
                data_args = []

    # Write whatever is left over from the last partial batch.
    if paper_args:
        flush_papers(paper_args)

    if data_args:
        flush_data(data_args)
def convert_personid():
    """Migrate `aidPERSONID` into `aidPERSONIDPAPERS` and `aidPERSONIDDATA`.

    - 'paper' rows: the `data` value is split as "<table>:<ref>,<rec>", a
      normalized name is looked up, and the row goes to `aidPERSONIDPAPERS`.
    - 'gathered_name' rows: ignored.
    - anything else: copied as-is into `aidPERSONIDDATA`, with `flag` and
      `lcul` landing in `opt1` and `opt2`.

    Rows are accumulated and written in batches.
    """
    from dbquery import run_sql # oh come on, the whole function will be removed soon
    from itertools import repeat
    # Batch threshold; it is compared against the number of SQL arguments
    # collected so far, not the number of rows.
    chunk = 1000

    old_personid = run_sql("SELECT `personid`, `tag`, `data`, `flag`, `lcul` FROM `aidPERSONID`")

    def flush_papers(args):
        # `//` instead of `/`: the repetition count handed to repeat() must
        # be an integer, which true division does not give on Python 3.
        run_sql("INSERT INTO `aidPERSONIDPAPERS` "
                "(`personid`, "
                " `bibref_table`, "
                " `bibref_value`, "
                " `bibrec`, "
                " `name`, "
                " `flag`, "
                " `lcul`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s, %s, %s)", len(args) // 7))
                , tuple(args))

    def flush_data(args):
        # `//` for the same reason as in flush_papers.
        run_sql("INSERT INTO `aidPERSONIDDATA` "
                "(`personid`, "
                " `tag`, "
                " `data`, "
                " `opt1`, "
                " `opt2`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s)", len(args) // 5))
               , tuple(args))

    paper_args = []
    data_args = []
    for row in old_personid:
        if row[1] == 'paper':
            bibref, rec = row[2].split(',')
            tab, ref = bibref.split(':')
            try:
                name = get_name_by_bibrecref((int(tab), int(ref), int(rec)))
            except Exception:
                # Unresolvable papers are skipped on purpose. A bare
                # `except:` would also swallow KeyboardInterrupt and
                # SystemExit, so only Exception is caught.
                continue
            name = create_normalized_name(split_name_parts(name))
            paper_args += [row[0], tab, ref, rec, name, row[3], row[4]]
            if len(paper_args) > chunk:
                flush_papers(paper_args)
                paper_args = []

        elif row[1] == 'gathered_name':
            continue
        else:
            data_args += list(row)
            if len(data_args) > chunk:
                flush_data(data_args)
                data_args = []

    # Flush the final, possibly partial, batches.
    if paper_args:
        flush_papers(paper_args)

    if data_args:
        flush_data(data_args)
Exemplo n.º 8
0
    def test_extract_using_recid(self):
        """bibclassify  - extracting data from database (using recID to find fulltext)"""
        # There is no database to query in standalone mode.
        if bconfig.STANDALONE:
            return

        bibtask = bibclassify_daemon.bibtask
        # First test if the record exists in the database.
        record = dbquery.run_sql("SELECT * FROM bibrec WHERE id=94")
        if not record:
            # Without the record no analysis runs, so there is nothing to
            # verify. Checking anyway would hit an unbound `results`.
            return

        bibtask.task_set_task_param('verbose', 0)
        bibtask.task_set_task_param('task_id', 1)

        results = bibclassify_daemon._analyze_documents([94], self.taxonomy_name, "XXX", output_limit=100)

        res, msg = check_pdf3(results)
        if not res:
            self.fail(msg)
def _get_ontology(ontology):
    """Return the (name, path, url) triple for an ontology reference.

    The reference is tried as a readable file path first, then handed to
    _discover_ontology(), and only then looked up in the clsMETHOD table
    (skipped in standalone mode).

    @var ontology: name of the ontology or path to the file or url
    @return: tuple (name, path, url), or (None, None, None) if no lookup
        produced a result
    """
    # 1) The argument itself is a readable file.
    if os.access(ontology, os.R_OK):
        absolute = os.path.abspath(ontology)
        return (os.path.basename(absolute), absolute, "")

    # 2) A file in one of the known locations.
    candidate = _discover_ontology(ontology)
    if candidate:
        needle = ontology.lower()
        # Treat the input as a url if it mentions a known scheme anywhere.
        is_remote = ("http:" in needle or "https:" in needle
                     or "ftp:" in needle or "file:" in needle)
        if is_remote:
            return (os.path.basename(candidate), candidate, ontology)
        return (os.path.basename(candidate), candidate, "")

    # 3) The database, which does not exist when bibclassify runs standalone.
    if bconfig.STANDALONE:
        return (None, None, None)

    answer = (None, None, None)
    matches = dbquery.run_sql(
        "SELECT name, location from clsMETHOD WHERE name LIKE %s",
        ('%' + ontology + '%', ))
    # All matches are processed in turn; the last one determines the result.
    for method_name, method_location in matches:
        answer = (method_name,
                  _get_remote_ontology(method_location),
                  method_location)
    return answer
Exemplo n.º 10
0
def _update_date_of_last_run(runtime):
    """Store `runtime` as the last-updated value in the clsMETHOD table.

    The statement has no WHERE clause, so every row of clsMETHOD is updated.

    @param runtime: value written to the `last_updated` column
    """
    statement = "UPDATE clsMETHOD SET last_updated=%s"
    run_sql(statement, (runtime,))
Exemplo n.º 11
0
def _collection_exists(collection_name):
    """Tell whether a collection with this name is in the database.

    @param collection_name: value matched against `collection.name`
    @return: True if the query returns a non-empty result, False otherwise
    """
    return bool(run_sql("SELECT name FROM collection WHERE name=%s",
                        (collection_name,)))
Exemplo n.º 12
0
def _recid_exists(recid):
    """Tell whether a record with this id is in the database.

    @param recid: value matched against `bibrec.id`
    @return: True if the query returns a non-empty result, False otherwise
    """
    return bool(run_sql("SELECT id FROM bibrec WHERE id=%s", (recid,)))
Exemplo n.º 13
0
def _update_date_of_last_run(runtime):
    """Write the last run time into the bibclassify clsMETHOD table.

    No WHERE clause is used: the value is set on all clsMETHOD rows.

    @param runtime: value to store in `last_updated`
    """
    params = (runtime,)
    run_sql("UPDATE clsMETHOD SET last_updated=%s", params)
Exemplo n.º 14
0
def _ontology_exists(ontology_name):
    """Tell whether an ontology with this name is in the database.

    @param ontology_name: value matched against `clsMETHOD.name`
    @return: True if the query returns a non-empty result, False otherwise
    """
    return bool(run_sql("SELECT name FROM clsMETHOD WHERE name=%s",
                        (ontology_name,)))
Exemplo n.º 15
0
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None):
    """Build the list of (ontology, collection, records) work items.

    Each item is a dict with the keys 'ontology', 'collection' and 'recIDs'.
    The source of the items depends on the arguments:

    - `recids` given: one item holding exactly those records and `taxonomy`.
    - `collections` given: one item per non-empty collection, with `taxonomy`.
    - neither: the collection/ontology pairs stored in the database, limited
      to records modified since the ontology's last run (or all records if
      it never ran or the 'force' task option is set).

    @return: list of dicts as described above
    """
    # User specified record IDs.
    if recids:
        return [{
            'ontology': taxonomy,
            'collection': None,
            'recIDs': recids,
        }]

    # User specified collections.
    if collections:
        chosen = []
        for coll_name in collections:
            coll_records = get_collection_reclist(coll_name)
            if coll_records:
                chosen.append({
                    'ontology': taxonomy,
                    'collection': coll_name,
                    'recIDs': coll_records
                })
        return chosen

    # Use rules found in collection_clsMETHOD.
    rec_onts = []
    rules = run_sql("SELECT clsMETHOD.name, clsMETHOD.last_updated, "
                    "collection.name FROM clsMETHOD JOIN collection_clsMETHOD ON "
                    "clsMETHOD.id=id_clsMETHOD JOIN collection ON "
                    "id_collection=collection.id")

    for ontology, date_last_run, collection in rules:
        records = get_collection_reclist(collection)
        if not records:
            bibtask.write_message(
                "ERROR: Collection '%s' doesn't contain any record. "
                "Cannot analyse keywords." % (collection,),
                stream=sys.stderr, verbose=0)
            continue

        # Decide which records count as candidates for this run.
        if not date_last_run:
            bibtask.write_message(
                "INFO: Collection %s has not been previously "
                "analyzed." % collection, stream=sys.stderr, verbose=3)
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        elif bibtask.task_get_option('force'):
            bibtask.write_message(
                "INFO: Analysis is forced for collection %s." % collection,
                stream=sys.stderr, verbose=3)
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        else:
            modified_records = intbitset(run_sql(
                "SELECT id FROM bibrec "
                "WHERE modification_date >= %s", (date_last_run, )))

        # NOTE(review): augmented assignment may modify the object returned
        # by get_collection_reclist() in place - confirm it is not shared.
        records &= modified_records
        if not records:
            bibtask.write_message(
                "WARNING: All records from collection '%s' have "
                "already been analyzed for keywords with ontology '%s' "
                "on %s." % (collection, ontology, date_last_run),
                stream=sys.stderr, verbose=2)
            continue

        rec_onts.append({
            'ontology': ontology,
            'collection': collection,
            'recIDs': records
        })

    return rec_onts
Exemplo n.º 16
0
def _ontology_exists(ontology_name):
    """Report whether `ontology_name` matches a name in clsMETHOD.

    @param ontology_name: ontology name to look up
    @return: True when the lookup yields a non-empty result, else False
    """
    matches = run_sql("SELECT name FROM clsMETHOD WHERE name=%s", (ontology_name, ))
    return bool(matches)
Exemplo n.º 17
0
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None):
    """Return a list of dicts pairing an ontology with the records to analyze.

    Every dict has the keys 'ontology', 'collection' and 'recIDs'.

    - With `recids`, a single entry for those records is returned
      ('collection' is None, 'ontology' is `taxonomy`).
    - With `collections`, each collection that has records yields an entry
      using `taxonomy`.
    - Otherwise the collection/ontology rules are read from the database and
      each collection's records are intersected with the records modified
      since the ontology's `last_updated` date; all records are considered
      when that date is empty or the 'force' task option is set.
    """
    if recids:
        # User specified record IDs.
        explicit_entry = {
            'ontology': taxonomy,
            'collection': None,
            'recIDs': recids,
        }
        return [explicit_entry]

    if collections:
        # User specified collections; empty ones are left out.
        per_collection = []
        for name in collections:
            members = get_collection_reclist(name)
            if not members:
                continue
            per_collection.append({
                'ontology': taxonomy,
                'collection': name,
                'recIDs': members
            })
        return per_collection

    # Use rules found in collection_clsMETHOD.
    rule_query = (
        "SELECT clsMETHOD.name, clsMETHOD.last_updated, "
        "collection.name FROM clsMETHOD JOIN collection_clsMETHOD ON "
        "clsMETHOD.id=id_clsMETHOD JOIN collection ON "
        "id_collection=collection.id")

    rec_onts = []
    for ontology, date_last_run, collection in run_sql(rule_query):
        records = get_collection_reclist(collection)
        if not records:
            empty_msg = ("ERROR: Collection '%s' doesn't contain any record. "
                         "Cannot analyse keywords." % (collection, ))
            bibtask.write_message(empty_msg, stream=sys.stderr, verbose=0)
            continue

        if not date_last_run:
            # Never analyzed before: every record is a candidate.
            never_msg = ("INFO: Collection %s has not been previously "
                         "analyzed." % collection)
            bibtask.write_message(never_msg, stream=sys.stderr, verbose=3)
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        elif bibtask.task_get_option('force'):
            # Forced run: every record is a candidate as well.
            forced_msg = "INFO: Analysis is forced for collection %s." % collection
            bibtask.write_message(forced_msg, stream=sys.stderr, verbose=3)
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        else:
            # Regular run: only records touched since the last run.
            recent_rows = run_sql(
                "SELECT id FROM bibrec "
                "WHERE modification_date >= %s", (date_last_run, ))
            modified_records = intbitset(recent_rows)

        # NOTE(review): `&=` may update the set returned by
        # get_collection_reclist() in place - verify that this is intended.
        records &= modified_records
        if records:
            rec_onts.append({
                'ontology': ontology,
                'collection': collection,
                'recIDs': records
            })
        else:
            done_msg = ("WARNING: All records from collection '%s' have "
                        "already been analyzed for keywords with ontology '%s' "
                        "on %s." % (collection, ontology, date_last_run))
            bibtask.write_message(done_msg, stream=sys.stderr, verbose=2)

    return rec_onts
Exemplo n.º 18
0
def _recid_exists(recid):
    """Report whether `recid` matches an id in the bibrec table.

    @param recid: record id to look up
    @return: True when the lookup yields a non-empty result, else False
    """
    matches = run_sql("SELECT id FROM bibrec WHERE id=%s", (recid, ))
    return bool(matches)
Exemplo n.º 19
0
def _collection_exists(collection_name):
    """Report whether `collection_name` matches a name in the collection table.

    @param collection_name: collection name to look up
    @return: True when the lookup yields a non-empty result, else False
    """
    matches = run_sql("SELECT name FROM collection WHERE name=%s",
                      (collection_name, ))
    return bool(matches)