def _write_data_files_to_db(data_dir_name):
    '''
    Reads all the files of a specified directory and writes the content
    to the memory cache and from there to the database.

    Each subdirectory of data_dir_name is treated as one job; a job is
    loaded only if its log file ends with the success marker and its
    results/ folder contains the full set of expected .dat files.

    @param data_dir_name: Directory where to look for the files
    @type data_dir_name: string

    @return: False on invalid parameters, None otherwise.
    '''
    if data_dir_name.endswith("/"):
        data_dir_name = data_dir_name[0:-1]

    if not data_dir_name:
        bibtask.write_message("Data directory not specified. Task failed.",
                              stream=sys.stdout, verbose=0)
        return False

    if not osp.isdir(data_dir_name):
        bibtask.write_message("Specified Data directory is not a directory. "
                              "Task failed.",
                              stream=sys.stdout, verbose=0)
        return False

    # Files every valid result set must contain. Loop-invariant, so it is
    # built once here instead of once per job directory.
    correct_files = set(['realauthors.dat',
                         'ids.dat',
                         'virtual_author_clusters.dat',
                         'virtual_authors.dat',
                         'doclist.dat',
                         'virtual_author_data.dat',
                         'authornames.dat',
                         'virtual_author_cluster_cache.dat',
                         'realauthor_data.dat',
                         'ra_va_cache.dat']
                        )

    job_dirs = os.listdir(data_dir_name)

    total = len(job_dirs)
    status = 0

    for job_dir in job_dirs:
        status += 1
        job_dir = "%s/%s" % (data_dir_name, job_dir)

        if not osp.isdir(job_dir):
            bibtask.write_message("This is not a directory and therefore "
                                  "skipped: %s." % job_dir,
                              stream=sys.stdout, verbose=0)
            continue

        results_dir = "%s/results/" % (job_dir,)

        if not osp.isdir(results_dir):
            bibtask.write_message("No result set found in %s"
                                  % (results_dir,), stream=sys.stdout,
                                  verbose=0)
            continue

        # The job's log file is named after the job directory itself.
        log_name = osp.abspath(job_dir).split("/")
        logfile = "%s/%s.log" % (job_dir, log_name[-1])
        logfile_lastline = ""

        if not osp.isfile(logfile):
            bibtask.write_message("No log file found in %s" % (job_dir,),
                                  stream=sys.stdout, verbose=0)
            continue

        try:
            logfile_lastline = tail(logfile)
        except IOError:
            # An unreadable log is treated the same as a missing success
            # marker: the job is skipped below.
            logfile_lastline = ""

        if logfile_lastline.count("Finish! The computation finished in") < 1:
            bibtask.write_message("Log file indicates broken results for %s"
                                  % (job_dir,), stream=sys.stdout, verbose=0)
            continue

        result_files = os.listdir(results_dir)

        if not correct_files.issubset(set(result_files)):
            # Message typo fixed: "Reults" -> "Results".
            bibtask.write_message("Results folder does not hold the "
                                  "correct files: %s" % (results_dir,),
                                  stream=sys.stdout, verbose=0)
            continue

        bibtask.task_update_progress('Loading job %s of %s: %s'
                                     % (status, total, log_name[-1]))

        if (populate_structs_from_files(results_dir, results=True) and
            write_mem_cache_to_tables(sanity_checks=True)):
            bibtask.write_message("All Done.",
                                  stream=sys.stdout, verbose=0)
        else:
            bibtask.write_message("Could not write data to the tables from %s"
                                  % (results_dir,),
                                  stream=sys.stdout, verbose=0)
def main():
    """Main function.

    Dispatches either to the daemon (default) or, when any standalone
    option (-S/--standalone/-j/--job-dir) is present on the command line,
    runs a single standalone computation on the given job directory.
    """
    arguments = sys.argv

    if len(arguments) <= 1:
        bconfig.LOGGER.error("Please provide parameters!")
        _display_help()

    run_daemon = True
    standalone_option = ("-S", "--standalone", "-j", "--job-dir")

    # Any standalone-related option switches off daemon mode.
    for option in standalone_option:
        for arg in arguments:
            if arg.startswith(option):
                run_daemon = False

    if run_daemon:
        daemon = None
        try:
            import bibauthorid_daemon as daemon
        except ImportError:
            bconfig.LOGGER.error("Hmm...No Daemon process running.")

        if daemon:
            daemon.bibauthorid_daemon()
    else:
        options = _read_options(arguments)

        if options["job_dir"]:
            job_dir = options["job_dir"]

            if job_dir.endswith("/"):
                job_dir = job_dir[0:-1]

            # The log file is named after the job directory itself.
            log_name = osp.abspath(job_dir).split("/")
            logfile = "%s/%s.log" % (job_dir, log_name[-1])

            start = time.time()

            bconfig.init_logger(logfile)
            populate_structs_from_files(job_dir)

            bconfig.LOGGER.debug("| Loaded %s records."
                                 % len(dat.RELEVANT_RECORDS))

            engine.start_computation(process_doclist=True,
                             process_orphans=True,
                             print_stats=True)

            result_path = "%s/results/" % (job_dir,)

            if make_directory(result_path):
                write_mem_cache_to_files(result_path, is_result=True)
            else:
                bconfig.LOGGER.error("Cannot write to destination: "
                                     "Cannot create directory")

            end = time.time() - start

            # This exact message doubles as the success marker checked by
            # _write_data_files_to_db when loading results later.
            bconfig.LOGGER.log(25, "Finish! The computation finished in %.2fs"
                               % (end))
            bconfig.stop_and_close_logger()
        else:
            # Fixed missing space between "Please" and "consult" in the
            # concatenated message.
            bconfig.LOGGER.error("Standalone mode without parameters "
                                 "does not do anything helpful. Please "
                                 "consult -h help message for usage")
def _write_data_files_to_db(data_dir_name):
    '''
    Reads all the files of a specified directory and writes the content
    to the memory cache and from there to the database.

    Each subdirectory of data_dir_name is treated as one job; a job is
    loaded only if its log file ends with the success marker and its
    results/ folder contains the full set of expected .dat files.

    @param data_dir_name: Directory where to look for the files
    @type data_dir_name: string

    @return: False on invalid parameters, None otherwise.
    '''
    if data_dir_name.endswith("/"):
        data_dir_name = data_dir_name[0:-1]

    if not data_dir_name:
        bibtask.write_message("Data directory not specified. Task failed.",
                              stream=sys.stdout,
                              verbose=0)
        return False

    if not osp.isdir(data_dir_name):
        bibtask.write_message(
            "Specified Data directory is not a directory. "
            "Task failed.",
            stream=sys.stdout,
            verbose=0)
        return False

    # Files every valid result set must contain. Loop-invariant, so it is
    # built once here instead of once per job directory.
    correct_files = set([
        'realauthors.dat', 'ids.dat', 'virtual_author_clusters.dat',
        'virtual_authors.dat', 'doclist.dat', 'virtual_author_data.dat',
        'authornames.dat', 'virtual_author_cluster_cache.dat',
        'realauthor_data.dat', 'ra_va_cache.dat'
    ])

    job_dirs = os.listdir(data_dir_name)

    total = len(job_dirs)
    status = 0

    for job_dir in job_dirs:
        status += 1
        job_dir = "%s/%s" % (data_dir_name, job_dir)

        if not osp.isdir(job_dir):
            bibtask.write_message("This is not a directory and therefore "
                                  "skipped: %s." % job_dir,
                                  stream=sys.stdout,
                                  verbose=0)
            continue

        results_dir = "%s/results/" % (job_dir, )

        if not osp.isdir(results_dir):
            bibtask.write_message("No result set found in %s" %
                                  (results_dir, ),
                                  stream=sys.stdout,
                                  verbose=0)
            continue

        # The job's log file is named after the job directory itself.
        log_name = osp.abspath(job_dir).split("/")
        logfile = "%s/%s.log" % (job_dir, log_name[-1])
        logfile_lastline = ""

        if not osp.isfile(logfile):
            bibtask.write_message("No log file found in %s" % (job_dir, ),
                                  stream=sys.stdout,
                                  verbose=0)
            continue

        try:
            logfile_lastline = tail(logfile)
        except IOError:
            # An unreadable log is treated the same as a missing success
            # marker: the job is skipped below.
            logfile_lastline = ""

        if logfile_lastline.count("Finish! The computation finished in") < 1:
            bibtask.write_message("Log file indicates broken results for %s" %
                                  (job_dir, ),
                                  stream=sys.stdout,
                                  verbose=0)
            continue

        result_files = os.listdir(results_dir)

        if not correct_files.issubset(set(result_files)):
            # Message typo fixed: "Reults" -> "Results".
            bibtask.write_message("Results folder does not hold the "
                                  "correct files: %s" % (results_dir, ),
                                  stream=sys.stdout,
                                  verbose=0)
            continue

        bibtask.task_update_progress('Loading job %s of %s: %s' %
                                     (status, total, log_name[-1]))

        if (populate_structs_from_files(results_dir, results=True)
                and write_mem_cache_to_tables(sanity_checks=True)):
            bibtask.write_message("All Done.", stream=sys.stdout, verbose=0)
        else:
            bibtask.write_message(
                "Could not write data to the tables from %s" % (results_dir, ),
                stream=sys.stdout,
                verbose=0)
# Example #4 (stray scraper artifact: "示例#4" = "Example #4"; the "0" was a vote count)
def main():
    """Main function.

    Dispatches either to the daemon (default) or, when any standalone
    option (-S/--standalone/-j/--job-dir) is present on the command line,
    runs a single standalone computation on the given job directory.
    """
    arguments = sys.argv

    if len(arguments) <= 1:
        bconfig.LOGGER.error("Please provide parameters!")
        _display_help()

    run_daemon = True
    standalone_option = ("-S", "--standalone", "-j", "--job-dir")

    # Any standalone-related option switches off daemon mode.
    for option in standalone_option:
        for arg in arguments:
            if arg.startswith(option):
                run_daemon = False

    if run_daemon:
        daemon = None
        try:
            import bibauthorid_daemon as daemon
        except ImportError:
            bconfig.LOGGER.error("Hmm...No Daemon process running.")

        if daemon:
            daemon.bibauthorid_daemon()
    else:
        options = _read_options(arguments)

        if options["job_dir"]:
            job_dir = options["job_dir"]

            if job_dir.endswith("/"):
                job_dir = job_dir[0:-1]

            # The log file is named after the job directory itself.
            log_name = osp.abspath(job_dir).split("/")
            logfile = "%s/%s.log" % (job_dir, log_name[-1])

            start = time.time()

            bconfig.init_logger(logfile)
            populate_structs_from_files(job_dir)

            bconfig.LOGGER.debug("| Loaded %s records." %
                                 len(dat.RELEVANT_RECORDS))

            engine.start_computation(process_doclist=True,
                                     process_orphans=True,
                                     print_stats=True)

            result_path = "%s/results/" % (job_dir, )

            if make_directory(result_path):
                write_mem_cache_to_files(result_path, is_result=True)
            else:
                bconfig.LOGGER.error("Cannot write to destination: "
                                     "Cannot create directory")

            end = time.time() - start

            # This exact message doubles as the success marker checked by
            # _write_data_files_to_db when loading results later.
            bconfig.LOGGER.log(
                25, "Finish! The computation finished in %.2fs" % (end))
            bconfig.stop_and_close_logger()
        else:
            # Fixed missing space between "Please" and "consult" in the
            # concatenated message.
            bconfig.LOGGER.error("Standalone mode without parameters "
                                 "does not do anything helpful. Please "
                                 "consult -h help message for usage")