Пример #1
0
def analyse_one_page_generic( env, pos, file_str, logger, page_to_dict, add_to_index ):
    """
    Convert one page file into a metadata dict and pass it to the indexer.

    The file is read as UTF-8, converted with ``page_to_dict``, optionally
    filtered by the ``index_only_fields`` / ``not_index_only_fields``
    settings and then handed to ``add_to_index``. When the page was added and
    ``env["indexer"]["continue"]`` is set, ``file_str`` is appended to that
    file.

    The page is skipped (nothing is indexed) when ``pos > env.count``, when
    ``shared_variable[0] > 0``, when the converted dict has no ``id`` key or
    when it contains no ``math`` entries.

    :param env: settings object; read through item access, ``get_value`` and
        the ``count`` / ``verbose`` attributes.
    :param pos: position of this page in the run.
    :param file_str: path of the page file to process.
    :param logger: logger used for progress and error messages.
    :param page_to_dict: callable ``(html, env, pos, file_str)`` returning the
        metadata dict of the page.
    :param add_to_index: callable ``(env, meta_dict, file_str, logger)``; a
        truthy result means the page was added.
    :return: always ``None``.
    :raises SystemExit: logged, written to ``env["exception_file"]`` and
        re-raised.
    """
    global shared_variable
    try:
        all_start = time.time()
        if pos > env.count or shared_variable[0] > 0:
            return

        # Pause while either the generic or the dataset specific wait file
        # exists; polling only the generic one would skip the pause whenever
        # just the dataset specific file is present.
        wait_file = env["parallel"]["wait_file"]
        dataset_wait_file = wait_file + "_" + env.get_value("dataset", "")
        if os.path.exists(wait_file) or os.path.exists(dataset_wait_file):
            logger.warning(u"Waiting because wait file is here...")
            while os.path.exists(wait_file) or os.path.exists(dataset_wait_file):
                time.sleep(env["parallel"]["wait_sleep"])

        logger.warning(u"Processing [%s][%s]", pos, file_str)

        # read the page source
        with codecs.open(file_str, mode="r", encoding="utf-8") as fin:
            html = fin.read()

        if env.verbose:
            logger.warning(u"Adding [%s] [%s]", pos, file_str)
        s = time.time()
        meta_dict = page_to_dict(html, env, pos, file_str)
        _logger_suspicious.debug(u"page_to_dict lasted [%s]", time.time() - s)
        # the raw page is not needed any more
        del html

        if u"id" not in meta_dict:
            logger.error(u"Invalid page: could not find element [id]... %s", file_str)
            return

        if env.verbose > 0:
            # log every field except the (large) text ones
            logger.info(u"".join(
                u"%s: %s\n" % (k, v)
                for k, v in meta_dict.items()
                if not k.startswith(u"text")))

        formulas = meta_dict.get("math")
        if not formulas:
            # a missing title must not turn this warning into a KeyError
            logger.warning(u"No math inside this page! [%s][%s]",
                           meta_dict["id"], meta_dict.get("title"))
            return
        num_formulas = len(formulas)

        # index only
        if len(env.get_value("index_only_fields", [])) > 0:
            only_fields = env["index_only_fields"]
            for k in list(meta_dict.keys()):
                if k not in only_fields:
                    meta_dict[k] = None
        # index except only
        if len(env.get_value("not_index_only_fields", [])) > 0:
            for k in env["not_index_only_fields"]:
                meta_dict[k] = None

        # index it
        s = time.time()
        added = add_to_index(env, meta_dict, file_str, logger)
        _logger_suspicious.debug(u"Adding lasted [%s]", time.time() - s)
        del meta_dict

        if added:
            # remember the processed file in the "continue" file, if configured
            fname = env["indexer"].get("continue", False)
            if fname:
                s = time.time()
                with codecs.open(fname, encoding="utf-8", mode="a+") as fout:
                    fout.write(u"%s\n" % file_str)
                _logger_suspicious.debug(u"Continue writing lasted [%s]", time.time() - s)

        took = time.time() - all_start
        logger.warning(u"Processed [%s][formulas:%s][%s][lasted:%s]",
                       pos, num_formulas, os.path.basename(file_str), took)
        return

    except SystemExit as e:
        logger.exception(u"Thread SystemExit exception in %s:", file_str)
        utils.log_stacktrace_to_file(env["exception_file"], e)
        raise
Пример #2
0
        # index it
        s = time.time()
        added = add_to_index(env, meta_dict, file_str, logger)
        _logger_suspicious.debug( u"Adding lasted [%s]", time.time() - s )
        del meta_dict

        if added and env["indexer"].get( "continue", False ):
            fname = env["indexer"]["continue"]
            s = time.time()
            with codecs.open( fname, encoding="utf-8", mode="a+" ) as fout:
                fout.write( u"%s\n" % file_str )
            _logger_suspicious.debug( u"Continue writing lasted [%s]", time.time() - s )

        took = time.time() - all_start
        logger.warning( u"Processed [%s][formulas:%s][%s][lasted:%s]",
                        pos, num_formulas, os.path.basename(file_str), took )
        return

    except SystemExit, e:
        logger.exception(u"Thread SystemExit exception in %s:" % file_str)
        utils.log_stacktrace_to_file(env["exception_file"], e)
        raise
    except KeyboardInterrupt:
        logger.critical(u"Got keyboard exception - exiting!")
        shared_variable[0] = 1
    except Exception, e:
        logger.exception(u"Thread exception in %s:" % file_str)
        utils.log_stacktrace_to_file(env["exception_file"], e)
        if env.debug:
            raise
Пример #3
0
            return lambda x: _print_with_options(dir(module))

    # what to do but really?
    return what_to_do


#================
# main
#================

if __name__ == "__main__":
    # Script entry point: check the system, build the environment from the
    # settings, then run the action selected on the command line.
    check_system()

    # start timestamp; must stay untouched so that the duration reported in
    # `finally` is also correct when an exception was handled
    lasted = time.time()
    env = utils.environment()
    env.update(settings)

    try:
        what_to_do = parse_command_line(env)
        what_to_do(env)
    except SystemExit:
        raise
    except Exception as e:
        logger.exception("An exception occurred, ouch:\n%s", str(e))
        utils.log_stacktrace_to_file(settings["exception_file"], e)

    finally:
        tt = time.time() - lasted
        logger.warning("Stopping after [%f] secs (%f minutes).", tt, tt / 60.)
Пример #4
0
            return lambda x: _print_with_options(dir(module))

    # what to do but really?
    return what_to_do


#================
# main
#================

if __name__ == "__main__":
    # Script entry point: check the system, build the environment from the
    # settings, then run the action selected on the command line.
    check_system()

    # start timestamp; must stay untouched so that the duration reported in
    # `finally` is also correct when an exception was handled
    lasted = time.time()
    env = utils.environment()
    env.update(settings)

    try:
        what_to_do = parse_command_line(env)
        what_to_do(env)
    except SystemExit:
        raise
    except Exception as e:
        logger.exception("An exception occurred, ouch:\n%s", str(e))
        utils.log_stacktrace_to_file(settings["exception_file"], e)

    finally:
        tt = time.time() - lasted
        logger.warning("Stopping after [%f] secs (%f minutes).", tt, tt / 60.)
Пример #5
0
def analyse_one_page_generic(env, pos, file_str, logger, page_to_dict,
                             add_to_index):
    """
    Convert one page file into a metadata dict and pass it to the indexer.

    The file is read as UTF-8, converted with ``page_to_dict``, optionally
    filtered by the ``index_only_fields`` / ``not_index_only_fields``
    settings and then handed to ``add_to_index``. When the page was added and
    ``env["indexer"]["continue"]`` is set, ``file_str`` is appended to that
    file.

    The page is skipped (nothing is indexed) when ``pos > env.count``, when
    ``shared_variable[0] > 0``, when the converted dict has no ``id`` key or
    when it contains no ``math`` entries.

    :param env: settings object; read through item access, ``get_value`` and
        the ``count`` / ``verbose`` attributes.
    :param pos: position of this page in the run.
    :param file_str: path of the page file to process.
    :param logger: logger used for progress and error messages.
    :param page_to_dict: callable ``(html, env, pos, file_str)`` returning the
        metadata dict of the page.
    :param add_to_index: callable ``(env, meta_dict, file_str, logger)``; a
        truthy result means the page was added.
    :return: always ``None``.
    :raises SystemExit: logged, written to ``env["exception_file"]`` and
        re-raised.
    """
    global shared_variable
    try:
        all_start = time.time()
        if pos > env.count or shared_variable[0] > 0:
            return

        # Pause while either the generic or the dataset specific wait file
        # exists; polling only the generic one would skip the pause whenever
        # just the dataset specific file is present.
        wait_file = env["parallel"]["wait_file"]
        dataset_wait_file = wait_file + "_" + env.get_value("dataset", "")
        if os.path.exists(wait_file) or os.path.exists(dataset_wait_file):
            logger.warning(u"Waiting because wait file is here...")
            while (os.path.exists(wait_file)
                   or os.path.exists(dataset_wait_file)):
                time.sleep(env["parallel"]["wait_sleep"])

        logger.warning(u"Processing [%s][%s]", pos, file_str)

        # read the page source
        with codecs.open(file_str, mode="r", encoding="utf-8") as fin:
            html = fin.read()

        if env.verbose:
            logger.warning(u"Adding [%s] [%s]", pos, file_str)
        s = time.time()
        meta_dict = page_to_dict(html, env, pos, file_str)
        _logger_suspicious.debug(u"page_to_dict lasted [%s]", time.time() - s)
        # the raw page is not needed any more
        del html

        if u"id" not in meta_dict:
            logger.error(u"Invalid page: could not find element [id]... %s",
                         file_str)
            return

        if env.verbose > 0:
            # log every field except the (large) text ones
            logger.info(u"".join(
                u"%s: %s\n" % (k, v)
                for k, v in meta_dict.items()
                if not k.startswith(u"text")))

        formulas = meta_dict.get("math")
        if not formulas:
            # a missing title must not turn this warning into a KeyError
            logger.warning(u"No math inside this page! [%s][%s]",
                           meta_dict["id"], meta_dict.get("title"))
            return
        num_formulas = len(formulas)

        # index only
        if len(env.get_value("index_only_fields", [])) > 0:
            only_fields = env["index_only_fields"]
            for k in list(meta_dict.keys()):
                if k not in only_fields:
                    meta_dict[k] = None
        # index except only
        if len(env.get_value("not_index_only_fields", [])) > 0:
            for k in env["not_index_only_fields"]:
                meta_dict[k] = None

        # index it
        s = time.time()
        added = add_to_index(env, meta_dict, file_str, logger)
        _logger_suspicious.debug(u"Adding lasted [%s]", time.time() - s)
        del meta_dict

        if added:
            # remember the processed file in the "continue" file, if configured
            fname = env["indexer"].get("continue", False)
            if fname:
                s = time.time()
                with codecs.open(fname, encoding="utf-8", mode="a+") as fout:
                    fout.write(u"%s\n" % file_str)
                _logger_suspicious.debug(u"Continue writing lasted [%s]",
                                         time.time() - s)

        took = time.time() - all_start
        logger.warning(u"Processed [%s][formulas:%s][%s][lasted:%s]", pos,
                       num_formulas, os.path.basename(file_str), took)
        return

    except SystemExit as e:
        logger.exception(u"Thread SystemExit exception in %s:", file_str)
        utils.log_stacktrace_to_file(env["exception_file"], e)
        raise
Пример #6
0
        s = time.time()
        added = add_to_index(env, meta_dict, file_str, logger)
        _logger_suspicious.debug(u"Adding lasted [%s]", time.time() - s)
        del meta_dict

        if added and env["indexer"].get("continue", False):
            fname = env["indexer"]["continue"]
            s = time.time()
            with codecs.open(fname, encoding="utf-8", mode="a+") as fout:
                fout.write(u"%s\n" % file_str)
            _logger_suspicious.debug(u"Continue writing lasted [%s]",
                                     time.time() - s)

        took = time.time() - all_start
        logger.warning(u"Processed [%s][formulas:%s][%s][lasted:%s]", pos,
                       num_formulas, os.path.basename(file_str), took)
        return

    except SystemExit, e:
        logger.exception(u"Thread SystemExit exception in %s:" % file_str)
        utils.log_stacktrace_to_file(env["exception_file"], e)
        raise
    except KeyboardInterrupt:
        logger.critical(u"Got keyboard exception - exiting!")
        shared_variable[0] = 1
    except Exception, e:
        logger.exception(u"Thread exception in %s:" % file_str)
        utils.log_stacktrace_to_file(env["exception_file"], e)
        if env.debug:
            raise