Example #1
def run_pipeline(package: merm_model.PipelinePackage):
    log.getLogger().warning("------- STARTING PIPELINE -------")

    #create factory
    factory = package.dependencies_dict["pipe_process"].PipelineFactory()

    # specify steps
    pipeline_steps = pick_pipeline()
    log.getLogger().info(str(pipeline_steps))

    pipeline_steps.sort(key=lambda tup: tup[0])

    # ...and we're off to the races :)
    for step_tuple in pipeline_steps:
        if env.continue_run():
            package = factory.next_step(step_tuple[1], package)
        else:
            log.getLogger().warning("Continue run is FALSE")

    log.getLogger().info("------- PIPELINE COMPLETED -------")

    # Post pipeline; This is where the data is no longer changing. Rather, the data is ready
    # for functional application.
    log.getLogger().warning("------- POST PROCESS APPLICATION -------")
    if env.continue_run():
        post_process.triage(package)
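run_pipeline above sorts the (order, step_name) tuples returned by pick_pipeline() and hands each step name to the factory. The sketch below illustrates that contract only; the step names and the PipelineFactory internals are assumptions, not taken from the source project.

# Hypothetical sketch of the step-dispatch contract; step names are placeholders.
def pick_pipeline():
    return [(2, "topic_model"), (1, "tokenize"), (3, "tfidf")]

class PipelineFactory:
    def __init__(self):
        # Map each step name to a callable that takes and returns a PipelinePackage.
        self._steps = {
            "tokenize": lambda package: package,
            "topic_model": lambda package: package,
            "tfidf": lambda package: package,
        }

    def next_step(self, step_name, package):
        # Each step consumes the package produced by the previous step.
        return self._steps[step_name](package)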
def _iterate_corpus(package: merm_model.PipelinePackage):
    count = 0
    try:
        for linked_doc in package.linked_document_list:
            if env.continue_run():
                #for each doc in the corpus
                #split the raw text into sentences. 1 LinkedDocument per sentence
                log.getLogger().debug(str(env.continue_run()))
                doc_by_sentence_list = _split_linked_doc_by_sentence(linked_doc)
                #tokenize and lemmatize the sentences
                doc_uid = linked_doc.uid
                doc_url = linked_doc.ui

                lemmatized_sentences = _lemmatize_sentences(doc_by_sentence_list)
                if len(lemmatized_sentences) > 2000:
                    lemmatized_sentences = lemmatized_sentences[:2000]
                #startmsg = "\n\n" + doc_uid + " | " + linked_doc.ui + " | length: " + str(len(lemmatized_sentences)) + "\n\n"
                #log.getLogger().info(startmsg)

                salient_corpus_map = _generate_partof_docs(package, lemmatized_sentences, doc_uid, doc_url)
                endmsg = "\n\n" + str(count) + ": Dispatching " + str(len(salient_corpus_map)) + " parts from " + doc_url + ".\n\n"
                log.getLogger().debug(endmsg)
                _generate_json_and_dispatch(salient_corpus_map, 0)
                count = count + 1
                if count % 300 == 0:
                    log.getLogger().info("running " + str(count))
    except Exception as e:
        msg = "\n\nERROR: " + str(e)
        log.getLogger().error(msg)
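_iterate_corpus depends on helpers that are not shown here. The fragment below is a minimal sketch of what _split_linked_doc_by_sentence is assumed to do, producing one LinkedDocument per sentence; the raw-text attribute name and the naive split are illustrative assumptions.

import copy

def _split_linked_doc_by_sentence(linked_doc):
    # Assumed behavior: copy the parent document once per sentence.
    sentence_docs = []
    for sentence in linked_doc.raw.split(". "):  # naive sentence split, for illustration only
        sentence_doc = copy.deepcopy(linked_doc)
        sentence_doc.raw = sentence
        sentence_docs.append(sentence_doc)
    return sentence_docs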
Example #3
def initiate_run():
    try:
        log.getLogger().info(env.printEnvironment())
        env.init()
        log.getLogger().info(env.printConf())
        continue_run = True

        dependencies_dict = {}
        dependencies_dict["env"] = env
        dependencies_dict["factory"] = factory
        dependencies_dict["es_extract"] = es_extract
        dependencies_dict["pipe_process"] = pipe_process
        dependencies_dict["utils"] = utils
        dependencies_dict["dfutils"] = dfutils

        while continue_run:
            es_extract.initiate_extraction(pipeline.run_pipeline,
                                           dependencies_dict)
            continue_run = env.continue_run()
            if not env.run_forever():
                break
        log.getLogger().info(
            "#################### Run Completed :) #################### ")

    except Exception as e:
        msg = str(e)
        log.getLogger().error(env.print_traceback())
        log.getLogger().error(msg)
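initiate_run hands pipeline.run_pipeline to es_extract.initiate_extraction as a callback. The sketch below shows only the assumed shape of that handoff; the batch source is a stand-in, and the PipelinePackage construction mirrors Example #7 below rather than the real es_extract code.

def initiate_extraction(pipeline_callback, dependencies_dict, batches=()):
    # Stand-in for the Elasticsearch extraction loop: one document batch per pipeline run.
    for linked_document_list in batches:
        package = merm_model.PipelinePackage(None, None, None, None, {}, {}, dependencies_dict)
        package.linked_document_list = linked_document_list  # attribute used by run_post_process
        pipeline_callback(package)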
def run_post_process(package: merm_model.PipelinePackage):
    if env.continue_run():

        tfidf_top_terms: List[List[Tuple[str, float]]] = package.any_analysis()
        _validate_corpus(tfidf_top_terms, package.linked_document_list)
        _create_spaces()
        log.getLogger().info("Corpus size: " + str(len(package.linked_document_list)))
        _iterate_corpus(package)
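run_post_process validates the TF-IDF analysis against the corpus before iterating it. _validate_corpus is not shown in the source; the sketch below assumes it only checks that there is one term list per document, which is a guess at its intent.

from typing import List, Tuple

def _validate_corpus(tfidf_top_terms: List[List[Tuple[str, float]]], linked_document_list):
    # Assumed check: one (term, weight) list per LinkedDocument in the corpus.
    if len(tfidf_top_terms) != len(linked_document_list):
        raise ValueError("Term lists (" + str(len(tfidf_top_terms)) +
                         ") do not match corpus size (" + str(len(linked_document_list)) + ")")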
Example #5
def step_through(package: merm_model.PipelinePackage, pipeline_steps,
                 log_string):

    factory = package.dependencies_dict["pipe_process"].PipelineFactory()
    for step_tuple in pipeline_steps:
        start_time = time.time()
        if env.continue_run():
            package = factory.next_step(step_tuple[1], package)
            end_time = time.time() - start_time
            log.getLogger().info("Time to complete: " + str(end_time))
            log_string = (log_string + "\n\n------------\n\n" + step_tuple[1] +
                          "\n\n" + package.stage_log() + "\nTime: " + str(end_time))
        else:
            log.getLogger().warning("Continue run is FALSE")
        package.log_stage(log_string)
    return package
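A minimal usage sketch of step_through, assuming a package whose dependencies_dict already provides pipe_process and a step list in the same (order, step_name) form pick_pipeline() returns elsewhere in these examples; the step names are placeholders.

# Hypothetical call site for step_through.
pipeline_steps = [(2, "tfidf"), (1, "tokenize")]
pipeline_steps.sort(key=lambda tup: tup[0])          # run steps in declared order
package = step_through(package, pipeline_steps, "")  # empty log_string to start
print(package.stage_log())                           # accumulated per-step log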
Example #6
def run_pipeline(package: merm_model.PipelinePackage):
    log.getLogger().warning("------- STARTING PIPELINE -------")
    env = package.dependencies_dict["env"]

    report_dir = env.config["job_instructions"]["output_folder"]
    provider = env.config["extract_instructions"]["provider"]
    pipeline_name = env.config["pipeline_instructions"]["pipeline_name"]
    queryvalue = env.config["extract_instructions"]["query_value"]
    dt = datetime.now()
    suffix = str(dt.microsecond)[-4:]

    file_name = package.dependencies_dict["utils"].clean_string_for_tokenizing(
        provider + "_" + pipeline_name + "_" + queryvalue + "_" +
        suffix).replace(" ", "_") + ".txt"

    # the factory is instantiated inside step_through

    # specify steps
    pipeline_steps = pick_pipeline()
    log.getLogger().info(str(pipeline_steps))

    pipeline_steps.sort(key=lambda tup: tup[0])
    log_string = ""

    # ...and we're off to the races :)
    package = step_through(package, pipeline_steps, log_string)
    if "current_loop" in package.any_inputs_dict.keys():
        current_loop = package.any_inputs_dict["current_loop"]
        while current_loop < package.any_inputs_dict["loop_count"]:
            current_loop = package.any_inputs_dict["current_loop"]
            package = step_through(package, pipeline_steps,
                                   package.stage_log())

    env.overwrite_file(report_dir + "/" + file_name, package.stage_log())
    log.getLogger().info("------- PIPELINE COMPLETED -------")

    # Post pipeline; This is where the data is no longer changing. Rather, the data is ready
    # for functional application.
    log.getLogger().warning("------- POST PROCESS APPLICATION -------")
    if env.continue_run():
        post_process.triage(package)
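The while loop in this version of run_pipeline re-runs step_through until any_inputs_dict["current_loop"] reaches any_inputs_dict["loop_count"], so some step inside the pipeline is expected to advance that counter. The class below is a hypothetical example of such a step, not one taken from the source.

class LoopCounterStep:
    # Hypothetical step that advances the counter run_pipeline reads between passes.
    def perform(self, package: merm_model.PipelinePackage):
        current_loop = package.any_inputs_dict.get("current_loop", 0)
        package.any_inputs_dict["current_loop"] = current_loop + 1
        return package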
Example #7
def initiate_run():
    try:
        log.getLogger().info(env.printEnvironment())
        env.init()
        log.getLogger().info(env.printConf())
        continue_run = True

        dependencies_dict = {}
        dependencies_dict["env"] = env
        dependencies_dict["factory"] = factory
        dependencies_dict["es_extract"] = es_extract
        dependencies_dict["pipe_process"] = pipe_process
        dependencies_dict["utils"] = utils
        dependencies_dict["dfutils"] = dfutils
        dependencies_dict["colutils"] = colutils
        dependencies_dict["log"] = log
        dependencies_dict["es_conn"] = es_conn
        dependencies_dict["ingestor"] = ingestor
        dependencies_dict["syntax"] = syntax

        log.getLogger().info("Dependencies: ")
        for k, v in dependencies_dict.items():
            log.getLogger().info(str(k) + " : " + str(v))
        while continue_run:
            package = merm_model.PipelinePackage(None, None, None, None, {}, {}, dependencies_dict)
            package.any_analysis_dict["stage_log"] = ""
            pipeline.run_pipeline(package)

            continue_run = env.continue_run()
            if not env.run_forever():
                break
        log.getLogger().info("#################### Run Completed :) #################### ")

    except Exception as e:
        msg = str(e)
        log.getLogger().error(env.print_traceback())
        log.getLogger().error(msg)
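This version seeds any_analysis_dict["stage_log"] before the first run, and step_through later reads it back via stage_log() and appends via log_stage(). The snippet below is only an assumption about how those two methods wrap that dictionary entry; the real PipelinePackage implementation is not shown in these examples.

class PipelinePackage:
    # Assumed accessors over any_analysis_dict["stage_log"]; illustrative only.
    def stage_log(self):
        return self.any_analysis_dict.get("stage_log", "")

    def log_stage(self, text):
        self.any_analysis_dict["stage_log"] = text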