Example #1
def initiate_run():
    try:
        log.getLogger().info(env.printEnvironment())
        env.init()
        log.getLogger().info(env.printConf())
        continue_run = True

        dependencies_dict = {
            "env": env,
            "factory": factory,
            "es_extract": es_extract,
            "pipe_process": pipe_process,
            "utils": utils,
            "dfutils": dfutils,
        }

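        # Run extraction and the pipeline repeatedly until continue_run goes
        # false, or just once when the job is not configured to run forever.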
        while continue_run:
            es_extract.initiate_extraction(pipeline.run_pipeline,
                                           dependencies_dict)
            continue_run = env.continue_run()
            if not env.run_forever():
                break
        log.getLogger().info(
            "#################### Run Completed :) #################### ")

    except Exception as e:
        msg = str(e)
        log.getLogger().error(env.print_traceback())
        log.getLogger().error(msg)
Example #2
    def perform(self, package: merm_model.PipelinePackage):
        if "job" not in package.any_analysis_dict["provider"]:
            raise Exception("This class will not work on " +
                            str(package.any_analysis_dict["provider"]))
        df = package.corpus
        log.getLogger().info("Shape of DF: " + str(df.shape))
        jobs_dict = {}

        for index, row in df.iterrows():
            majorFinal = row["majorFinal"]
            if majorFinal is None:
                jobs_string = row["jobFinal"]

                if jobs_string in jobs_dict:
                    jobs_dict[jobs_string] += 1
                else:
                    jobs_dict[jobs_string] = 1
        package.any_analysis_dict["no_major_jobs_count"] = jobs_dict

        # Use a fresh dict so the overall count does not mutate the dict
        # already stored under "no_major_jobs_count".
        jobs_dict = {}
        for index, row in df.iterrows():
            jobs_string = row["jobFinal"]

            if jobs_string in jobs_dict:
                jobs_dict[jobs_string] += 1
            else:
                jobs_dict[jobs_string] = 1
        package.any_analysis_dict["jobs_count"] = jobs_dict

        return package
Example #3
    def _run_lda(self, topic_count, report_word_count, permitted_overlap, package:merm_model.PipelinePackage):
        topic_dict = {}
        topic_dict_friendly = {}
        lda_model = gensim.models.ldamodel.LdaModel(corpus=package.corpus,
                                                    id2word=package.dict,
                                                    num_topics=topic_count,
                                                    update_every=1,
                                                    alpha='auto',
                                                    per_word_topics=False,
                                                    iterations=100)

        topics = lda_model.show_topics(formatted=False, num_words=report_word_count)
        for index, topic in topics:
            # print('Topic: {} \nWords: {}'.format(index, [w[0] for w in topic]))
            words_for_topic = []
            words_for_topic_friendly = []
            for w in topic:
                words_for_topic.append((w[0], w[1]))
                words_for_topic_friendly.append(str(w[0]) + "," + str(w[1]))
            topic_dict[index] = words_for_topic
            topic_dict_friendly[index] = words_for_topic_friendly

        topic_overlap = self._topic_overlap(topic_dict)
        log.getLogger().info(str(topic_overlap))
        stop_words = self._dynamic_stop_words(topic_overlap, permitted_overlap)
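        # Terms that recur across too many topics are treated as corpus-specific
        # stop words: remove them, rebuild the corpus, and rerun LDA recursively.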
        if len(stop_words) > permitted_overlap:
            log.getLogger().info("\n**********\nRerunning LDA after removing " + str(len(stop_words)) + " words")
            package = self._remove_stop_words(stop_words,package)
            package = self._rebuild_corpus(package)
            return self._run_lda(topic_count,report_word_count,permitted_overlap,package)
        package.any_analysis_dict[lda_analysis_key(package) + "_topic_overlap"] = topic_overlap
        package.any_analysis_dict[lda_analysis_key(package)] = topic_dict
        package.any_analysis_dict[lda_analysis_key(package) + "_friendly"] = topic_dict_friendly
        return package
Example #4
    def perform(self, package:data_models.PipelinePackage):
        doc_list = []
        term_dict = {}
        for linked_doc in package.linked_document_list:
            doc_list.append(linked_doc.tokens)
            term_dict[linked_doc.any_inputs["terms"]] = 1

        model = gensim.models.Word2Vec(
            doc_list,
            size=100,
            window=10,
            min_count=2,
            workers=5,
            iter=10)
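        # Note: size/iter here (and wv.index2entity below) are gensim 3.x names;
        # gensim 4+ renamed them to vector_size, epochs and wv.index_to_key.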


        for terms in list(term_dict.keys()):
            term_list = terms.split(" ")
            for term in term_list:
                if term in list(model.wv.index2entity):

                    result = model.wv.most_similar(positive=term)
                    output = "\n_____ " + term + " _____\n"
                    for rel in result:
                        output = output + rel[0] + "\t" + str(rel[1]) + "\n"

                    output = output + "\n - - -\n"
                    log.getLogger().info(output)
        return package
Example #5
    def perform(self, package: merm_model.PipelinePackage):
        if "job" not in package.any_analysis_dict["provider"]:
            raise Exception("This class will not work on " +
                            str(package.any_analysis_dict["provider"]))
        df = package.corpus
        log.getLogger().info("Shape of DF: " + str(df.shape))
        areas_of_study_dict_undefined = {}

        for index, row in df.iterrows():
            majorFinal = row["majorFinal"]
            if majorFinal is None:
                areas_of_study = row["areasOfStudy"]
                if len(areas_of_study) > 0:
                    areasOfStudyList = areas_of_study.split(",")
                    for s in areasOfStudyList:
                        if s in areas_of_study_dict_undefined.keys():
                            areas_of_study_dict_undefined[
                                s] = areas_of_study_dict_undefined[s] + 1
                        else:
                            areas_of_study_dict_undefined[s] = 1
        package.any_analysis_dict[
            "undefined_areas_of_study_count"] = areas_of_study_dict_undefined

        areas_of_study_dict = {}
        for index, row in df.iterrows():
            majorFinal = row["majorFinal"]

            if majorFinal in areas_of_study_dict.keys():
                areas_of_study_dict[
                    majorFinal] = areas_of_study_dict[majorFinal] + 1
            else:
                areas_of_study_dict[majorFinal] = 1
        package.any_analysis_dict["areas_of_study_count"] = areas_of_study_dict

        return package
Example #6
def retrieve_index_registry():
    es = connectToES()
    results = es.indices.get('*')
    indices = results.keys()
    log.getLogger().info("%d spaces found" % len(indices))
    for key in indices:
        log.getLogger().info("Space found: " + str(key))
    return indices
Example #7
def run_post_process(package: merm_model.PipelinePackage):
    log.getLogger().info("save text rank results to file")
    path = env.config["job_instructions"]["output_folder"]
    text_rank_results = package.any_analysis_dict["text_rank_all_groups"]
    text_rank_overall = package.any_analysis_dict["text_rank_0"]
    # count = 0
    # for key in text_rank_results:
    #
    #     analysis = text_rank_results[key]
    #     if "ict" in  type(analysis).__name__:
    #         file_name = path +"/" + "TextRank_" + str(key) + ".csv"
    #         log.getLogger().info("Saving "+ file_name)
    #         with open(file_name, 'w') as f:
    #             for k in analysis.keys():
    #                 for sentence in analysis[k]:
    #                     count = count + 1
    #                     f.write("%s,%s,%s\n" % (k, sentence[0], sentence[1]))
    toes = env.config.getboolean("job_instructions", "output_to_elasticsearch")

    if toes:
        _reset_index(package)
        _dispatch_to_elastic_search_all_groups(
            text_rank_results, package.any_analysis_dict["provider"])
        _dispatch_to_elastic_search(text_rank_overall,
                                    package.any_analysis_dict["provider"])
Example #8
def _generate_json_and_dispatch(linked_doc:merm_model.LinkedDocument):
    es = es_conn.connectToES()
    index_name = linked_doc.index_name
    log.getLogger().debug("Dispatching: " + str(linked_doc.uid) + " | " + index_name)

    result = es.update(index=index_name, doc_type='_doc', id=linked_doc.uid, body=_generate_json())
    log.getLogger().debug("Dispatched with result " + str(result))
Example #9
    def perform(self, package: merm_model.PipelinePackage):
        package.corpus.to_csv(
            env.config['job_instructions']['es_file_location'], index=False)
        log.getLogger().info(
            "Saved ElasticSearch Data as CSV at: " +
            env.config['job_instructions']['es_file_location'])
        return package
Example #10
def run_post_process(package: merm_model.PipelinePackage):
    log.getLogger().info("rake post process")

    keywords_dict = package.any_analysis_dict["rake"]

    sorted_keywords_dict = _sortKeywords(keywords_dict)
    _saveToFile(sorted_keywords_dict)
Example #11
    def perform(self, package: merm_model.PipelinePackage):

        df = package.corpus
        log.getLogger().info("Shape of DF: " + str(df.shape))
        groupby_dict = {}
        column = package.dependencies_dict["env"].config["ml_instructions"][
            "df_groupby_column"]

        count = 0
        for index, row in df.iterrows():
            count += 1
            if count % 1000 == 0:
                sys.stdout.write(".")
            jobs_string = row[column]

            if jobs_string in groupby_dict:
                groupby_dict[jobs_string] += 1
            else:
                groupby_dict[jobs_string] = 1
        package.log_stage(
            "Broke a pandas data frame into a dict of data grouped by " +
            str(column))
        package.any_analysis_dict["group_by_" + column] = groupby_dict

        return package
Example #12
def run_post_process(package: merm_model.PipelinePackage):
    log.getLogger().info("run_post_process")
    csv_list_of_lists = []
    csv_list_of_lists.append(["index_name", "topic_id", "term", "weight"])
    report_sentences = env.config.getboolean(
        'ml_instructions', 'gensim_lda_report_sentence_level')
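    # One report block per index/subset; term/weight rows are also collected
    # for the CSV export written at the end.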
    for idxname, topicdict in package.any_analysis().items():
        report_for_index = "\n\n\n+++++++++++++++++++\n\nReport for " + idxname + "\n\n"
        docs_list = package.linked_document_list[idxname]
        if report_sentences == True:
            corpus_as_sentences = break_corpus_as_sentences(docs_list)
        report_for_index += "Corpus Size: " + str(len(docs_list)) + "\n"
        if len(docs_list) > 100:
            for topicid, topiclist in topicdict["default_analysis_key"].items():
                report_for_index += "\n\nTOPIC:" + str(topicid) + "\n"

                for entry in topiclist:
                    report_for_index += str(entry[0])
                    report_for_index += "\t\t\t"
                    report_for_index += str(entry[1])
                    report_for_index += "\n"
                    csv_list_of_lists.append(
                        [idxname, topicid, entry[0], entry[1]])
                if report_sentences == True:
                    salient_sentences = find_salient_sentences(
                        topiclist, corpus_as_sentences)
                    report_for_index += "\n\nSALIENT_SENTENCES\n"
                    for sentence in salient_sentences:
                        report_for_index += sentence + "\n"

            log.getReportLogger().info(report_for_index)
    _save_topic_model(package)
    _save_csv(csv_list_of_lists, "lda_analysis_by_subset")
Example #13
    def perform(self, package: merm_model.PipelinePackage):
        utils = package.dependencies_dict["utils"]
        colutils = package.dependencies_dict["colutils"]
        env = package.dependencies_dict["env"]
        embeddings_file = env.config["ml_instructions"][
            "text_rank_embeddings_file"]
        dimensions = env.config.getint("ml_instructions", "glove_dimensions")

        word_embeddings_list = self._word_embeddings(embeddings_file)

        #sentences = package.dependencies_dict["utils"].corpus_as_sentence_list(package)
        tokenized_sentences_by_doc = utils.corpus_as_tokenized_sentence_linked_doc_list_grouped_by_doc(
            package, True)
        log.getLogger().info("we have " +
                             str(len(tokenized_sentences_by_doc)) + " docs")
        rank_by_dict = self._prep_rank_by_doc_dict(package)
        count = 0
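        # Rank each document's sentences via TextRank over the GloVe embeddings,
        # then bucket the top-ranked sentences by rank position across documents.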
        for docid, sentences in tokenized_sentences_by_doc.items():
            sentence_by_rank_dict = self.rank_by_document(
                sentences, word_embeddings_list, package, dimensions)
            for key, value in sentence_by_rank_dict.items():
                sentence_list_for_that_rank = rank_by_dict[key]
                sentence_list_for_that_rank.append([dimensions, docid, value])
            if count % 100 == 0:
                print(count)
            count = count + 1
        analysis_key = colutils.incrementing_key("text_rank",
                                                 package.any_analysis_dict)
        package.any_analysis_dict[analysis_key] = rank_by_dict
        package.log_stage("Conducting text rank. Total document count is " + str(len(package.linked_document_list)) + \
                          ". For each document the top " + str(len(list(rank_by_dict.keys()))) + " ranked sentences were captured." + \
                          "\nGlove dimension count: " + str(dimensions))
        return package
Example #14
    def perform(self, package: merm_model.PipelinePackage):
        analysis_id = self._analysis_id(package)
        log.getLogger().info("K means predicting. Tea time")
        X = package.any_inputs_dict["SKX"]
        env = package.dependencies_dict["env"]
        test_range = env.config["ml_instructions"]["silhouette_range"].split(",")
        reporting_count = env.config.getint("ml_instructions","sklearn_kmeans_term_per_cluster_reporting_count")

        Xarray = X.toarray()
        silhouette_results = _silhouette(Xarray,test_range)
        cluster_count_tuple = max(silhouette_results, key=lambda x:x[1])
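        # cluster_count_tuple is the (cluster_count, score) pair with the best
        # silhouette score; that count becomes k for KMeans below.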

        skdict = package.any_inputs_dict["SKdict"]
        kmeans = KMeans(n_clusters=cluster_count_tuple[0], random_state=10)
        kmeans.fit_predict(Xarray)

        centers = kmeans.cluster_centers_.argsort()[:, ::-1]

        centroid_list = []
        centroid_list.append(["cluster","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16"])
        for i in range(cluster_count_tuple[0]):
            row_list = [i]
            for ind in centers[i, :reporting_count]:
                row_list.append(skdict[ind])

            centroid_list.append(row_list)


        cluster_list = []
        cluster_list.append(["cluster","sentence"])

        package.any_analysis_dict[analysis_id + "_top_terms"] = centroid_list
        package.any_inputs_dict["kmeans_top_terms_key"] = analysis_id + "_top_terms"
        package.log_stage("Kmeans Clustering, no repeats\nSilhouette : " + str(silhouette_results) + "\nCluster count : " + str(cluster_count_tuple))
        return package
Example #15
def pick_pipeline():
    pipeline_name = env.config["pipeline_instructions"]["pipeline_name"]
    log.getLogger().info(pipeline_name)

    if pipeline_name == "gensim_lda":
        return _gensim_lda_steps
    elif pipeline_name == "gensim_lda_by_subset":
        return _gensim_lda_by_subset_steps
    elif pipeline_name == "sklearn_lda":
        return _sklearn_lda_steps
    elif pipeline_name == "lda_topic_comparator":
        return _lda_topic_comparator_steps
    elif pipeline_name == 'save_as_csv':
        return _save_as_csv
    elif pipeline_name == '_job_integrity_analysis':
        return _job_integrity_analysis
    elif pipeline_name == '_group_by_column':
        return _group_by_column
    elif pipeline_name == '_rake':
        return _rake
    else:
        log.getLogger().warning(
            str(pipeline_name) +
            " is invalid. Please configure tools.ini and create a relevant list of steps within this script"
        )
        return []
Example #16
def run_pipeline(package: merm_model.PipelinePackage):
    log.getLogger().warning("------- STARTING PIPELINE -------")

    #create factory
    factory = package.dependencies_dict["pipe_process"].PipelineFactory()

    # specify steps
    pipeline_steps = pick_pipeline()
    log.getLogger().info(str(pipeline_steps))

    pipeline_steps.sort(key=lambda tup: tup[0])
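    # Each step is an (order, step) tuple; sorting on the first element fixes
    # the execution order before the factory dispatches each step.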

    # ...and we're off to the races :)
    for step_tuple in pipeline_steps:
        if env.continue_run():
            package = factory.next_step(step_tuple[1], package)
        else:
            log.getLogger().warning("Continue run is FALSE")

    log.getLogger().info("------- PIPELINE COMPLETED -------")

    # Post pipeline; This is where the data is no longer changing. Rather, the data is ready
    # for functional application.
    log.getLogger().warning("------- POST PROCESS APPLICATION -------")
    if env.continue_run():
        post_process.triage(package)
Example #17
def lemmatize_tokens(corpora_list: List[merm_model.LinkedDocument],
                     stop_words: List[str]):
    nlp = spacy.load('en_core_web_sm')
    stoplist = stop_words
    lemmatized_corpus = []
    iter_count = 0
    lemmatizer = WordNetLemmatizer()
    # log.getLogger().info("Lemmatizing corpus. This can be slow.")
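    # For each document: lemmatize every token, normalize it, and keep it only if
    # it is not a stop word, is longer than one character and contains no digits.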
    for doc in corpora_list:
        lemmatized_text = []
        for word in doc.tokens:
            # print("word: " + word)
            lemmatized_word = lemmatizer.lemmatize(word)
            if lemmatized_word is not None:
                cleanword = text_utils.clean_string_for_tokenizing(
                    lemmatized_word)
                if cleanword not in stoplist and len(
                        cleanword) > 1 and not text_utils.hasNumbers(
                            cleanword):
                    # print(cleanword)
                    lemmatized_text.append(cleanword)
        doc.tokens = lemmatized_text
        lemmatized_corpus.append(doc)
        iter_count += 1

        if env.test_env() and iter_count > env.test_env_doc_processing_count():
            log.getLogger().info("DEV MODE: Breaking loop here")
            break
    return lemmatized_corpus
Example #18
def col_names(df, df_name=""):
    colNames = df.columns.values
    cnstr = ""
    for cn in colNames:
        cnstr = str(cnstr) + "\n" + str(cn)

    log.getLogger().info(df_name + " Column Names: " + cnstr + "\n")
Example #19
def create_and_register_index(index_name:str, body_json):
    try:
        es = connectToES()
        es.indices.create(index=index_name, body=body_json)
    except Exception as e:
        s = str(e)
        log.getLogger().error("Could not create index. " + s)
Example #20
def run_post_process(package: merm_model.PipelinePackage):
    log.getLogger().info("run_post_process: Gensim LDA Report")
    report_string = ""
    report_sentences = env.config.getboolean(
        'ml_instructions', 'gensim_lda_report_sentence_level')
    csv_list_of_lists = []
    csv_list_of_lists.append(["index_name", "topic_id", "term", "weight"])
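    # One report block per topic; term/weight rows also feed the CSV export.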

    for topicid, topiclist in package.any_analysis().items():
        report_string += "\n\nTOPIC:" + str(topicid) + "\n"
        if report_sentences == True:
            corpus_as_sentences = break_corpus_as_sentences(
                package.linked_document_list)

        for entry in topiclist:
            report_string += str(entry[0])
            report_string += "\t\t\t"
            report_string += str(entry[1])
            report_string += "\n"
            csv_list_of_lists.append([
                package.any_analysis_dict["provider"], topicid, entry[0],
                entry[1]
            ])
            if report_sentences == True:
                salient_sentences = find_salient_sentences(
                    topiclist, corpus_as_sentences)
                report_string += "\n\nSALIENT_SENTENCES\n"
                for sentence in salient_sentences:
                    report_string += sentence + "\n"

        log.getReportLogger().info(report_string)
    _save_topic_model(package)
    _save_csv(csv_list_of_lists, "lda_topics_toplevel")
Example #21
    def perform(self, package: merm_model.PipelinePackage):
        analysis_id = self._analysis_id(package)
        log.getLogger().info("K means predicting. Tea time")
        X = package.any_inputs_dict["SKX"]
        env = package.dependencies_dict["env"]
        test_range = env.config["ml_instructions"]["silhouette_range"].split(",")

        Xarray = X.toarray()
        silhouette_results = _silhouette(Xarray,test_range)
        cluster_count_tuple = max(silhouette_results, key=lambda x:x[1])
        y = package.any_inputs_dict["SKY"]
        skdict = package.any_inputs_dict["SKdict"]
        cluster = AgglomerativeClustering(n_clusters=cluster_count_tuple[0], affinity='euclidean', linkage='ward')

        result = cluster.fit_predict(X.toarray())
        labels = cluster.labels_
        cluster_list = []
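        # Pair each document's cluster label with its raw sentence text.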
        for j in range(labels.shape[0]):
            row_list = []
            sentence = package.linked_document_list[j].raw
            cluster = labels[j]
            row_list.append(cluster)
            row_list.append(sentence)
            cluster_list.append(row_list)

        package.any_analysis_dict[analysis_id+"_result"] = cluster_list
        package.log_stage("Agglomerative Clustering\nSilhouette : " + str(silhouette_results) + "\nCluster count : " + str(cluster_count_tuple))
        return package
Example #22
def delete_index(index_name):
    try:
        es = connectToES()
        es.indices.delete(index=index_name, ignore=[400, 404])
    except Exception as e:
        msg = "WARN: " +  str(e)
        log.getLogger().error(msg)
Example #23
def _process_major_final(package):
    aggregated_majors_path = env.config["local_data"][
        "aggregated_majors_filepath"]
    with open(aggregated_majors_path) as json1_file:
        aggregated_majors_dict = json.loads(json1_file.read())
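    # The file maps upper-cased areas of study to an aggregated major; rows with
    # no majorFinal are filled from it and the update is pushed to Elasticsearch.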

    for index, row in package.corpus.iterrows():
        majorFinal = row["majorFinal"]

        if majorFinal is None:
            areas_of_study = row["areasOfStudy"]
            if len(areas_of_study) > 0:
                areasOfStudyList = areas_of_study.split(",")
                for s in areasOfStudyList:
                    supper = s.upper()
                    if supper in aggregated_majors_dict and majorFinal is None:
                        major_final_from_file = aggregated_majors_dict[supper]
                        package.corpus.loc[
                            index, "majorFinal"] = major_final_from_file
                        log.getLogger().info(major_final_from_file)
                        log.getLogger().info(
                            "added to  df: " +
                            str(package.corpus.loc[index, "majorFinal"]))
                        majorFinal = major_final_from_file
                        doc_id = row["id"]
                        _generate_json_and_dispatch(doc_id, row["indexname"],
                                                    major_final_from_file)
Example #24
def triage(package: merm_model.PipelinePackage):
    instructions = env.config["pipeline_instructions"]["post_process"]

    instruction_list = instructions.split(",")
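    # Dispatch each configured post-process step; names with no matching branch
    # are silently skipped.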
    for instruction in instruction_list:

        if instruction == "tfidf_partof_sentence_breakout":
            tfidf_breakout.run_post_process(package)
        elif instruction == "page_views_confluence":
            page_view_update.run_post_process(package)
        elif instruction == "gensim_lda_report_by_subset":
            gensim_lda_report_by_subset.run_post_process(package)
        elif instruction == "gensim_lda_report":
            gensim_lda_report.run_post_process(package)
        elif instruction == "tfidf_log_text_detector":
            log_detector.run_post_process(package)
        elif instruction == "gensim_lda_report_topic_similarity":
            gensim_similarity_report.run_post_process(package)
        elif instruction == "save_dictionaries_to_file":
            save_dictionaries_to_file.run_post_process(package)

        elif instruction == "major_analysis":
            major_analysis.run_post_process(package)
        elif instruction == "rake":
            rake.run_post_process(package)
        elif instruction == "none":
            log.getLogger().info("Nothing to do. No post-process assigned.")
Example #25
    def _do_glove(self, package, cooccurrence_dict, dimensions, alpha, x_max,
                  vocab):
        glove_start = time.time()
        model = glove.Glove(cooccurrence_dict,
                            d=dimensions,
                            alpha=alpha,
                            x_max=x_max)
        glove_time = (time.time() - glove_start)
        log.getLogger().info("glove_time  " + str(glove_time))
        glove_train_start = time.time()
        model.train(batch_size=200, workers=9)
        glove_train_time = (time.time() - glove_train_start)
        log.getLogger().info("glove_train_time  " + str(glove_train_time))
        glove_list = self.output_format(model.W, vocab)
        glove_output_key = str(dimensions) + "d_" + str(x_max) + "_" + str(
            alpha) + "_glove_output"

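        # Append this run's key to "glove_output_key" so downstream steps can
        # locate every GloVe result stored on the package.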
        if "glove_output_key" in package.any_inputs_dict.keys():
            package.any_inputs_dict[
                "glove_output_key"] = package.any_inputs_dict[
                    "glove_output_key"] + "," + glove_output_key
        else:
            package.any_inputs_dict["glove_output_key"] = glove_output_key

        package.any_analysis_dict[glove_output_key] = glove_list
        package.any_analysis_dict["gl0ve_vocab"] = vocab
Example #26
def _generate_json_and_dispatch(salient_corpus_map: Dict, retry_count=0):
    try:

        es = es_conn.connectToES()
        total_sentences=0
        for key, value in salient_corpus_map.items():
            sentence_list =  value[1]
            docid = key
            total_sentences = total_sentences + len(sentence_list)
            if len(sentence_list) > 0:
                linked_doc = _extract_linked_doc_from_list(value[1])
                index_name = linked_doc.index_name


                log.getLogger().debug("Dispatching: " + str(docid) + " | " + index_name)
                es.index(index=index_name + index_suffix, doc_type='_doc', id=key, body=_generate_json(linked_doc, _convert_linkeddoclist_to_string(sentence_list)))
    except Exception as e:
        retry_count = retry_count + 1
        msg = "WARN: " +  str(e)

        log.getLogger().error(msg)
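        # Retry only on timeout-style failures, giving up after 10 attempts.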
        if "time" in msg.lower() and retry_count < 10:
            _generate_json_and_dispatch(salient_corpus_map, retry_count)
        else:
            pass
Example #27
def delete_index(index_name):
    try:
        es_conn.delete_index(index_name)
        time.sleep(5)
    except Exception as e:
        msg = "WARN: " + str(e)
        log.getLogger().error(msg)
        pass
Example #28
def _extract_from_all_providers(es, pipe, dependencies_dict):
    providers = dependencies_dict["env"].config["extract_instructions"][
        "all_providers"]
    providers_list = providers.split(",")
    log.getLogger().debug("Extracting from all providers: " +
                          str(providers_list))
    for provider in providers_list:
        _extract_from_one_provider(es, provider, pipe, dependencies_dict)
Example #29
def run_post_process(package: merm_model.PipelinePackage):
    if env.continue_run():

        tfidf_top_terms: List[List[Tuple[str, float]]] = package.any_analysis()
        _validate_corpus(tfidf_top_terms, package.linked_document_list)
        _create_spaces()
        log.getLogger().info("Corpus size: " + str(len(package.linked_document_list)))
        _iterate_corpus(package)
def _validate_corpus(tfidf_top_terms: List[List[Tuple[str, float]]], linked_doc_list: List[merm_model.LinkedDocument]):
    docidx = 0
    for terms in tfidf_top_terms:
        linked_doc = linked_doc_list[docidx]
        for word, freq in terms:
            if word not in linked_doc.tokens:
                log.getLogger().error("NOT FOUND " + word)
                raise Exception("NOT FOUND " + word + ". NLP corpus out of sync with source corpus")
        docidx = docidx + 1