Example #1
    def perform(self, package: merm_model.PipelinePackage):
        mfst = package.dependencies_dict["factory"].PipelineManifest.manifest

        #breaks corpus into subsets
        grouped_doc_package = mfst["SubsetData"].perform(package)
        if ("ackage" in type(grouped_doc_package).__name__):
            log.getLogger().info("STRUCTURE after SubsetData:" +
                                 grouped_doc_package.structure())
        else:
            log.getLogger().warning(
                "The return type is not of type PipelinePackage. THIS IS BAD PRACTICE :("
            )
        log_string = "\n\n_______________________\nPerforming LDA on subsets\n"
        grouped_linked_docs = grouped_doc_package.linked_document_list

        lda_models_by_group = {}
        lda_corpus_by_group = {}
        lda_dict_by_group = {}
        lda_analysis_by_group = {}
        minimum_doc_count = package.dependencies_dict["env"].config.getint(
            'ml_instructions', 'minimum_doc_count')

        dict_for_group_processing = {}
        dict_for_group_processing["grouped_linked_docs"] = grouped_linked_docs
        dict_for_group_processing["lda_models_by_group"] = lda_models_by_group
        dict_for_group_processing["lda_corpus_by_group"] = lda_corpus_by_group
        dict_for_group_processing["lda_dict_by_group"] = lda_dict_by_group
        dict_for_group_processing[
            "lda_analysis_by_group"] = lda_analysis_by_group
        stop_words = package.dependencies_dict[
            "utils"]._stop_word_list_generator(package)

        for sub_corpus_name, doc_list in grouped_linked_docs.items():
            package_one_group = merm_model.PipelinePackage(
                lda_models_by_group, lda_corpus_by_group, lda_dict_by_group,
                grouped_linked_docs[sub_corpus_name], {},
                package.any_inputs_dict, package.dependencies_dict)
            package_one_group.any_analysis_dict["stop_words"] = stop_words
            if len(doc_list) >= minimum_doc_count:
                msg = "\n Subset: " + str(sub_corpus_name) + "\n\n"
                log.getLogger().info(msg)
                self._analyze_subset(package_one_group,
                                     dict_for_group_processing,
                                     str(sub_corpus_name), mfst)
                log_string = log_string + package_one_group.stage_log()

        self._set_analysis(package, lda_analysis_by_group)
        self._set_model(package, lda_models_by_group)

        new_package = merm_model.PipelinePackage(
            package.model, lda_corpus_by_group, lda_dict_by_group,
            package.linked_document_list, package.any_analysis_dict,
            package.any_inputs_dict, package.dependencies_dict)
        new_package.log_stage(log_string)
        return new_package
Example #2
    def perform(self, package: merm_model.PipelinePackage):
        mfst = package.dependencies_dict["factory"].PipelineManifest.manifest

        #breaks corpus into subsets
        grouped_doc_package = mfst["SubsetData"].perform(package)
        if ("ackage" in type(grouped_doc_package).__name__):
            log.getLogger().info("STRUCTURE after SubsetData:" +
                                 grouped_doc_package.structure())
        else:
            log.getLogger().warning(
                "The return type is not of type PipelinePackage. THIS IS BAD PRACTICE :("
            )

        grouped_linked_docs = grouped_doc_package.linked_document_list
        analysis_by_group_rake = {}
        analysis_by_group_text_rank = {}
        analysis_by_group_noun_phrase = {}
        minimum_doc_count = package.dependencies_dict["env"].config.getint(
            'ml_instructions', 'minimum_doc_count')
        log_string = "\n======================\nSubset Analysis for text rank, rake and noun phrase.\n"
        for sub_corpus_name_untyped, doc_list in grouped_linked_docs.items():
            sub_corpus_name = str(sub_corpus_name_untyped)
            if len(doc_list) > minimum_doc_count:
                package_one_group = merm_model.PipelinePackage(
                    package.model, package.corpus, package.dict, doc_list, {},
                    package.any_inputs_dict, package.dependencies_dict)
                package_one_group.any_inputs_dict[
                    "corpus_name"] = sub_corpus_name
                package_one_group = self._analyze_subset(
                    package_one_group, sub_corpus_name, mfst, doc_list)
                analysis_by_group_text_rank[
                    sub_corpus_name] = package_one_group.any_analysis_dict[
                        "text_rank_0"]

                log_string = log_string + package_one_group.stage_log()

        package.any_analysis_dict[
            "text_rank_all_groups"] = analysis_by_group_text_rank
        package.any_analysis_dict["rake_all_groups"] = analysis_by_group_rake
        package.any_analysis_dict[
            "noun_phrase_all_groups"] = analysis_by_group_noun_phrase
        new_package = merm_model.PipelinePackage(package.model, package.corpus,
                                                 package.dict,
                                                 grouped_linked_docs,
                                                 package.any_analysis_dict,
                                                 package.any_inputs_dict,
                                                 package.dependencies_dict)

        new_package.log_stage(log_string)
        return new_package
Example #3
    def perform(self, package: merm_model.PipelinePackage):

        linked_doc_by_index = {}
        slackProvider = "slack"
        slack_channels = self._retrieve_slack_channel_names()

        for linked_doc in package.linked_document_list:
            if slackProvider in linked_doc.provider:
                self._process_slack_doc(linked_doc, linked_doc_by_index,
                                        slack_channels)
            else:
                if linked_doc.index_name in linked_doc_by_index:
                    linked_doc_by_index[linked_doc.index_name].append(
                        linked_doc)
                else:
                    groupby_list = []
                    groupby_list.append(linked_doc)
                    linked_doc_by_index[linked_doc.index_name] = groupby_list

        new_package = merm_model.PipelinePackage(package.model, package.corpus,
                                                 package.dict,
                                                 linked_doc_by_index,
                                                 package.any_analysis,
                                                 package.dependencies_dict)
        return new_package
Example #4
    def perform(self, package: merm_model.PipelinePackage):
        lda_topics_by_subset_formatted = package.any_analysis_dict[
            "lda_topics_by_subset_formatted"]
        lda_topics_toplevel_formatted = package.any_analysis_dict[
            "lda_topics_toplevel_formatted"]
        similarity_dict = {}
        for source, topic_dict in lda_topics_toplevel_formatted.items():
            termidx_list = topic_dict["term_indices"]
            weight_list = topic_dict["weights"]
            tuples_list = list(zip(termidx_list, weight_list))
            result = self._similarity_score(lda_topics_by_subset_formatted,
                                            tuples_list)
            term_list = topic_dict["terms"]
            result_dict = {}
            result_dict["terms"] = term_list
            result_dict["spaces"] = result
            similarity_dict[source] = result_dict

        package.any_analysis_dict["similarity_dict"] = similarity_dict
        return merm_model.PipelinePackage(package.model, package.corpus,
                                          package.dict,
                                          package.linked_document_list,
                                          package.any_analysis_dict,
                                          package.any_inputs_dict,
                                          package.dependencies_dict)
Example #5
    def perform(self, package: merm_model.PipelinePackage):
        log.getLogger().info("Analyzing Gensim TF-IDF model")
        log.getLogger().info("Corpus size: " +
                             str(len(package.linked_document_list)))
        self._validate(package)
        idx = 0
        top_tf_idf_corpus = []

        for model_result in package.model[package.corpus]:
            top_tfidf_doc = []
            sorteddoc = sorted(model_result, key=itemgetter(1), reverse=True)
            linked_doc_source = package.linked_document_list[idx]
            for term_id, freq in sorteddoc[:10]:
                top_tfidf_doc.append(
                    (package.dict[term_id], np.around(freq, decimals=3)))

            str1 = "\n\n\n"
            log.getLogger().debug(str1)
            top_tf_idf_corpus.append(top_tfidf_doc)

            linked_doc_source.any_analysis = top_tfidf_doc
            idx = idx + 1

        package.any_analysis_dict[
            package.default_analysis_key()] = top_tf_idf_corpus

        return merm_model.PipelinePackage(package.model, package.corpus,
                                          package.dict,
                                          package.linked_document_list,
                                          package.any_analysis_dict,
                                          package.any_inputs_dict,
                                          package.dependencies_dict)
Example #6
    def perform(self, package: data_models.PipelinePackage):
        linked_doc_list = package.linked_document_list
        log.getLogger().info(
            "Converting corpora as bag of words. Input format is List[List[str]]. Output is Gensim Dictionary"
        )
        log.getLogger().info("Corpus size: " +
                             str(len(package.linked_document_list)))
        bowlist = []
        for doc in linked_doc_list:
            bowlist.append(doc.tokens)

        dictionary = corpora.Dictionary(bowlist)

        #log.getLogger().info(dictionary)
        log.getLogger().info("Incoming doc count: " +
                             str(len(linked_doc_list)))
        corpus = [dictionary.doc2bow(line) for line in bowlist]

        log.getLogger().info("Feature count: " + str(len(dictionary.id2token)))
        package.log_stage(
            "Converted the corpus into a Gensim dictionary (i.e., bag of words)"
        )
        return data_models.PipelinePackage(None, corpus, dictionary,
                                           linked_doc_list,
                                           package.any_analysis_dict,
                                           package.any_inputs_dict,
                                           package.dependencies_dict)
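A minimal standalone sketch of the Gensim bag-of-words conversion this stage performs, assuming only that gensim is installed; the toy token lists and variable names are hypothetical and not part of the pipeline:

from gensim import corpora

# Each inner list is one tokenized document (List[List[str]]), as the stage expects.
token_lists = [["pipeline", "topic", "model", "topic"],
               ["pipeline", "document", "model"]]
dictionary = corpora.Dictionary(token_lists)                      # token -> integer id
corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]   # list of (id, count) pairs
print(len(dictionary))   # vocabulary size
print(corpus[0])         # sparse bag-of-words for the first document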
Example #7
    def perform(self, package: merm_model.PipelinePackage):
        thetype = type(package.linked_document_list)
        if thetype is dict:
            return package

        include_list = package.dependencies_dict["env"].config[
            "ml_instructions"]["filter_group_include"].split(",")
        exclude_list = package.dependencies_dict["env"].config[
            "ml_instructions"]["filter_group_exclude"].split(",")
        included = self.include_docs(include_list,
                                     package.linked_document_list)
        new_linked_doc_list = self.exclude_list(exclude_list, included)

        new_package = merm_model.PipelinePackage(package.model, package.corpus,
                                                 package.dict,
                                                 new_linked_doc_list,
                                                 package.any_analysis,
                                                 package.any_inputs_dict,
                                                 package.dependencies_dict)

        new_package.log_stage("\nInclude filter was: " + str(include_list) +
                              "\nExclude filter was:" + str(exclude_list) +
                              "\nRemaining documents count: " +
                              str(len(new_linked_doc_list)))
        return new_package
Example #8
    def perform(self, package: merm_model.PipelinePackage):

        lda_topics_by_subset_raw = self.load_topics_by_subset(package, "dict")
        lda_topics_toplevel_raw = self.load_top_level_topics(package, "dict")
        word_to_id = self.build_dict(lda_topics_by_subset_raw,
                                     lda_topics_toplevel_raw)

        lda_topics_by_subset_raw_byrow = self.load_topics_by_subset(
            package, "records")
        lda_topics_toplevel_raw_byrow = self.load_top_level_topics(
            package, "records")

        lda_topics_by_subset_raw_byrow_coded = self.code_terms(
            lda_topics_by_subset_raw_byrow, word_to_id)
        lda_topics_toplevel_raw_byrow_coded = self.code_terms(
            lda_topics_toplevel_raw_byrow, word_to_id)

        lda_topics_by_subset_formatted = self.reformat_data(
            lda_topics_by_subset_raw_byrow_coded)
        lda_topics_toplevel_formatted = self.reformat_data(
            lda_topics_toplevel_raw_byrow_coded)

        package.any_analysis_dict[
            "lda_topics_by_subset_formatted"] = lda_topics_by_subset_formatted
        package.any_analysis_dict[
            "lda_topics_toplevel_formatted"] = lda_topics_toplevel_formatted

        return merm_model.PipelinePackage(package.model, package.corpus,
                                          word_to_id,
                                          package.linked_document_list,
                                          package.any_analysis_dict,
                                          package.dependencies_dict)
Example #9
    def perform(self, package: merm_model.PipelinePackage):
        classes = {}
        groupby_count = {}
        numeric_class = []
        corpus = []
        category_count = 0
        env = package.dependencies_dict["env"]
        category_field = self._get_category(env)

        for linked_doc in package.linked_document_list:
            corpus.append(linked_doc.raw)
            if category_field == "group_by":
                category = linked_doc.groupedBy
            else:
                category = linked_doc.space

            if category in classes.keys():
                numeric_class.append(classes[category])
                groupby_count[category] = groupby_count[category] + 1
            else:
                classes[category] = category_count
                numeric_class.append(category_count)
                category_count = category_count + 1
                groupby_count[category] = 1

        package.any_analysis_dict["scikit_category_catalog"] = classes

        vectorizer_type = env.config["ml_instructions"]["vectorizer_type"]
        max_features = env.config.getint("ml_instructions", "rf_max_features")

        if "tfidf" in vectorizer_type.lower():
            vectorizer = TfidfVectorizer(analyzer='word',
                                         token_pattern=r'\w{1,}',
                                         stop_words='english',
                                         max_features=max_features)
        else:
            vectorizer = CountVectorizer(analyzer='word',
                                         ngram_range=(1, 1),
                                         min_df=0,
                                         stop_words='english',
                                         max_features=max_features)
        matrix = vectorizer.fit_transform(corpus)
        feature_names = vectorizer.get_feature_names()

        package.any_inputs_dict["SKX"] = matrix
        package.any_inputs_dict["SKY"] = numeric_class
        package.any_inputs_dict["SKdict"] = feature_names
        package.any_inputs_dict["SKcategories"] = classes

        package.log_stage("\nPrepared corpus. \nVectorizor type:" +
                          vectorizer_type + "\nCategory map " +
                          self.class_log(classes) + "\n Groupby map" + "\n\n" +
                          self.groupby_log(groupby_count))
        new_package = merm_model.PipelinePackage(None, (numeric_class, matrix),
                                                 feature_names,
                                                 package.linked_document_list,
                                                 package.any_analysis_dict,
                                                 package.any_inputs_dict,
                                                 package.dependencies_dict)
        return new_package
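A standalone sketch of the scikit-learn vectorization step used above, assuming scikit-learn is installed; the toy corpus is hypothetical. Newer scikit-learn releases (1.0+) expose get_feature_names_out(), while older releases used get_feature_names() as in the example above:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the pipeline groups documents by space",
        "documents are vectorized before classification"]
vectorizer = TfidfVectorizer(analyzer='word',
                             token_pattern=r'\w{1,}',
                             stop_words='english',
                             max_features=1000)
matrix = vectorizer.fit_transform(docs)          # sparse document-term matrix
print(matrix.shape)
print(list(vectorizer.get_feature_names_out()))  # vocabulary in column order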
Example #10
    def perform(self, package: merm_model.PipelinePackage):
        new_model = self._doLDA(package.corpus)
        new_package = merm_model.PipelinePackage(new_model, package.corpus,
                                                 package.dict,
                                                 package.linked_document_list,
                                                 package.any_analysis_dict,
                                                 package.dependencies_dict)
        log.getLogger().info(new_package.structure())
        return new_package
Example #11
    def perform(self, package: merm_model.PipelinePackage):
        log.getLogger().info("Generating Gensim TF-IDF model")

        model = TfidfModel(package.corpus)  # fit model

        return merm_model.PipelinePackage(model, package.corpus, package.dict,
                                          package.linked_document_list,
                                          package.any_analysis_dict,
                                          package.dependencies_dict)
Example #12
    def perform(self, package: merm_model.PipelinePackage):
        #scipy_csc_matrix = gensim.matutils.corpus2csc(package.corpus)
        log.getLogger().info("STAGE: Seeking to identify similar topics across multiple corpora")
        prepare_data = self._prepare_data(package)
        matching_topics = self._iterate_similar_topics(prepare_data)
        package.any_analysis_dict[package.default_analysis_key()] = matching_topics

        return merm_model.PipelinePackage(package.model,package.corpus,package.dict,
                                          package.linked_document_list,package.any_analysis_dict,
                                          package.any_inputs_dict, package.dependencies_dict)
Example #13
    def perform(self, package: merm_model.PipelinePackage):
        #scipy_csc_matrix = gensim.matutils.corpus2csc(package.corpus)
        log.getLogger().info("STAGE: Running a standard LDA in Gensim")
        topic_count = env.config.getint('ml_instructions', 'gensim_lda_topics')
        log.getLogger().info("Seeking " + str(topic_count) + " topics")
        report_word_count = env.config.getint('ml_instructions', 'gensim_lda_term_per_topic_reporting_count')
        if len(package.dict.token2id) > 50:
            topic_dict = {}
            topic_dict_friendly = {}
            lda_model = gensim.models.ldamodel.LdaModel(corpus=package.corpus,
                                                        id2word=package.dict,
                                                        num_topics=topic_count,
                                                        update_every=1,
                                                        alpha='auto',
                                                        per_word_topics=False,
                                                        iterations=100)


            for index, topic in lda_model.show_topics(formatted=False, num_words=report_word_count):
                #print('Topic: {} \nWords: {}'.format(index, [w[0] for w in topic]))
                words_for_topic = []
                words_for_topic_friendly = []
                for w in topic:
                    msg = str(index) + ":" + str(w)
                    log.getLogger().info(msg)
                    words_for_topic.append((w[0], w[1]))
                    words_for_topic_friendly.append(str(w[0]) + "," + str(w[1]))
                topic_dict[index] = words_for_topic
                topic_dict_friendly[index] = words_for_topic_friendly

            package.any_analysis_dict[lda_analysis_key(package)] = topic_dict
            package.any_analysis_dict[lda_analysis_key(package) + "_friendly"] = topic_dict_friendly
            new_package = merm_model.PipelinePackage(lda_model,package.corpus,package.dict,package.linked_document_list,package.any_analysis_dict, package.any_inputs_dict, package.dependencies_dict)
            new_package.log_stage("Performed Gensim LDA.\nTopic Count: " + str(topic_count) + "\nIterations: " + str(100) + \
                                  "\nalpha = 0 \nUpdate Every: 1\n per_word_topics: False\nReporting on top " + str(report_word_count) + "words in each topic\n")
            return new_package
        else:
            new_package = merm_model.PipelinePackage(None, package.corpus, package.dict,
                                                       package.linked_document_list, [], package.any_inputs_dict,
                                                     package.dependencies_dict)
            new_package.log_stage("Gensim LDA aborted. There were too few tokens")
            return new_package
Example #14
    def perform(self, package: merm_model.PipelinePackage):
        #scipy_csc_matrix = gensim.matutils.corpus2csc(package.corpus)
        log.getLogger().info("STAGE: Running a standard LDA in Gensim")
        topic_count = env.config.getint('ml_instructions', 'gensim_lda_topics')
        log.getLogger().info("Seeking " + str(topic_count) + " topics")
        report_word_count = env.config.getint(
            'ml_instructions', 'gensim_lda_term_per_topic_reporting_count')
        if len(package.dict.token2id) > 50:
            topic_dict = {}

            lda_model = gensim.models.ldamodel.LdaModel(corpus=package.corpus,
                                                        id2word=package.dict,
                                                        num_topics=topic_count,
                                                        update_every=1,
                                                        alpha='auto',
                                                        per_word_topics=False,
                                                        iterations=100)

            for index, topic in lda_model.show_topics(
                    formatted=False, num_words=report_word_count):
                #print('Topic: {} \nWords: {}'.format(index, [w[0] for w in topic]))
                words_for_topic = []
                for w in topic:
                    msg = str(index) + ":" + str(w)
                    log.getLogger().info(msg)
                    words_for_topic.append((w[0], w[1]))
                topic_dict[index] = words_for_topic

            package.any_analysis_dict[
                package.default_analysis_key()] = topic_dict
            new_package = merm_model.PipelinePackage(
                lda_model, package.corpus, package.dict,
                package.linked_document_list, package.any_analysis_dict,
                package.dependencies_dict)
            return new_package
        else:
            new_package = merm_model.PipelinePackage(
                None, package.corpus, package.dict,
                package.linked_document_list, [], package.dependencies_dict)
            return new_package
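For orientation, a minimal end-to-end sketch of the Gensim LDA call these stages wrap, assuming gensim is installed; the tiny corpus is hypothetical and far too small to produce meaningful topics:

import gensim
from gensim import corpora

docs = [["cat", "dog", "pet"], ["python", "code", "pipeline"],
        ["dog", "pet", "walk"], ["code", "pipeline", "stage"]]
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(d) for d in docs]
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=2,
                                            update_every=1,
                                            alpha='auto',
                                            per_word_topics=False,
                                            iterations=10)
# show_topics(formatted=False) yields (topic_id, [(term, weight), ...]) pairs.
for index, topic in lda_model.show_topics(formatted=False, num_words=3):
    print(index, [(w, round(float(p), 3)) for w, p in topic])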
Example #15
    def perform(self, package: merm_model.PipelinePackage):
        mfst = package.dependencies_dict["factory"].PipelineManifest.manifest

        #breaks corpus into subsets
        grouped_doc_package = mfst["SubsetData"].perform(package)

        stop_word_applied_linked_docs = []
        grouped_linked_docs = grouped_doc_package.linked_document_list
        log_string = "\n======================\nSubset Stopword removal.\n"
        for sub_corpus_name_untyped, doc_list in grouped_linked_docs.items():
            sub_corpus_name = str(sub_corpus_name_untyped)

            package_one_group: merm_model.PipelinePackage = merm_model.PipelinePackage(
                package.model, package.corpus, package.dict, doc_list, {},
                package.any_inputs_dict, package.dependencies_dict)
            package_one_group.any_inputs_dict["corpus_name"] = sub_corpus_name
            package_one_group = self._analyze_subset(package_one_group,
                                                     sub_corpus_name, mfst,
                                                     doc_list)
            stop_word_applied_linked_docs = stop_word_applied_linked_docs + package_one_group.linked_document_list
            log_string = log_string + package_one_group.stage_log()

        new_package = merm_model.PipelinePackage(
            package.model, package.corpus, package.dict,
            stop_word_applied_linked_docs, package.any_analysis_dict,
            package.any_inputs_dict, package.dependencies_dict)

        new_package.log_stage(log_string)

        if ("ackage" in type(new_package).__name__):
            log.getLogger().info("STRUCTURE after SubsetData:" +
                                 new_package.structure())
        else:
            log.getLogger().warning(
                "The return type is not of type PipelinePackage. THIS IS BAD PRACTICE :("
            )

        return new_package
Example #16
def _extract(es, pipe, dependencies_dict: Dict):

    provider = dependencies_dict["env"].config["extract_instructions"][
        "provider"]
    msg = "\n\n\n================\nExtracting from " + str(provider)
    log.getLogger().warning(msg)
    if provider == "all":
        _extract_from_all_providers(es, pipe, dependencies_dict)
    elif provider == "none":
        _enter_pipeline(
            merm_model.PipelinePackage(None, None, None, None, None,
                                       dependencies_dict), pipe)
    else:
        _extract_from_one_provider(es, provider, pipe, dependencies_dict)
Example #17
    def perform(self, package: data_models.PipelinePackage):
        df = package.corpus
        log.getLogger().info(
            "Stage: Converting dataframe of documents (previously mapped through DataFrameConvertForPipeline) to tokenized and lemmatized List[List[str]]. Outer List is corpora, inner list is document as bag of words"
        )
        log.getLogger().info("Corpus size: " + str(df.shape))
        corpora_list = self._dfToList(package)
        token_list = package.dependencies_dict["utils"].tokenize(corpora_list)
        merm_tools_linkeddocument_list = package.dependencies_dict[
            "utils"].lemmatize_tokens(
                token_list,
                package.dependencies_dict["utils"].standard_stop_words())
        package = data_models.PipelinePackage(None, None, None,
                                              merm_tools_linkeddocument_list,
                                              package.any_analysis_dict,
                                              package.dependencies_dict)
        return package
Example #18
    def perform(self, package: merm_model.PipelinePackage):
        corpus = []
        for linked_doc in package.linked_document_list:
            corpus.append(linked_doc.raw)

        vectorizer = CountVectorizer(analyzer='word',
                                     ngram_range=(1, 1),
                                     min_df=0,
                                     stop_words='english')
        matrix = vectorizer.fit_transform(corpus)
        feature_names = vectorizer.get_feature_names()

        new_package = merm_model.PipelinePackage(None, matrix, feature_names,
                                                 package.linked_document_list,
                                                 package.any_analysis_dict,
                                                 package.dependencies_dict)
        return new_package
Example #19
def _extract_from_one_provider(es, provider, pipe, dependencies_dict: Dict):
    msg = "\n\n-------------------------\nPROVIDER: " + str(
        provider) + "\n---------------------\n\n"
    log.getLogger().warning(msg)
    ignore_indices = dependencies_dict["env"].config["extract_instructions"][
        "ignore_indices"]
    ignore_indices_list = ignore_indices.split(",")
    indices = es_conn.retrieve_index_registry()

    limit = _dev_limit(dependencies_dict)
    count = 0

    df_per_space_list: List[DataFrame] = []
    for index_name in indices:
        if "@" in index_name:
            continue
        if index_name in ignore_indices_list:
            continue
        #log.getLogger().info("Retrieved " + str(count) + " rows.")
        if count > limit:
            break
        if provider in index_name:
            df = _retrieve_index_content(es, index_name, provider)
            if not df.empty:
                #log.getLogger().debug("Retrieved " + index_name + ": row count " + str(df.shape))
                count = count + df.shape[0]
                df_per_space_list.append(df)

    if len(df_per_space_list) > 0:
        complete_corpus_df = pd.concat(df_per_space_list, ignore_index=True)
        if _dev_bool(dependencies_dict):
            complete_corpus_df = complete_corpus_df.head(limit)
        #log.getLogger().info("\n\nExtraction Complete. Document count = " + str(complete_corpus_df[:5]))
        log.getLogger().info("complete_corpus_df shape: " +
                             str(complete_corpus_df.shape))
        dfu.col_names(complete_corpus_df, "complete_corpus_df")
        msg = "\n\n>>>>>>>>>>>>>>   Entering Pipeline For  " + str(
            provider) + ">>>>>>>>>>\n\n"
        log.getLogger().info(msg)
        analysis_dict = {}
        analysis_dict["provider"] = provider
        _enter_pipeline(
            merm_model.PipelinePackage(None, complete_corpus_df, None, None,
                                       analysis_dict, dependencies_dict), pipe)
Example #20
    def perform(self, package: merm_model.PipelinePackage):
        mfst = package.dependencies_dict["factory"].PipelineManifest.manifest

        #breaks corpus into subsets
        grouped_doc_package = mfst["GroupByESIndex"].perform(package)
        if ("ackage" in type(grouped_doc_package).__name__):
            log.getLogger().info("STRUCTURE after GroupByESIndex:" +
                                 grouped_doc_package.structure())
        else:
            log.getLogger().warning(
                "The return type is not of type PipelinePackage. THIS IS BAD PRACTICE :("
            )

        grouped_linked_docs = grouped_doc_package.linked_document_list

        lda_models_by_group = {}
        lda_corpus_by_group = {}
        lda_dict_by_group = {}
        lda_analysis_by_group = {}

        dict_for_group_processing = {}
        dict_for_group_processing["grouped_linked_docs"] = grouped_linked_docs
        dict_for_group_processing["lda_models_by_group"] = lda_models_by_group
        dict_for_group_processing["lda_corpus_by_group"] = lda_corpus_by_group
        dict_for_group_processing["lda_dict_by_group"] = lda_dict_by_group
        dict_for_group_processing[
            "lda_analysis_by_group"] = lda_analysis_by_group

        for sub_corpus_name, doc_list in grouped_linked_docs.items():
            if len(doc_list) > 100:
                self._analyze_subset(grouped_doc_package,
                                     dict_for_group_processing,
                                     grouped_doc_package.any_analysis_dict,
                                     sub_corpus_name, mfst, doc_list)

        package.any_analysis_dict[
            package.default_analysis_key()] = lda_analysis_by_group
        new_package = merm_model.PipelinePackage(lda_models_by_group,
                                                 lda_corpus_by_group,
                                                 lda_dict_by_group,
                                                 grouped_linked_docs,
                                                 package.any_analysis_dict,
                                                 package.dependencies_dict)
        return new_package
Example #21
    def perform(self, package: merm_model.PipelinePackage):
        #scipy_csc_matrix = gensim.matutils.corpus2csc(package.corpus)
        log.getLogger().info("STAGE: Seeking to reduce topics to those specified in input flatfile")
        csv = package.dependencies_dict["env"].config["local_data"]["confluence_lda_bysubset"]
        df = pd.read_csv(csv)
        df.dropna(inplace=True)
        reduced_topics = df.to_dict(orient="records")
        prepared_reduced_topics = self._prepare_reduced_topics(reduced_topics)

        prepare_data = self._prepare_data(package)
        matching_topics = self._iterate_similar_topics(prepare_data, prepared_reduced_topics)
        package.any_analysis_dict[package.default_analysis_key()] = matching_topics
        return merm_model.PipelinePackage(package.model,
                                          package.corpus,
                                          package.dict,
                                          package.linked_document_list,
                                          package.any_analysis_dict,
                                          package.any_inputs_dict,
                                          package.dependencies_dict)
Example #22
    def perform(self, package: data_models.PipelinePackage):
        df = package.corpus
        log.getLogger().info(
            "Stage: Converting dataframe of documents (previously mapped through DataFrameConvertForPipeline) to tokenized and lemmatized List[List[str]]. Outer List is corpora, inner list is document as bag of words"
        )
        log.getLogger().info("Corpus size: " + str(df.shape))
        corpora_list = self._dfToList(package)
        tokenized_linked_docs = package.dependencies_dict["utils"].tokenize(
            corpora_list)
        #merm_tools_linkeddocument_list = package.dependencies_dict["utils"].lemmatize_tokens(token_list, package.dependencies_dict["utils"].standard_stop_words())
        package = data_models.PipelinePackage(None, None, None,
                                              tokenized_linked_docs,
                                              package.any_analysis_dict,
                                              package.any_inputs_dict,
                                              package.dependencies_dict)
        category_group_tuple = data_models.category_group_tuple(
            package.any_analysis_dict["provider"])
        package.log_stage(
            "Converted a pandas dataframe into our own document list format. \nDocument count is "
            + str(len(tokenized_linked_docs)) + ".\n Category is " +
            category_group_tuple[0] + "\n GroupBy " + category_group_tuple[1])
        return package
Example #23
    def perform(self, package: merm_model.PipelinePackage):
        #scipy_csc_matrix = gensim.matutils.corpus2csc(package.corpus)
        log.getLogger().info("STAGE: Running a standard LDA in Gensim")
        topic_count = env.config.getint('ml_instructions', 'gensim_lda_topics')
        permitted_overlap = env.config.getint('ml_instructions', 'gensim_lda_permitted_term_overlap_across_topics')


        log.getLogger().info("Seeking " + str(topic_count) + " topics")
        report_word_count = env.config.getint('ml_instructions', 'gensim_lda_term_per_topic_reporting_count')
        if len(package.dict.token2id) > 50:


            new_package = self._run_lda(topic_count, report_word_count, permitted_overlap, package)
            new_package.log_stage("Performed Gensim LDA.\nTopic Count: " + str(topic_count) + "\nIterations: " + str(100) + \
                                  "\nalpha = 0 \nUpdate Every: 1\n per_word_topics: False\nReporting on top " + str(report_word_count) + " words in each topic\n")
            return new_package
        else:
            new_package = merm_model.PipelinePackage(None, package.corpus, package.dict,
                                                       package.linked_document_list, [], package.any_inputs_dict,
                                                     package.dependencies_dict)
            new_package.log_stage("Gensim LDA aborted. There were too few tokens")
            return new_package
Example #24
    def perform(self, package: merm_model.PipelinePackage):
        thetype = type(package.linked_document_list)
        if thetype is dict:
            return package
        env = package.dependencies_dict["env"]
        by_space = env.config.getboolean("ml_instructions", "subset_by_space")

        if by_space:
            linked_doc_by_index = self._by_space(package)
        else:
            linked_doc_by_index = self._by_group(package)

        new_package = merm_model.PipelinePackage(package.model, package.corpus,
                                                 package.dict,
                                                 linked_doc_by_index,
                                                 package.any_analysis,
                                                 package.any_inputs_dict,
                                                 package.dependencies_dict)

        new_package.log_stage(
            "Divided the entire corpus into groups. The groups created are " +
            str(linked_doc_by_index.keys()))
        return new_package
Example #25
def initiate_run():
    try:
        log.getLogger().info(env.printEnvironment())
        env.init()
        log.getLogger().info(env.printConf())
        continue_run = True

        dependencies_dict = {}
        dependencies_dict["env"] = env
        dependencies_dict["factory"] = factory
        dependencies_dict["es_extract"] = es_extract
        dependencies_dict["pipe_process"] = pipe_process
        dependencies_dict["utils"] = utils
        dependencies_dict["dfutils"] = dfutils
        dependencies_dict["colutils"] = colutils
        dependencies_dict["log"] = log
        dependencies_dict["es_conn"] = es_conn
        dependencies_dict["ingestor"] = ingestor
        dependencies_dict["syntax"] = syntax

        log.getLogger().info("Dependencies: ")
        for k, v in dependencies_dict.items():
            log.getLogger().info(str(k) + " : " + str(v))
        while continue_run:
            package = merm_model.PipelinePackage(None, None, None, None, {}, {}, dependencies_dict)
            package.any_analysis_dict["stage_log"] = ""
            pipeline.run_pipeline(package)

            continue_run = env.continue_run()
            if not env.run_forever():
                break
        log.getLogger().info("#################### Run Completed :) #################### ")

    except Exception as e:
        msg = str(e)
        log.getLogger().error(env.print_traceback())
        log.getLogger().error(msg)
Example #26
    def _analyze_subset(self, grouped_doc_package, dict_for_group_processing,
                        any_analysis_dict, sub_corpus_name, manifest,
                        doc_list):
        package_one_group = merm_model.PipelinePackage(
            grouped_doc_package.model, grouped_doc_package.corpus,
            grouped_doc_package.dict, doc_list, any_analysis_dict,
            grouped_doc_package.dependencies_dict)

        package_one_group = manifest["StopWordRemoval"].perform(
            package_one_group)
        package_one_group = manifest["ListOfListsToGensimCorpora"].perform(
            package_one_group)
        package_one_group = manifest["GensimLDA"].perform(package_one_group)

        dict_for_group_processing["lda_models_by_group"][
            sub_corpus_name] = package_one_group.model
        dict_for_group_processing["lda_corpus_by_group"][
            sub_corpus_name] = package_one_group.corpus
        dict_for_group_processing["lda_dict_by_group"][
            sub_corpus_name] = package_one_group.dict
        dict_for_group_processing["lda_analysis_by_group"][
            sub_corpus_name] = package_one_group.any_analysis_dict
        overlap_dict = self._topic_overlap(
            dict_for_group_processing["lda_analysis_by_group"]
            [sub_corpus_name])
        stop_list = self._dynamic_stop_words(
            overlap_dict, grouped_doc_package.dependencies_dict)
        if len(stop_list) > 4:
            msg = "\n\n=============\nWill try again while removing " + str(
                stop_list) + " from " + sub_corpus_name
            log.getLogger().info(msg)
            any_analysis_dict["stop_words"] = stop_list
            package_one_group = self._analyze_subset(
                grouped_doc_package, dict_for_group_processing,
                any_analysis_dict, sub_corpus_name, manifest, doc_list)
        return package_one_group