Example #1
def getRelationDetailByHDP(sentence_list):
    # obtain the relation results via clustering
    corpus = []
    pairs_all, position_all = segmentor.segListWithNerTag(sentence_list)
    words_list = []
    for pairs in pairs_all:
        word_list = []
        for pair in pairs:
            if "v" in pair.flag or "n" in pair.flag:
                word_list.append(pair.word)
        words_list.append(word_list)
    # words_list = list(map(lambda pairs: map(lambda x: x.word, pairs), pairs_all))
    from gensim import corpora
    dictionary = corpora.Dictionary(words_list)
    for words in words_list:
        corpus.append(dictionary.doc2bow(words))
    from gensim.models import HdpModel
    hdp = HdpModel(corpus, dictionary)
    topic_strings = hdp.print_topics()
    words = {}
    for topic in topic_strings:
        word_details = str(topic[1]).split(" + ")
        for word_detail in word_details:
            word = word_detail[word_detail.index("*") + 1:]
            num = float(word_detail[:word_detail.index("*")])
            if word not in words:
                words[word] = num
            else:
                words[word] += num
    words = sorted(words.items(), key=lambda d: d[1])
    return words  # then pick the high-frequency verbs/nouns from the syntactic parse
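The string parsing above is brittle across gensim versions. A minimal alternative sketch, assuming a recent gensim where show_topics(formatted=False) returns (topic_id, [(word, weight), ...]) pairs, performing the same aggregation without string slicing:

# Sketch: same aggregation as above, without parsing the topic strings.
weights = {}
for topic_id, word_weights in hdp.show_topics(num_topics=20, formatted=False):
    for word, weight in word_weights:
        weights[word] = weights.get(word, 0.0) + weight
words = sorted(weights.items(), key=lambda d: d[1])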
Example #2
    def fit(self, df_original, topics):
        #Create Dictionary
        self.dictionary = self._create_dictionary(df_original)

        #Create corpus
        self.corpus = self._create_corpus(df_original)

        #Train Model
        hdp = HdpModel(self.corpus, id2word=self.dictionary, T=topics)
        self.model = hdp.suggested_lda_model()
        feature_vecs = []
        for i in range(len(self.corpus)):
            top_topics = self.model.get_document_topics(
                self.corpus[i], minimum_probability=0.0)
            topic_vec = [0] * topics
            for j in top_topics:
                index = j[0]
                topic_vec[index] = j[1]
            feature_vecs.append(topic_vec)

        df_lda_reduced = pd.DataFrame(feature_vecs,
                                      columns=list(range(len(
                                          feature_vecs[0]))))
        df_lda_reduced.insert(0, 'Name', list(df_original['Name'].values),
                              False)
        df_lda_reduced = df_lda_reduced.sort_values(by=['Name'])
        return df_lda_reduced
Example #3
def hierarchical_dirichlet_process(corpus, num_topics, id2word):
    ''' HIERARCHICAL DIRICHLET PROCESS
    Advantage of HDP: fully unsupervised; it infers a suitable number of
    topics through posterior inference.
    '''
    print('Hierarchical Dirichlet Process')
    hdp_model = HdpModel(corpus=corpus, id2word=id2word)
    hdp_model.show_topics()
    hdp_topic = hdp_model.show_topics(formatted=False)
    return hdp_model
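The docstring's point deserves a concrete reading: the trained model does not expose a single "ideal" topic count. A hedged sketch of a common heuristic, converting the posterior to LDA-style weights via hdp_to_lda() and counting the topics with non-negligible mass (the threshold is a judgment call):

import numpy as np

alpha, beta = hdp_model.hdp_to_lda()       # stick weights, topic-word matrix
active_topics = int(np.sum(alpha > 1e-3))  # topics with non-negligible mass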
Example #4
def train_hdp_model(corpus, dictionary, chunksize):
    print('HDP model')
    model = HdpModel(corpus=corpus, id2word=dictionary, chunksize=chunksize, random_state=config.SEED)
    # To get the topic words from the model
    topics = []
    for topic_id, topic in model.show_topics(num_topics=10, formatted=False):
        topic = [word for word, _ in topic]
        topics.append(topic)
    return model
Example #5
    def topicsHDP(self, num_topics=-1, topn=20):
        # HdpModel(corpus, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None)
        hdp = HdpModel(corpus=self.corpus, id2word=self.id2word)

        # show_topics(topics=20, topn=20, log=False, formatted=True)
        # Prints the topn most probable words for the given number of topics.
        # Set topics=-1 to print all topics; formatted=True returns the topics
        # as a list of strings, formatted=False as lists of (weight, word) pairs.

        return hdp.show_topics(topics=num_topics, topn=topn, formatted=False)
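The topics=/topn= keywords in the example above come from an older gensim; in current releases the signature is show_topics(num_topics=..., num_words=...). An equivalent call under the newer API (adjust to your installed version):

return hdp.show_topics(num_topics=num_topics, num_words=topn, formatted=False)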
Example #7
def create_hdp(num_topic, dictionary):
    print("__________________________Create HDP_________________________")
    corpus, dic = generate_corpus(dictionary)
    hdpmodel = HdpModel(corpus=corpus, id2word=dic)
    topics = hdpmodel.print_topics(num_topics=num_topic, num_words=7)
    # see list of topics
    for topic in topics:
        print(topic)

    return hdpmodel
Example #8
    def get_topics(self, corpus, vocabulary, num_words=10):

        hdpmodel = HdpModel(corpus=corpus, id2word=vocabulary)
        # The docs say num_topics=-1 returns all topics, ordered by
        # significance (num_words is optional). On some gensim versions,
        # however, -1 yields an empty list instead:
        topics = hdpmodel.show_topics(formatted=False,
                                      num_words=num_words,
                                      num_topics=-1)
        #print(hdpmodel.get_topics().shape)
        return topics
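Given the version-dependent behavior the comments describe, a defensive variant (a sketch) falls back to the model's full truncated topic count, read off get_topics(), whenever -1 comes back empty:

topics = hdpmodel.show_topics(formatted=False, num_words=num_words,
                              num_topics=-1)
if not topics:  # some gensim versions return [] for num_topics=-1
    topics = hdpmodel.show_topics(formatted=False, num_words=num_words,
                                  num_topics=hdpmodel.get_topics().shape[0])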
Example #9
def train_topics(args):
    print(f"Arguments: {args}")

    nlp = spacy.load("en", disable=["parser", "ner"])

    files = args["text"]
    lines = extract_stories(files)

    def tokenize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        allowed_postags = set(allowed_postags)
        docs = nlp.pipe(texts)
        text_tokens = []
        for doc in docs:
            tokens = [
                token.lemma_ for token in doc if token.pos_ in allowed_postags
                and not token.is_punct and not token.is_stop
            ]
            text_tokens.append(tokens)
        return text_tokens

    docs = tokenize(lines, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    print("Preprocessed Docs")

    bigram = gensim.models.Phrases(docs, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[docs], threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    docs = make_bigrams(docs)
    docs = make_trigrams(docs)

    print("Create Dictionary")
    # Create Dictionary
    corpus_dict = corpora.Dictionary(docs)
    # Create Corpus
    texts = docs
    # Term Document Frequency
    corpus = [corpus_dict.doc2bow(text) for text in texts]

    print("Train Model")
    hdp = HdpModel(corpus, corpus_dict)

    print(hdp.print_topics(num_topics=50, num_words=20))

    hdp.save(args["target"])
Example #10
class HDPModel(Model, Transformer):
    def __init__(self, corpus=None, **kwargs):
        self._m = HdpModel(corpus, **kwargs)

    def fit(self, corpus):
        self._m.update(corpus)

    def transform(self, corpus):
        return self._m[corpus]

    @property
    def inst(self):
        return self._m
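A hypothetical usage sketch for this wrapper (corpus, dictionary, and more_docs are placeholder names; Model and Transformer come from the surrounding project):

model = HDPModel(corpus, id2word=dictionary)
doc_topics = model.transform(corpus[0])  # sparse [(topic_id, weight), ...]
model.fit(more_docs)                     # online update via HdpModel.update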
Example #11
    def hdpmodel(self, corpus_t, save=False, savename=None):
        """

        :param corpus_t:
        :param save:
        :param savename:
        :return:
        """
        print('using Hierarchical Dirichlet Process model...')
        hdpmodel = HdpModel(corpus=corpus_t, id2word=self.word_dict)
        if save:
            print('saving hdp model to file: {}'.format(savename))
            hdpmodel.save(savename)
        return hdpmodel
Example #12
    def load(self, path='default'):
        """
        :param path: the path of trained model.
        :return:
        """
        if path == 'default':
            path = 'model'
        file_list = os.listdir(path)
        for file in file_list:
            if file.endswith('.model'):
                self.model_name = file.split('.')[0]
        if self.model_name == 'lda':
            self.model = LdaModel.load(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model = LsiModel.load(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model = HdpModel.load(str(path + '/hdp.model'))

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.model.get_topics().shape[0]
        else:
            self.num_topics = self.model.num_topics
        #self.iterations = self.model.iterations

        with open(path + '/original_data.pickle', 'rb') as f:
            self.original_data = pickle.load(f)
        with open(path + '/text.pickle', 'rb') as f:
            self.text = pickle.load(f)
        with open(path + '/token.pickle', 'rb') as f:
            self.token = pickle.load(f)
        with open(path + '/corpus.pickle', 'rb') as f:
            self.corpus = pickle.load(f)

        path = path + '/result'
        with open(path + '/topic_key.pickle', 'rb') as f:
            self.topic_key = pickle.load(f)
        with open(path + '/doc_topic.pickle', 'rb') as f:
            self.doc_topic = pickle.load(f)
        with open(path + '/topic_doc.pickle', 'rb') as f:
            self.topic_doc = pickle.load(f)
        with open(path + '/topic_sent.pickle', 'rb') as f:
            self.topic_sent = pickle.load(f)

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.topic_doc.shape[0]
        else:
            self.num_topics = self.model.num_topics
Example #13
    def model_pcs(self, model_name, all_mashup_num, all_api_num):
        # indexing by 0..all_num actually maps documents by their real ids!
        # hdp output format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
        if model_name == 'HDP':
            self.model = HdpModel(self.mashup_dow + self.api_dow, self.dct)
            self.num_topics = self.model.get_topics().shape[0]
        elif model_name == 'TF_IDF':
            self.model = TfidfModel(self.mashup_dow + self.api_dow)
            self.num_topics = len(self.dct)
        else:
            raise ValueError('wrong gensim_model name!')

        mashup_hdp_features = [
            self.model[mashup_info] for mashup_info in self.mashup_dow
        ]
        api_hdp_features = [self.model[api_info] for api_info in self.api_dow]

        self._mashup_hdp_features = np.zeros((all_mashup_num, self.num_topics))
        self._api_hdp_features = np.zeros((all_api_num, self.num_topics))
        for i in range(all_mashup_num):
            for index, value in mashup_hdp_features[i]:
                self._mashup_hdp_features[i][index] = value
        for i in range(all_api_num):
            for index, value in api_hdp_features[i]:
                self._api_hdp_features[i][index] = value
        return self._mashup_hdp_features, self._api_hdp_features
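The densifying double loop above is equivalent to gensim's corpus2dense helper; a sketch reusing the example's names (num_topics and all_mashup_num are the same values used in the loops):

from gensim import matutils

# corpus2dense returns a (num_terms x num_docs) array; transposing gives
# one row per mashup and one column per topic.
mashup_dense = matutils.corpus2dense(mashup_hdp_features,
                                     num_terms=num_topics,
                                     num_docs=all_mashup_num).T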
Example #14
    def run(self,
            kappa=1.0,
            tau=64.0,
            K=15,
            T=150,
            alpha=1,
            gamma=1,
            eta=0.01,
            scale=1.0,
            var_converge=0.0001,
            outputdir=None,
            random_state=0,
            *args,
            **kwargs):

        self.model = HdpModel(corpus=self.corpus,
                              id2word=self.dictionary,
                              kappa=kappa,
                              tau=tau,
                              K=K,
                              T=T,
                              alpha=alpha,
                              gamma=gamma,
                              eta=eta,
                              scale=scale,
                              var_converge=var_converge,
                              outputdir=outputdir,
                              random_state=random_state,
                              *args,
                              **kwargs)

        print("Done!\nCheckout hdp.model")
Example #15
def build_lda_models(course_corpus, course_dictionary, mapping, course_texts):
    # ==== Train Unsupervised LDA ====
    lda_model = LdaModel(corpus=course_corpus, id2word=course_dictionary)

    # ==== Train Unsupervised HDP-LDA ====
    hdp_model = HdpModel(corpus=course_corpus, id2word=course_dictionary)

    # ==== Train Author Topic Model ====
    author_to_doc = {}  # author topic LDA (authors are modules,lessons,items)
    for author_type in ["modules", "lessons", "items"]:
        entity_to_doc = mapping[author_type]
        for entity_name, entity_docs in entity_to_doc.items():
            author_to_doc["{}: {}".format(author_type[0].capitalize(),
                                          entity_name)] = entity_docs
    at_model = AuthorTopicModel(corpus=course_corpus,
                                id2word=course_dictionary,
                                author2doc=author_to_doc)

    # ==== Train Labeled LDA ====
    # explicitly supervised, labeled LDA
    llda_alpha = 0.01
    llda_beta = 0.001
    llda_iterations = 50
    llda_labels = []
    llda_corpus = []
    labelset = set()
    for course_text_id in range(0, len(course_texts)):
        doc_labels = []
        # get module label name
        for module_name, doc_vec in mapping["modules"].items():
            if course_text_id in doc_vec:
                doc_labels.append("M: {}".format(module_name))
                break

        # get lesson label name
        for lesson_name, doc_vec in mapping["lessons"].items():
            if course_text_id in doc_vec:
                doc_labels.append("L: {}".format(lesson_name))
                break

        for item_name, doc_vec in mapping["items"].items():
            if course_text_id in doc_vec:
                doc_labels.append("I: {}".format(item_name))
                break

        llda_labels.append(doc_labels)
        llda_corpus.append(course_texts[course_text_id])
        labelset = labelset.union(doc_labels)

    llda_model = LLDA(llda_alpha, llda_beta, K=len(labelset))  # K: one topic per distinct label
    llda_model.set_corpus(llda_corpus, llda_labels)
    llda_model.train(iteration=llda_iterations)

    # phi = llda.phi()
    # for k, label in enumerate(labelset):
    #     print ("\n-- label %d : %s" % (k + 1, label))
    #     for w in argsort(-phi[k + 1])[:10]:
    #         print("%s: %.4f" % (llda.vocas[w], phi[k + 1,w]))
    return lda_model, hdp_model, at_model, llda_model, llda_labels
Example #16
    def run_hdp(self, modelId, **kwargs):
        print(kwargs)
        # `self` is assumed to be iterable as a bag-of-words corpus here;
        # kwargs must be unpacked rather than passed as a positional dict.
        hdpModel = HdpModel(self, id2word=self.dict, **kwargs)
        hdpData = {modelId:
                   {'model': hdpModel,
                    'args': kwargs}}

        self.hdpModels.append(hdpData)
Example #17
def test_hdp():
    """Trains a HDP model and tests the html outputs."""
    corpus, dictionary = get_corpus_dictionary()
    hdp = HdpModel(corpus, dictionary.id2token)

    data = gensim_models.prepare(hdp, corpus, dictionary)
    pyLDAvis.save_html(data, 'index_hdp.html')
    os.remove('index_hdp.html')
Example #18
    def runModels(self, number_of_topics, corpus, dictionary, start, end):

        #do hdp model

        hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

        hdpmodel.print_topics(num_topics=int(number_of_topics), num_words=10)
        hdptopics = hdpmodel.show_topics(num_topics=int(number_of_topics))

        #   result_dict=addTotalTermResults(hdptopics)

        #add results to total kept in a list
        #   addToResults(result_dict)

        #output results
        self.printResults(number_of_topics, hdptopics, 'hdp', start, end)

        # do lda model
        ldamodel = LdaModel(corpus=corpus,
                            num_topics=int(number_of_topics),
                            id2word=dictionary,
                            random_state=100,
                            update_every=1,
                            chunksize=100,
                            passes=10,
                            alpha='auto',
                            per_word_topics=True)

        ldamodel.save('lda' + str(number_of_topics) + '.model')
        ldatopics = ldamodel.show_topics(num_topics=int(number_of_topics))

        #   result_dict=addTotalTermResults(ldatopics)
        #   addToResults(result_dict)
        self.printResults(number_of_topics, ldatopics, 'lda', start, end)

        visualisation = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

        location = os.path.join(pn, 'topic_model_results')

        #visualize outputs in html
        pyLDAvis.save_html(
            visualisation,
            os.path.join(
                location, 'LDA_Visualization' + str(number_of_topics) + "_" +
                start + "_" + end + '.html'))
Example #19
    def gensimTopicModelingAnalysis(self, n):
        files = glob.glob(
            "/Users/advaitbalaji/Downloads/IslandAnalysis/Atleast2/*.txt")
        files = sorted(
            files,
            key=lambda x: int(
                x.split(
                    '/Users/advaitbalaji/Downloads/IslandAnalysis/Atleast2/Cluster'
                )[1].split('_')[0]))
        with open("/Users/advaitbalaji/Desktop/ListofSortedClusters.txt",
                  "w") as of:
            for f in files:
                of.write(f + "\n")
        texts, clusters = n.readMultipleFileLineWise(files)
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
        print(hdpmodel.show_topics())
Example #20
    def build_hdp(self):
        """Builds an HDP model of the corpus.
        """

        print("building HDP model...")
        start = time.time()
        self.hdp = HdpModel(corpus=self.get_bows(), id2word=self.dict)
        end = time.time()
        print("HDP finished! {:.2f} seconds".format(end - start))
Example #21
def hdp(corpus, dictionary, docs, score=False):
    print('Training for {} documents ......'.format(len(corpus)))
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    if score:
        print('calculating coherence score for {} documents ......'.format(len(docs)))
        coherence_model = CoherenceModel(model=hdpmodel, texts=docs, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        print('\nCoherence Score: ', coherence_score)
        return hdpmodel, coherence_score
    return hdpmodel
Example #22
def comparison(texts):
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsimodel = LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LSI Model output')
    print(lsimodel.show_topics())

    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    print('hdp model output')
    print(hdpmodel.show_topics())

    ldamodel = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LDA Model output')
    print(ldamodel.show_topics())


    pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

    lsitopics = [[word for word, prob in topic] for topicid, topic in lsimodel.show_topics(formatted=False)]

    hdptopics = [[word for word, prob in topic] for topicid, topic in hdpmodel.show_topics(formatted=False)]

    ldatopics = [[word for word, prob in topic] for topicid, topic in ldamodel.show_topics(formatted=False)]

    lsi_coherence = CoherenceModel(topics=lsitopics[:10], texts=texts, dictionary=dictionary,
                                   window_size=10).get_coherence()

    hdp_coherence = CoherenceModel(topics=hdptopics[:10], texts=texts, dictionary=dictionary,
                                   window_size=10).get_coherence()

    lda_coherence = CoherenceModel(topics=ldatopics, texts=texts, dictionary=dictionary, window_size=10).get_coherence()

    def evaluate_bar_graph(coherences, indices):
        assert len(coherences) == len(indices)
        n = len(coherences)
        x = np.arange(n)
        plt.bar(x, coherences, width=0.2, tick_label=indices, align='center')
        plt.xlabel('Models')
        plt.ylabel('Coherence Value')
        plt.show()

    evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence], ['LSI', 'HDP', 'LDA'])
Example #23
    def createHDP(self, fileName='', modelName=''):
        '''
        fileName -> file for the dictionary (.dict) and corpus (.mm) files
        modelName -> model name for the HDP model to save to disk
        '''
        if fileName == '':
            fileName = self.__fileName
            
        if modelName == '':
            modelName = self.__fileName
        
        dictionary = corpora.Dictionary.load(self.__destination + fileName + '.dict')
        mm = corpora.MmCorpus(self.__destination + fileName + '.mm')

        hdp = HdpModel(corpus=mm, id2word=dictionary)
        hdp.save(self.__destination + modelName + '.hdp')
        print(hdp)
        print('Created HDP model %s' % self.__fileName)
Example #24
    def createHDP(self, fileName='', modelName=''):
        '''
        fileName -> file for the dictionary (.dict) and corpus (.mm) files
        modelName -> model name for the HDP model to save to disk
        '''
        if fileName == '':
            fileName = self.__fileName

        if modelName == '':
            modelName = self.__fileName

        dictionary = corpora.Dictionary.load(self.__destination + fileName + '.dict')
        mm = corpora.MmCorpus(self.__destination + fileName + '.mm')

        hdp = HdpModel(corpus=mm, id2word=dictionary)
        hdp.save(self.__destination + modelName + '.hdp')
        print(hdp)
        print('Created HDP model %s' % self.__fileName)
Example #25
    def train(self, path, num_topics=20, iterations=1000, n_gram=True, lemmatization=True, stop_words=True, tfidf=True,
              model='lda'):
        """
        Train the topic cluster model.
        Input value: data: pd.DataFrame format ['id','title','content','summary']
                     num_topics: (int) the number of topics
                     iterations: (int) total number of iteration times
        example:
        >>> lda = LDA_Model
        >>> lda.train(text)
        """
        data = load_data(str(path + '/output/data.csv'))
        self.original_data = data
        self.text = list(data['content'])
        self.num_topics = num_topics
        self.iterations = iterations
        self.model_name = model

        print('preprocessing...')
        self.token = self._preprocess(self.text, lemma=lemmatization, stop_words=stop_words)

        self.id2word = Dictionary(self.token)
        self.corpus = [self.id2word.doc2bow(text) for text in self.token]
        if tfidf:
            print('calculate tfidf...')
            tfidf_model = TfidfModel(self.corpus)
            self.corpus = tfidf_model[self.corpus]

        if model == 'lda':
            self.model = LdaModel(corpus=self.corpus, id2word=self.id2word, num_topics=self.num_topics,
                                  iterations=self.iterations)
        if model == 'lsi':
            self.model = LsiModel(corpus=self.corpus, id2word=self.id2word, num_topics=self.num_topics)
        if model == 'hdp':
            self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
            self.num_topics = self.model.get_topics().shape[0]

        self.topic_key = pd.DataFrame(self._topic_key(), columns=['topic_id', 'key_words'])
        self.doc_topic = self._doc_topic()
        self.topic_doc = pd.DataFrame(self._topic_doc(), columns=['topic_id', 'document_id'])
        self.topic_sent = pd.DataFrame(self._readable_topic(), columns=['topic_id', 'most relative sentence'])
Example #26
def add_topics(args):
    print(args)

    nlp = spacy.load("en", disable=["parser", "ner"])

    def tokenize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        allowed_postags = set(allowed_postags)
        docs = nlp.pipe(texts)
        text_tokens = []
        for doc in docs:
            tokens = [token.lemma_ for token in doc if
                      token.pos_ in allowed_postags and not token.is_punct and not token.is_stop]
            text_tokens.append(tokens)
        return text_tokens

    model = HdpModel.load(args["topic_model"])
    corpus_dict = model.id2word

    topics = model.show_topics(num_topics=args["num_topics"], num_words=args["num_terms"], log=False, formatted=False)

    topics_to_save = []
    for topic in topics:
        topic_dict = {}
        topic_terms = ", ".join([t[0] for t in topic[1]])
        topic_dict["topic_id"] = int(topic[0])
        topic_dict["terms"] = topic_terms

        topics_to_save.append(topic_dict)

    database = args["database"]
    dataset_db = f"sqlite:///{database}"
    with dataset.connect(dataset_db, engine_kwargs=engine_kwargs) as db:
        db.create_table("corpus_topics")

        topic_ids = db["corpus_topics"].insert_many(topics_to_save)
        print(topic_ids)

        print(topics_to_save)

        batch = []
        for sentence in db['sentence']:
            batch.append(sentence)

            if len(batch) == args["batch_size"]:
                insert_corpus_sentence_links(batch, corpus_dict, db, model, tokenize)
                batch = []

        if len(batch) > 0:
            insert_corpus_sentence_links(batch, corpus_dict, db, model, tokenize)

        db["corpus_topics_sentences"].create_index(['sentence_id'])
        db["corpus_topics_sentences"].create_index(['topic_id'])
Example #27
    def model_pcs(self, model_name, LDA_topic_num=None):
        # hdp output format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
        if self.mashup_only:
            if self.strict_train:
                train_corpus = self.train_mashup_dow
            else:
                train_corpus = self.mashup_dow
        else:
            if self.strict_train:
                train_corpus = self.train_mashup_dow + self.train_api_dow
            else:
                train_corpus = self.mashup_dow + self.api_dow

        if model_name == 'HDP':
            self.model = HdpModel(train_corpus, self.dct)
            self.num_topics = self.model.get_topics().shape[0]
            print('num_topics', self.num_topics)
        elif model_name == 'TF_IDF':
            self.model = TfidfModel(train_corpus)
            self.num_topics = len(self.dct)
        elif model_name == 'LDA':
            if LDA_topic_num is None:
                self.model = LdaModel(train_corpus)
            else:
                self.model = LdaModel(train_corpus, num_topics=LDA_topic_num)
            self.num_topics = self.model.get_topics().shape[0]

        else:
            raise ValueError('wrong gensim_model name!')

        # run the texts through the model, then convert the output to a
        # standard np array (one value for every topic)
        # print(self.mashup_dow)
        self.mashup_features = [self.model[mashup_info] for mashup_info in self.mashup_dow]  # features of each mashup and api
        # print(self.mashup_features)
        print('self.mashup_features, num:', len(self.mashup_features))
        zero_num1 = sum([1 if len(mashup_feature) == 0 else 0 for mashup_feature in self.mashup_features])
        print('zero_num1', zero_num1)
        for i in range(len(self.mashup_features)):
            if len(self.mashup_features[i]) == 0:
                print(self.mashup_dow[i])

        self.api_features = [self.model[api_info] for api_info in self.api_dow]
        # print('when model-pcs,len of mashup_features and api_features:{},{}'.format(len(mashup_features),len(api_features)))
        self._mashup_features = np.zeros((meta_data.mashup_num, self.num_topics))
        self._api_features = np.zeros((meta_data.api_num, self.num_topics))
        for i in range(meta_data.mashup_num):  # only some dims are set; convert to a regular array
            for index, value in self.mashup_features[i]:
                self._mashup_features[i][index] = value
        for i in range(meta_data.api_num):
            for index, value in self.api_features[i]:
                self._api_features[i][index] = value
        return self._mashup_features, self._api_features
Example #28
    def set_model(self, lang: str, data_version: int,
                  dictionary_version: float, model_version: str,
                  param_name: str, param_version: int, model_file_path: str,
                  language_processed_data: list):
        logging.info("---- Creating HDP model")
        # Index the dictionary once: this forces gensim to build the lazy
        # id2token mapping that is handed to HdpModel below.
        temp = self.essentials.dictionary[0]
        model = HdpModel(corpus=self.essentials.corpus,
                         id2word=self.essentials.dictionary.id2token)
        # , alpha="symmetric",
        # eta=self.beta, chunksize=self.chunk_size)
        model.save(model_file_path)
        self.model = model
        logging.info("---- HDP model is created")

        metrics = self.get_model_evaluation_metrics(language_processed_data)
        parameters = self.get_model_parameters()
        self.write_model_evaluation_metrics(lang, data_version,
                                            dictionary_version, model_version,
                                            param_name, param_version, metrics,
                                            parameters)
        return
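The seemingly unused temp = ...dictionary[0] line above is deliberate: gensim's Dictionary builds its id2token map lazily, and a single indexed access forces the build before id2token is handed to HdpModel. A self-contained illustration:

from gensim.corpora import Dictionary

d = Dictionary([["a", "b"], ["b", "c"]])
assert d.id2token == {}  # empty until the first indexed access
_ = d[0]                 # any lookup by id triggers construction
assert len(d.id2token) == len(d.token2id)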
Example #29
def try_news_cluster():

    docs = feed_doc()
    df_threshold_lower = 50
    df_threshold_upper = 500
    dictionary = corpora.Dictionary(doc for doc in docs)
    print('dictionary ready')
    low_df = [
        tokenid for tokenid, docfreq in dictionary.dfs.items()
        if docfreq <= df_threshold_lower
    ]
    high_df = [
        tokenid for tokenid, docfreq in dictionary.dfs.items()
        if docfreq > df_threshold_upper
    ]
    dictionary.filter_tokens(low_df + high_df)
    dictionary.compactify()
    corpus = [dictionary.doc2bow(doc) for doc in feed_doc()]
    print('corpus ready')
    hdp = HdpModel(corpus, dictionary)
    for topic in hdp.print_topics(num_topics=50, num_words=20):
        print(topic)
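The manual document-frequency pruning above can also be written with Dictionary.filter_extremes; a sketch with the same thresholds (note that no_below is an absolute count while no_above is a fraction of documents, and that filter_extremes compacts ids itself):

dictionary.filter_extremes(no_below=df_threshold_lower + 1,
                           no_above=df_threshold_upper / dictionary.num_docs,
                           keep_n=None)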
Example #30
    def stream_topic_model(self,
                           topic: Topic,
                           dictionary: corpora.Dictionary = None,
                           corpus: IndexedCorpus = None,
                           num_topics=20,
                           max_topics_per_doc=5):
        # load dictionary and corpus, if necessary
        if not dictionary:
            dictionary = self.load_dictionary()
            logger.warning(
                "the default dictionary was loaded from file. "
                "You should keep an instance in memory instead of calling this in a loop..."
            )
        if not corpus:
            corpus = JsonLinesCorpus(self.file_corpus)
            logger.warning(
                "the default corpus was loaded from file. You should provide a "
                "reduced corpus to increase performance (see corpus2corpus)")
        # build the model
        logger.info(
            "building a topic model with {} topics for {} documents in topic '{}'"
            .format(num_topics, len(corpus), topic.topic_id))
        t0 = time.time()
        if self.model == "lda":
            model = LdaMulticore(corpus,
                                 id2word=dictionary.id2token,
                                 num_topics=num_topics,
                                 passes=2,
                                 iterations=50,
                                 chunksize=2000,
                                 workers=self.n_threads)
        elif self.model == "hdp":
            # T = overall topic limit, K = max topics per document
            model = HdpModel(corpus,
                             id2word=dictionary.id2token,
                             T=num_topics,
                             K=max_topics_per_doc)
        else:
            raise ValueError("Unknown model identifier '{}'".format(
                self.model))
        t1 = time.time()

        # serialize
        logger.info(
            "building the model took {:.1f} s. Serializing model...".format(
                t1 - t0))
        output_path = self._get_model_path(topic)
        with util.open_by_ext(output_path, 'wb') as fp:
            pickle.dump(model, fp, protocol=4)
            logger.info(
                "model dump finished, took {:.1f} s".format(time.time() - t1))
Example #31
def hierarchical_dirichlet_process_topic_extraction():
    """
    Function performs topic extraction on Tweets using the Gensim HDP model.

    :return: None.
    """
    from gensim.models import HdpModel

    # LDA can only use raw term counts because it is a probabilistic graphical model.
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=1000,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(slo_feature_series)
    tf_feature_names = tf_vectorizer.get_feature_names()

    log.info(
        "\n.fit_transform - Learn the vocabulary dictionary and return term-document matrix."
    )
    log.info(f"{tf}\n")
    log.info(
        "\n.get_feature_names - Array mapping from feature integer indices to feature name"
    )
    log.info(f"{tf_feature_names}\n")

    # corpus/dictionary are not defined in this snippet; a reasonable
    # reconstruction reuses the CountVectorizer output above:
    from gensim import matutils
    corpus = matutils.Sparse2Corpus(tf, documents_columns=False)
    dictionary = dict(enumerate(tf_feature_names))

    # Train the HDP model.
    hdp = HdpModel(corpus, dictionary)
    time.sleep(3)

    # # For use as wrapper with Scikit-Learn API.
    # model = HdpTransformer(id2word=dictionary)
    # distribution = model.fit_transform(corpus)

    # Display the top words for each topic.
    topic_info = hdp.print_topics(num_topics=20, num_words=10)

    for topic in topic_info:
        print(topic)
Example #32
    def get_num_topics(self):

        self.rev_train['title'] = self.strip_newline(self.rev_train.title)
        self.rev_test['title'] = self.strip_newline(self.rev_test.title)
        # rev_train.text[21:22].values

        words_tr = list(self.sent_to_words(self.rev_train.title))
        words_te = list(self.sent_to_words(self.rev_test.title))

        words_tr = self.remove_stopwords(words_tr)

        bigram_tr, trigram_tr = self.bigrams(words_tr)

        trigrams_tr = [trigram_tr[bigram_tr[review]] for review in words_tr]

        lemma_lg = self.lemmatization(trigrams_tr)

        with open(os.path.join('.', 'data', 'lemma_lg.pkl'), 'wb') as f:
            pickle.dump(lemma_lg, f)

        id2word_lg = gensim.corpora.Dictionary(lemma_lg)
        id2word_lg.filter_extremes(no_below=2, no_above=0.6)
        id2word_lg.compactify()
        id2word_lg.save(os.path.join('.', 'data', 'train_dict_lg'))
        corpus_lg = [id2word_lg.doc2bow(text) for text in lemma_lg]

        with open(os.path.join('.', 'data', 'corpus_lg.pkl'), 'wb') as f:
            pickle.dump(corpus_lg, f)

        hdp = HdpModel(corpus_lg, id2word_lg, chunksize=100)
        n_topics = len(hdp.print_topics())
        hdptopics = hdp.print_topics(num_topics=n_topics)

        for tp in hdptopics:
            print(tp)

        return n_topics
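One caveat on the count above: print_topics defaults to num_topics=20, so len(hdp.print_topics()) can never report more than 20 topics. A sketch that counts the model's full truncated representation instead:

# get_topics() has one row per topic in the truncation (at most T, 150 by default)
n_topics = hdp.get_topics().shape[0]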
Example #33
File: docs.py  Project: rafunchik/shrimps
# NOTE: the source snippet begins mid-call; the guard condition around the
# training branch is assumed, and corpus is assumed to be defined earlier
# in the truncated file.
if not os.path.exists('/tmp/model.lda'):
    lda = LdaModel(corpus,
                   id2word=dictionary,
                   update_every=5,
                   chunksize=10000,
                   passes=100)
    lda.save('/tmp/model.lda')
else:
    lda = LdaModel.load('/tmp/model.lda')
lda.show_topics()
topics_matrix = lda.show_topics(formatted=False, num_words=7)

print(topics_matrix)
print(len(topics_matrix))

for topic in topics_matrix:
    i = topic[1]
    print([str(word) for word in i])
#
# topics_matrix = np.array(topics_matrix)
#
# topic_words = topics_matrix[:, :, 1]
# for i in topic_words:
#     print([str(word) for word in i])


# another model for categorizing documents: Hierarchical Dirichlet Process
print("HDP")
model = HdpModel(corpus, id2word=dictionary)
model.show_topics(log=True, topics=5)

#  see https://radimrehurek.com/gensim/tut2.html
Example #34
__author__ = 'rbshaffer'

from gensim.models import HdpModel
from gensim.corpora import BleiCorpus
from gensim.corpora import Dictionary

corpus = BleiCorpus(fname='/home/rbshaffer/PycharmProjects/Constitution_Similarity/const_corpus_07242015.lda-c',
                    fname_vocab='/home/rbshaffer/PycharmProjects/Constitution_Similarity/const_corpus_07242015.lda-c.vocab')
dictionary = Dictionary.load('/home/rbshaffer/PycharmProjects/Constitution_Similarity/const_dic_07242015.lda-c.dic')
hdp_model = HdpModel(corpus=corpus, id2word=dictionary, max_time=28800)
hdp_model.save('/home/rbshaffer/Desktop/hdp_output_0726015.pydata')