예제 #1
0
    def get_lda_score_eval2(self, dictionary: typing.Dict[str, str], bow_corpus) -> typing.Tuple[typing.Any, typing.Any]:
        """Fit an LDA transformer on ``bow_corpus`` and return it with the document vectors.

        The transformer is configured from ``self.topic_num``, ``self.alpha``
        and ``self.eta``, uses a fixed ``random_state`` of 300 and the
        ``'mass_u'`` scorer. No coherence score is computed here; callers that
        need one must derive it from the returned model.

        Args:
            dictionary: Passed to the transformer as ``id2word``.
            bow_corpus: Corpus handed to ``fit_transform`` (presumably in
                bag-of-words form, going by the name -- confirm with callers).

        Returns:
            A ``(lda_model, docvecs)`` tuple: the fitted ``LdaTransformer`` and
            the result of ``fit_transform`` on ``bow_corpus``.
        """
        lda_model = LdaTransformer(
            num_topics=self.topic_num,
            id2word=dictionary,
            iterations=10,
            random_state=300,
            alpha=self.alpha,
            eta=self.eta,
            scorer='mass_u',
        )

        # Fit the model and get the topic distribution of each input document.
        docvecs = lda_model.fit_transform(bow_corpus)

        return lda_model, docvecs
예제 #2
0
def train_lda_model():
    """Fit a 100-topic LDA transformer on the sentence corpus and persist it.

    Reads the headerless corpus CSV as a single ``sentence`` column, splits
    each entry on whitespace, converts the tokens to bag-of-words with the
    dictionary stored at ``data/lda_dict``, fits the transformer and dumps it
    with joblib to ``./data/lda.model``.
    """
    frame = pd.read_csv("corpus/part-00000-8274d92c-217e-4ce7-80c7-50c52a899545-c000.csv", header=None)
    frame.columns = ["sentence"]
    raw_sentences = frame["sentence"].values.tolist()

    # Kept lazy: tokenization only runs once the dictionary has been loaded.
    tokenized = (line.split() for line in raw_sentences)

    # To rebuild the dictionary instead of loading it:
    #   dct = Dictionary(<tokenized sentences>); dct.save('./data/lda_dict')
    dct = Dictionary.load('data/lda_dict')

    bow_corpus = [dct.doc2bow(tokens) for tokens in tokenized]

    model = LdaTransformer(num_topics=100, id2word=dct, random_state=1)
    model.fit(bow_corpus)
    joblib.dump(model, './data/lda.model')
예제 #3
0
def lda_gensim_to_sci(data, sections, n_topics, **kw):
    """Build an unfitted scikit-learn style wrapper around a gensim LDA model.

    Args:
        data: Forwarded, together with ``sections``, to ``get_nips_combined``.
        sections: Forwarded to ``get_nips_combined``.
        n_topics: Value used for the transformer's ``num_topics``.
        **kw: Forwarded to ``lda_get_dictionary`` and also layered on top of
            ``lda_gensim_defaults`` to form the transformer arguments.

    Returns:
        An ``LdaTransformer`` built with the dictionary returned by
        ``lda_get_dictionary`` as ``id2word``; it is not fitted here.
    """
    dat = get_nips_combined(sections, data)
    # Only the dictionary is needed; the second return value is discarded.
    d, _ = lda_get_dictionary(dat, **kw)

    args = {**lda_gensim_defaults, **kw}
    # 'per_word_topics' is stripped before building the transformer
    # (presumably the wrapper does not accept it -- confirm). A default is
    # given so a missing key does not raise KeyError.
    args.pop('per_word_topics', None)
    args['num_topics'] = n_topics
    return LdaTransformer(id2word=d, **args)
예제 #4
0
def fit_model(corpora, dictionary, topicNum, beta):
    """Train an LDA model, record its evaluation metrics and return it.

    Args:
        corpora: Iterable of tokenized texts; each is converted with
            ``dictionary.doc2bow`` and also passed as ``texts`` to the
            coherence models.
        dictionary: Dictionary used for ``doc2bow`` and as ``id2word``.
        topicNum: Number of topics for the model.
        beta: Value passed to the model as ``eta``.

    Returns:
        The underlying gensim model (``gensim_model``) of the fitted
        transformer.

    Side effects:
        Calls ``saveModelConfigs`` with the module-level ``config_path``,
        storing either the computed coherence values or ``"Invalid"``
        placeholders when computing/saving them failed.
    """
    corpus = [dictionary.doc2bow(text) for text in corpora]

    model = LdaTransformer(id2word=dictionary, num_topics=topicNum, alpha='auto', eta=beta, iterations=100, random_state=2019)
    lda = model.fit(corpus)
    coherence = evaluateModel(lda.gensim_model, corpora)

    try:
        # u_mass is computed from the bag-of-words corpus ...
        cm = CoherenceModel(model=lda.gensim_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
        u_mass = cm.get_coherence()

        # ... while c_uci and c_npmi are computed from the tokenized texts.
        cm = CoherenceModel(model=lda.gensim_model, texts=corpora, coherence='c_uci')
        c_uci = cm.get_coherence()

        cm = CoherenceModel(model=lda.gensim_model, texts=corpora, coherence='c_npmi')
        c_npmi = cm.get_coherence()

        saveModelConfigs(lda, coherence, u_mass, c_uci, c_npmi, config_path)
    except Exception:
        # Best-effort fallback: keep a record of the run even when a metric
        # could not be produced. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop a long run instead of being
        # recorded as "Invalid".
        saveModelConfigs(lda, coherence, "Invalid", "Invalid", "Invalid", config_path)
    return lda.gensim_model
    writer.writerow([str(lda_model.num_topics), str(lda_model.eta), str(max_bleu), str(max_jaccard), str(max_cos), str(max_fscore)])
	

def evaluateModel(lda_model, topic_info, term_emb, mode):
    """Evaluate ``lda_model`` with the strategy chosen by ``mode`` and log the result.

    Args:
        lda_model: Model handed to the selected evaluation routine and to
            ``write_results_to_file``.
        topic_info: Passed to every evaluation routine.
        term_emb: Only used by modes 2 and 3.
        mode: Evaluation strategy:
            1 -> ``main_topicDistr``
            2 -> ``main_LDA_avgEmb``
            3 -> ``main_LDA_maxEmb``
            4 -> ``main_Word2Vec_AvgEmb``
            5 -> ``main_Word2Vec_MaxEmb``

    Raises:
        ValueError: If ``mode`` is not one of 1-5. Previously this fell
            through and failed later with an unbound-variable error.

    Side effects:
        Writes the four metrics to the hard-coded results CSV via
        ``write_results_to_file``.
    """
    # Reject unknown modes before doing any work, so the failure is explicit.
    if mode not in (1, 2, 3, 4, 5):
        raise ValueError(
            "Unsupported evaluation mode: {!r} (expected an integer from 1 to 5)".format(mode))

    if mode == 1:
        max_bleu, max_jaccard, max_cos, max_fscore = main_topicDistr(lda_model, topic_info)
    elif mode == 2:
        max_bleu, max_jaccard, max_cos, max_fscore = main_LDA_avgEmb(lda_model, topic_info, term_emb)
    elif mode == 3:
        max_bleu, max_jaccard, max_cos, max_fscore = main_LDA_maxEmb(lda_model, topic_info, term_emb)
    elif mode == 4:
        max_bleu, max_jaccard, max_cos, max_fscore = main_Word2Vec_AvgEmb(lda_model, topic_info)
    else:  # mode == 5
        max_bleu, max_jaccard, max_cos, max_fscore = main_Word2Vec_MaxEmb(lda_model, topic_info)

    write_results_to_file("/home/norberteke/PycharmProjects/Thesis/data/SO_simulation_results_2.csv", lda_model, max_bleu, max_jaccard, max_cos, max_fscore)

# Candidate topic counts: every integer from 10 to 50 inclusive.
k = list(range(10, 51))

# Candidate values for the model's eta parameter.
beta = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]

# Grid search: fit and evaluate one model per (topic count, eta) pair.
for topic_num in k:
    print("----- Progress: k= ", topic_num ,"----")
    for b in beta:
        model = LdaTransformer(id2word=dictionary, num_topics=topic_num, alpha='auto', eta=b, iterations=100, random_state=2019)
        lda = model.fit(corpus)
        # Matrix returned by get_topics(), labelled with the vocabulary terms.
        term_topic_matrix = lda.gensim_model.get_topics()
        term_emb = pd.DataFrame(term_topic_matrix, columns=terms)
        topic_info = get_topic_info(lda.gensim_model, corpus, dictionary)
        # mode=2 selects the main_LDA_avgEmb evaluation.
        evaluateModel(lda.gensim_model, topic_info, term_emb, mode=2)
예제 #6
0
# Persist the gensim dictionary.
dictionary.save("/home/norberteke/PycharmProjects/Thesis/data/GH_past_full_activity_gensimDictionary.dict")

# Dump the texts, one str() representation per line.
with open("/home/norberteke/PycharmProjects/Thesis/data/GH_past_full_activity_corpus.txt", 'w') as f:
    for text in texts:
        f.write("{}\n".format(text))

# Bag-of-words representation of every text.
corpus = list(map(dictionary.doc2bow, texts))
# Serializing the corpus to Matrix Market format is currently disabled:
# output_fname = get_tmpfile("/home/norberteke/PycharmProjects/Thesis/data/SO_recent_full_activity_gensimCorpus.mm")
# MmCorpus.serialize(output_fname, corpus)

# Base model; num_topics and eta are supplied later through set_params.
model = LdaTransformer(id2word=dictionary, alpha='auto', iterations=100, random_state=2019)

# Hyper-parameters to optimize, each with its search bounds.
space = [
    Integer(20, 500, name='num_topics'),
    Real(0.001, 200, name='eta'),
]


# This decorator allows your objective function to receive the parameters as keyword arguments
@use_named_args(space)
def objective(**params):
    model.set_params(**params)
    lda = model.fit(corpus)
    coherence = evaluateModel(lda.gensim_model)

    try:
        cm = CoherenceModel(model=lda.gensim_model,
예제 #7
0
    def __init__(self,
                 columns,
                 n_topics=20,
                 n_iter=10,
                 random_state=0,
                 lib='sklearn',
                 trained_model=None,
                 start=2,
                 stop=21,
                 step=1,
                 stop_words='english',
                 max_features=None):
        """Validate the configuration and build (or adopt) the LDA model.

        Args:
            columns: Non-empty list of column names (strings) to vectorize.
            n_topics: Number of topics (int >= 2), or ``'auto'`` to store a
                ``start``/``stop``/``step`` range instead of a fixed number.
            n_iter: Number of iterations (int >= 1).
            random_state: Non-negative integer seed.
            lib: ``'sklearn'`` or ``'gensim'``; selects the model class.
            trained_model: Optional trained model. When given, ``n_topics``,
                ``n_iter`` and ``random_state`` are read from it, no new model
                is built, and the remaining model arguments are not validated.
            start: Lower bound of the topic range (>= 2), used with ``'auto'``.
            stop: Upper bound of the topic range (>= ``start``), used with
                ``'auto'``.
            step: Step of the topic range (>= 1), used with ``'auto'``.
            stop_words: Stored unchanged on the instance.
            max_features: Stored unchanged on the instance.

        Raises:
            TypeError: If an argument has the wrong type.
            ValueError: If an argument is missing or out of range, or ``lib``
                is not supported.
        """
        self.model = None
        self.trained_model = None
        self.lib = None
        self.columns = None
        self.stop_words = stop_words
        self.max_features = max_features

        if not columns:
            raise ValueError(
                "You have to specify which columns you want to vectorize")
        if not (isinstance(columns, list)
                and all(isinstance(col, str) for col in columns)):
            raise TypeError(
                "Columns has to be list of strings . Column {} is of type {}"
                .format(columns, type(columns)))
        self.columns = columns

        if trained_model:
            warnings.warn(
                "Trained models are not trained again. Please make sure to only input the column(s) "
                "that the model was trained on", UserWarning)
            self.trained_model = trained_model
            self.random_state = self.trained_model.random_state
            # Check against the class directly instead of instantiating a
            # throwaway model just to obtain its type.
            if isinstance(self.trained_model, LDA_skl):
                self.n_topics = self.trained_model.n_components
                self.n_iter = self.trained_model.max_iter
            else:
                self.n_topics = self.trained_model.num_topics
                self.n_iter = self.trained_model.iterations
        else:
            if n_topics == 'auto':
                self.n_topics = n_topics
                # Validate every bound of the range. The previous loop left
                # after the first parameter, so stop and step went unchecked.
                # step only needs to be >= 1 (its default is 1).
                bounds = (('start', start, 2), ('stop', stop, 2),
                          ('step', step, 1))
                for label, number, minimum in bounds:
                    try:
                        val = int(number)
                    except TypeError:
                        raise TypeError(
                            "That's not an int! Received: {}".format(
                                type(number)))
                    if val < minimum:
                        raise ValueError(
                            "Parameter {} has to be at least {}. Received: {}"
                            .format(label, minimum, number))
                if stop < start:
                    raise ValueError(
                        "Stop value has to be higher than the start value. Received: {}"
                        .format(stop))
                self.start = start
                self.stop = stop
                self.step = step
            else:
                if not isinstance(n_topics, int):
                    raise TypeError(
                        "Number of topic has to be an integer. Received: {}".
                        format(type(n_topics)))
                if n_topics < 2:
                    raise ValueError(
                        "Number of topics has to be at least 2. Received: {}".
                        format(n_topics))
                self.n_topics = n_topics

            if not isinstance(n_iter, int):
                raise TypeError(
                    "n_iter has to be an integer. Received: {}".format(
                        type(n_iter)))
            if n_iter < 1:
                raise ValueError(
                    "n_iter has to be at least 1. Received: {}".format(
                        n_iter))
            self.n_iter = n_iter

            if not isinstance(random_state, int):
                raise TypeError(
                    "Random_state has to be a integer. Received: {}".format(
                        type(random_state)))
            if random_state < 0:
                raise ValueError(
                    "Random_state has to be positive or zero. Received: {}".
                    format(random_state))
            self.random_state = random_state

            if not isinstance(lib, str):
                raise TypeError("Lib has to be a string. Received: {}".format(
                    type(lib)))
            if lib == 'sklearn':
                self.model = LDA_skl(n_components=self.n_topics,
                                     max_iter=self.n_iter,
                                     random_state=self.random_state)
            elif lib == 'gensim':
                self.model = LdaTransformer(num_topics=self.n_topics,
                                            iterations=self.n_iter,
                                            random_state=self.random_state)
            else:
                raise ValueError(
                    "The supported libraries are sklearn and gensim. Received: {}"
                    .format(lib))
        # Recorded in both branches; with a trained model it is stored as
        # given, without validation.
        self.lib = lib