Example #1
 def training_callback(autoencoder, epoch, lr, loss, perplexity):
     if verbose_mode:
         decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
         topics = [[
             reverse_vocab[item.item()] for item in topic
         ] for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
         cm = CoherenceModel(
             topics=topics,
             corpus=corpus,
             dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
             coherence="u_mass",
         )
         coherence = cm.get_coherence()
         coherences = cm.get_coherence_per_topic()
         for index, topic in enumerate(topics):
             print(
                 str(index) + ":" + str(coherences[index]) + ":" +
                 ",".join(topic))
         print(coherence)
     else:
         coherence = 0
     writer.add_scalars(
         "data/autoencoder",
         {
             "lr": lr,
             "loss": loss,
             "perplexity": perplexity,
             "coherence": coherence,
         },
         global_step=epoch,
     )
Example #2
 def vect2gensim(self, vectorizer, dtmatrix):
     # transform sparse matrix into gensim corpus and dictionary
     corpus_vect_gensim = Sparse2Corpus(dtmatrix, documents_columns=False)
     dictionary = Dictionary.from_corpus(
         corpus_vect_gensim,
         id2word=dict(
             (id, word) for word, id in vectorizer.vocabulary_.items()))
     return (corpus_vect_gensim, dictionary)
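The method above only performs the conversion. As a rough, self-contained illustration (not part of the original source) of how it would typically be driven, using scikit-learn's CountVectorizer and the same gensim calls:

from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus

docs = ["the cat sat on the mat",
        "the dog chased the cat",
        "dogs and cats make good pets"]
vectorizer = CountVectorizer()
dtmatrix = vectorizer.fit_transform(docs)  # documents are rows

# same conversion as the method above, written inline
corpus_vect_gensim = Sparse2Corpus(dtmatrix, documents_columns=False)
dictionary = Dictionary.from_corpus(
    corpus_vect_gensim,
    id2word={i: w for w, i in vectorizer.vocabulary_.items()})
print(dictionary)  # prints a summary of the reconstructed gensim Dictionary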
Example #3
 def test_hdp_model(self):
     print("Building model")
     # build the gensim dictionary directly from the k-mer corpus
     d = Dictionary.from_corpus(self.k_corpus)
     model = models.hdpmodel.HdpModel(self.k_corpus, d, T=500)
     #model = models.ldamulticore.LdaMulticore(self.k_corpus, num_topics=479, workers=self.t-1)
     #model = models.ldamodel.LdaModel(self.k_corpus, num_topics=479, passes=100)
     model.save("kmer_hdp.k%s" % self.ks, ignore=['corpus'])

     print("Done")
Example #4
class tip_rec:

	def __init__(self, num_topics = 15):
		self.numtopics = num_topics
		self.topic_dict = dict(enumerate(np.zeros(num_topics)))
		self.user_dict = {}
		self.model = None
		self.worddict = {}
		self.mydict = None


	def train(self, df):
		self.user_dict = {el:self.topic_dict.copy() for el in df.sender.unique()}
		cv = CV(stop_words='english')
		X = cv.fit_transform(df['context'])
		# map each CountVectorizer column index to its token so gensim ids match X
		self.worddict = {i: w for w, i in cv.vocabulary_.items()}
		self.mydict = Dictionary()
		self.mydict = self.mydict.from_corpus(matutils.Sparse2Corpus(X, documents_columns=False), id2word=self.worddict)
		self.model = LatentDA.LdaModel(matutils.Sparse2Corpus(X, documents_columns=False), num_topics=self.numtopics, passes=20, id2word=self.worddict)
		for i in df.iterrows():
			if i[1]['context'] == '':
				continue
			else:
				values = self.model[self.mydict.doc2bow(i[1]['context'].split())]
				for val in values:
					if val[0] in self.user_dict[i[1].sender].keys():
						if i[1].amt == '':
							continue
						self.user_dict[i[1].sender][val[0]] += val[1] * float(i[1].amt)
						continue
					self.user_dict[i[1].sender][val[0]] = val[1]
		for i in self.user_dict.keys():
			norm_const = sum(self.user_dict[i].values())
			for j in self.user_dict[i].keys():
				self.user_dict[i][j] = self.user_dict[i][j]/norm_const

	def predict(self, text, username = ''):
		topics = self.model[self.mydict.doc2bow(text.split())]
		doc_aff = np.zeros(self.numtopics)
		for i in topics:
			doc_aff[i[0]] = i[1]
		if username == '':
			returndict = {}
			for user in self.user_dict.keys():
				user_aff = np.array(list(self.user_dict[user].values()))
				score = np.linalg.norm(user_aff - doc_aff)
				returndict[user] = score
			return returndict
		else:
			user_aff = np.array(list(self.user_dict[username].values()))
			score = np.linalg.norm(user_aff - doc_aff)
			return (username, score)
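A hypothetical way to exercise the class above; the DataFrame and its values are invented, and the class's own imports (CV as scikit-learn's CountVectorizer, matutils, Dictionary, LatentDA, np) must already be in scope as in its source file:

import pandas as pd

tips = pd.DataFrame({
    'sender':  ['alice', 'bob', 'alice', 'carol'],
    'context': ['great write up on topic models',
                'thanks for the gensim pointers',
                'loved the python data post',
                'nice explanation of lda'],
    'amt':     ['1.0', '0.5', '2.0', '1.5'],
})

rec = tip_rec(num_topics=5)
rec.train(tips)
print(rec.predict('another post about topic models in python'))  # score per user
print(rec.predict('quick gensim question', username='bob'))      # ('bob', score)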
Example #5
    def start(self, verbose=True):
        if verbose: print("Preprocessing dataset...")
        if verbose: print('Creating Tokens For Lemmatization...')
        t0 = time()
        # Tokenize
        data_tokens = self.remove_stopwords_(
            list(self.preprocess_(self.data[:])))
        if verbose: print("done in %0.3fs." % (time() - t0))
        if verbose: print('Lemmatization in progress using spaCy...')
        t0 = time()
        # Lemmatization function using spaCy
        self.lemmatized_text = self.lemmatization_(data_tokens)
        token_text = []
        for doc in self.lemmatized_text:
            token_text.append(doc.split())
        if self.bigrams:
            self.token_text = self.make_ngrams_(token_text)
            input_text = []
            for doc in self.token_text:
                input_text.append(" ".join(doc))
        else:
            self.token_text = token_text
            input_text = self.lemmatized_text

        if verbose: print("done in %0.3fs." % (time() - t0))
        # Vectorizing dataset for use with LDA algorithm
        if verbose: print('Vectorizing dataset...')
        t0 = time()
        self.vectorizer = CountVectorizer(
            analyzer='word',
            min_df=self.min_df,
            stop_words=self.stopwords,
            lowercase=True,
            token_pattern=self.token_pattern,
            ngram_range=self.ngram_range,
            #max_features=50000,
            max_df=self.max_df)
        self.lda_input_data = self.vectorizer.fit_transform(input_text)
        self.gensim_corpus_vect = gensim.matutils.Sparse2Corpus(
            self.lda_input_data, documents_columns=False)
        self.id2word = Dictionary.from_corpus(
            self.gensim_corpus_vect,
            id2word=dict((idn, word)
                         for word, idn in self.vectorizer.vocabulary_.items()))

        if verbose: print(self.vectorizer)
        # report sparsity of the document-term matrix
        data_dense = self.lda_input_data.todense()
        if verbose:
            print('Sparsity: ',
                  ((data_dense > 0).sum() / data_dense.size) * 100, '%')
        if verbose: print("done in %0.3fs." % (time() - t0))
Example #6
def vect2gensim(vectorizer, dtmatrix):
    # transform sparse matrix into gensim corpus and dictionary
    start = time()
    corpus_vect_gensim = gensim.matutils.Sparse2Corpus(dtmatrix,
                                                       documents_columns=False)
    dictionary = Dictionary.from_corpus(
        corpus_vect_gensim,
        id2word=dict(
            (id, word) for word, id in vectorizer.vocabulary_.items()))
    end = time()
    print(
        "Transform vector model to gensim format ... done in {0:0.3f} miliseconds"
        .format((end - start) * 1000))

    return (corpus_vect_gensim, dictionary)
Example #7
 def score(self, X, y=None, sample_weight=None) -> float:
     # TODO this needs further testing for correctness, WIP
     if self.autoencoder is None:
         raise NotFittedError
     self.autoencoder.eval()
     corpus = Sparse2Corpus(X, documents_columns=False)
     decoder_weight = self.autoencoder.decoder.linear.weight.detach().cpu()
     id2word = {index: str(index) for index in range(X.shape[1])}
     topics = [[str(item.item()) for item in topic]
               for topic in decoder_weight.topk(
                   min(self.score_num, X.shape[1]), dim=0)[1].t()]
     cm = CoherenceModel(topics=topics,
                         corpus=corpus,
                         dictionary=Dictionary.from_corpus(corpus, id2word),
                         coherence='u_mass')
     return cm.get_coherence()
Example #8
def vectorize_text(df):
    cv = CountVectorizer(preprocessor=preprocessor,
                         stop_words=stop,
                         lowercase=True,
                         decode_error='replace',
                         ngram_range=(1, 2),
                         max_df=0.9,
                         min_df=10,
                         max_features=10000)
    sparse_df = cv.fit_transform(df['all_text'])
    gensim_df = matutils.Sparse2Corpus(sparse_df.T)

    vocab = cv.get_feature_names()
    id2word = {i: s for i, s in enumerate(vocab)}
    dictionary = Dictionary.from_corpus(gensim_df,
                                        id2word)  # for coherence model
    return gensim_df, id2word, dictionary
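A possible follow-up, not part of the original function: feed its three return values into an LDA model and a u_mass coherence check. Here df is assumed to be the caller's DataFrame with an 'all_text' column, preprocessor and stop must be defined as in the source, and the model settings are only illustrative:

from gensim.models import LdaModel, CoherenceModel

gensim_df, id2word, dictionary = vectorize_text(df)
lda = LdaModel(corpus=gensim_df, id2word=id2word, num_topics=10, passes=5)
cm = CoherenceModel(model=lda, corpus=gensim_df, dictionary=dictionary,
                    coherence='u_mass')
print('u_mass coherence:', cm.get_coherence())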
Example #9
def filterDS(data, featureNames, no_below=5, no_above=0.3, keep_n=1200):
    corpus = gensim.matutils.Dense2Corpus(data, documents_columns=False)
    dictionary = Dictionary.from_corpus(
        corpus, {i: w
                 for i, w in enumerate(featureNames)})

    if len(featureNames) > keep_n:
        dictionary.filter_extremes(no_below=no_below,
                                   no_above=no_above,
                                   keep_n=keep_n)
    dw = dictionary.values()

    newWId = [i for i, w in enumerate(featureNames) if w in dw]
    words = [featureNames[i] for i in newWId]
    dictionary.compactify()
    data = data[:, newWId]
    return data, words
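A hypothetical call on synthetic data (sizes and thresholds are illustrative, and the function's own gensim/Dictionary imports are assumed to be in scope):

import numpy as np

rng = np.random.default_rng(0)
# 200 documents x 2000 terms of sparse 0/1 counts
counts = rng.binomial(1, 0.05, size=(200, 2000)).astype(float)
feature_names = ['w%d' % i for i in range(2000)]

filtered, kept_words = filterDS(counts, feature_names,
                                no_below=5, no_above=0.3, keep_n=1200)
print(filtered.shape, len(kept_words))  # trimmed matrix and its surviving vocabulary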
Example #10
def get_topics(cv, train_data):
    """
    Uses gensim to perform topic modeling.

    Parameters
    ---------
    cv: A TfidfVectorizer instance.
    train_data: A scipy csr_matrix.

    Returns
    -------
    A list of strings (functions of the most important terms in each topic).
    """

    td_gensim = Sparse2Corpus(train_data, documents_columns=False)
    tmp_dct = dict((idv, word) for word, idv in cv.vocabulary_.items())
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)

    lda = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    topics = lda.top_topics(corpus=td_gensim, num_words=5)

    return topics
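For context, a hypothetical caller; the documents are placeholders, and per the docstring cv is a fitted TfidfVectorizer with train_data as its document-term matrix. (Note that the num_words keyword used inside get_topics targets older gensim releases; newer versions call the same parameter topn.)

from sklearn.feature_extraction.text import TfidfVectorizer

documents = ['topic modeling with gensim',
             'scikit-learn feature extraction for text',
             'latent dirichlet allocation finds topics',
             'converting sparse matrices into gensim corpora']
cv = TfidfVectorizer(stop_words='english')
train_data = cv.fit_transform(documents)  # documents x terms csr_matrix

# each returned entry is (list of (weight, word) pairs, coherence score)
for terms, coherence in get_topics(cv, train_data):
    print(round(coherence, 3), [word for _, word in terms])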
Example #11
def ldaperplexity(train, test, topics):
    corpus = gensim.matutils.Dense2Corpus(train.astype(int),
                                          documents_columns=False)
    corpusTest = gensim.matutils.Dense2Corpus(test.astype(int),
                                              documents_columns=False)
    dictionary = Dictionary.from_corpus(corpus)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        c = Chrono().start()
        lda = runLda(corpus, dictionary, topics=topics)
        c.end()

    corpus_words = sum(cnt for document in corpusTest for _, cnt in document)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        perwordbound = lda.log_perplexity(corpusTest)
    print(
        "LDA %.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words"
        % (perwordbound, numpy.exp2(-perwordbound), len(corpusTest),
           corpus_words))
    return numpy.exp2(-perwordbound), c.elapsed()
Example #12
def get_topics(cv, train_data):
    '''
    Uses gensim to perform topic modeling.
    
    Parameters
    ---------
    cv: A TfidfVectorizer instance.
    train_data: A scipy csr_matrix.
    
    Returns
    -------
    A list of strings (functions of the most important terms in each topic).
    '''
    #Create the gensim corpus from train data; documents are rows of the
    #csr_matrix, so documents_columns must be False
    td_gensim = Sparse2Corpus(train_data, documents_columns=False)
    #Create vocab dictionary
    tmp_dct = dict((idv, word) for word, idv in cv.vocabulary_.items())
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)
    #Create LDA model with specified parameters
    lda_gs = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    topics = lda_gs.top_topics(corpus=td_gensim, num_words=5)
    return topics
Example #13
def get_topics(cv, train_data):
    """
    Uses gensim to perform topic modeling.

    Parameters
    ---------
    cv: A TfidfVectorizer instance.
    train_data: A scipy csr_matrix.

    Returns
    -------
    A list of strings (functions of the most important terms in each topic).
    """

    td_gensim = Sparse2Corpus(train_data, documents_columns=False)
    tmp_dct = dict((idv, word) for word, idv in cv.vocabulary_.items())
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)

    lda = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    topics = lda.top_topics(corpus=td_gensim, num_words=5)

    return topics
Example #14
 def training_callback(autoencoder, epoch, lr, loss, perplexity):
     decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
     topics = [
         [reverse_vocab[item.item()] for item in topic]
         for topic in decoder_weight.topk(top_words, dim=0)[1].t()
     ]
     cm = CoherenceModel(
         topics=topics,
         corpus=corpus,
         dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
         coherence='u_mass'
     )
     coherence = cm.get_coherence()
     coherences = cm.get_coherence_per_topic()
     for index, topic in enumerate(topics):
         print(str(index) + ':' + str(coherences[index]) + ':' + ','.join(topic))
     print(coherence)
     writer.add_scalars('data/autoencoder', {
         'lr': lr,
         'loss': loss,
         'perplexity': perplexity,
         'coherence': coherence,
     }, global_step=epoch)
Example #15
def main(cuda, batch_size, epochs, top_words, testing_mode, verbose_mode):
    print("Loading input data")
    # TODO fix relative paths
    data_train = load_npz("data/train.txt.npz")
    data_val = load_npz("data/test.txt.npz")
    corpus = Sparse2Corpus(data_train, documents_columns=False)
    with open("data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        if verbose_mode:
            decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
            topics = [[
                reverse_vocab[item.item()] for item in topic
            ] for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
            cm = CoherenceModel(
                topics=topics,
                corpus=corpus,
                dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
                coherence="u_mass",
            )
            coherence = cm.get_coherence()
            coherences = cm.get_coherence_per_topic()
            for index, topic in enumerate(topics):
                print(
                    str(index) + ":" + str(coherences[index]) + ":" +
                    ",".join(topic))
            print(coherence)
        else:
            coherence = 0
        writer.add_scalars(
            "data/autoencoder",
            {
                "lr": lr,
                "loss": loss,
                "perplexity": perplexity,
                "coherence": coherence,
            },
            global_step=epoch,
        )

    ds_train = CountTensorDataset(data_train)
    ds_val = CountTensorDataset(data_val)
    autoencoder = ProdLDA(in_dimension=len(vocab),
                          hidden1_dimension=100,
                          hidden2_dimension=100,
                          topics=50)
    if cuda:
        autoencoder.cuda()
    print("Training stage.")
    ae_optimizer = Adam(autoencoder.parameters(), 0.0001, betas=(0.99, 0.999))
    train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        update_callback=training_callback,
        sampler=WeightedRandomSampler(torch.ones(data_train.shape[0]), 20000),
        num_workers=4,
    )
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [[reverse_vocab[item.item()] for item in topic]
              for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
    cm = CoherenceModel(
        topics=topics,
        corpus=corpus,
        dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
        coherence="u_mass",
    )
    coherence = cm.get_coherence()
    coherences = cm.get_coherence_per_topic()
    for index, topic in enumerate(topics):
        print(
            str(index) + ":" + str(coherences[index]) + ":" + ",".join(topic))
    print(coherence)
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag="feature_embeddings",
        )
    writer.close()
Example #16
class tip_rec:
    def __init__(self, num_topics=15):
        self.numtopics = num_topics
        self.topic_dict = dict(enumerate(np.zeros(num_topics)))
        self.user_dict = {}
        self.model = None
        self.worddict = {}
        self.mydict = None

    def train(self, df):
        self.user_dict = {
            el: self.topic_dict.copy()
            for el in df.sender.unique()
        }
        cv = CV(stop_words='english')
        X = cv.fit_transform(df['context'])
        # map each CountVectorizer column index to its token so gensim ids match X
        self.worddict = {i: w for w, i in cv.vocabulary_.items()}
        self.mydict = Dictionary()
        self.mydict = self.mydict.from_corpus(matutils.Sparse2Corpus(
            X, documents_columns=False),
                                              id2word=self.worddict)
        self.model = LatentDA.LdaModel(matutils.Sparse2Corpus(
            X, documents_columns=False),
                                       num_topics=self.numtopics,
                                       passes=20,
                                       id2word=self.worddict)
        for i in df.iterrows():
            if i[1]['context'] == '':
                continue
            else:
                values = self.model[self.mydict.doc2bow(i[1]['context'].split())]
                for val in values:
                    if val[0] in self.user_dict[i[1].sender].keys():
                        if i[1].amt == '':
                            continue
                        self.user_dict[i[1].sender][val[0]] += val[1] * float(
                            i[1].amt)
                        continue
                    self.user_dict[i[1].sender][val[0]] = val[1]
        for i in self.user_dict.keys():
            norm_const = sum(self.user_dict[i].values())
            for j in self.user_dict[i].keys():
                self.user_dict[i][j] = self.user_dict[i][j] / norm_const

    def predict(self, text, username=''):
        topics = self.model[self.mydict.doc2bow(text.split())]
        doc_aff = np.zeros(self.numtopics)
        for i in topics:
            doc_aff[i[0]] = i[1]
        if username == '':
            returndict = {}
            for user in self.user_dict.keys():
                user_aff = np.array(list(self.user_dict[user].values()))
                score = np.linalg.norm(user_aff - doc_aff)
                returndict[user] = score
            return returndict
        else:
            user_aff = np.array(list(self.user_dict[username].values()))
            score = np.linalg.norm(user_aff - doc_aff)
            return (username, score)
Example #17
def main(
    cuda,
    batch_size,
    epochs,
    top_words,
    testing_mode,
):
    print('Loading input data')
    # TODO fix relative paths
    input_train = np.load('data/train.txt.npy', encoding='bytes')
    input_val = np.load('data/test.txt.npy', encoding='bytes')
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    reverse_vocab = {vocab[word]: word for word in vocab}
    indexed_vocab = [reverse_vocab[index] for index in range(len(reverse_vocab))]
    data_train = np.array(
        [np.bincount(doc.astype('int'), minlength=len(vocab)) for doc in input_train if doc.sum() > 0]
    )
    data_val = np.array([np.bincount(doc.astype('int'), minlength=len(vocab)) for doc in input_val if doc.sum() > 0])
    corpus = Dense2Corpus(data_train, documents_columns=False)
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(autoencoder, epoch, lr, loss, perplexity):
        decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
        topics = [
            [reverse_vocab[item.item()] for item in topic]
            for topic in decoder_weight.topk(top_words, dim=0)[1].t()
        ]
        cm = CoherenceModel(
            topics=topics,
            corpus=corpus,
            dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
            coherence='u_mass'
        )
        coherence = cm.get_coherence()
        coherences = cm.get_coherence_per_topic()
        for index, topic in enumerate(topics):
            print(str(index) + ':' + str(coherences[index]) + ':' + ','.join(topic))
        print(coherence)
        writer.add_scalars('data/autoencoder', {
            'lr': lr,
            'loss': loss,
            'perplexity': perplexity,
            'coherence': coherence,
        }, global_step=epoch)

    ds_train = TensorDataset(torch.from_numpy(data_train).float())
    ds_val = TensorDataset(torch.from_numpy(data_val).float())
    autoencoder = ProdLDA(
        in_dimension=len(vocab),
        hidden1_dimension=100,
        hidden2_dimension=100,
        topics=50
    )
    if cuda:
        autoencoder.cuda()
    print('Training stage.')
    ae_optimizer = Adam(autoencoder.parameters(), 0.001, betas=(0.99, 0.999))
    train(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=epochs,
        batch_size=batch_size,
        optimizer=ae_optimizer,
        update_callback=training_callback
    )
    autoencoder.eval()
    decoder_weight = autoencoder.decoder.linear.weight.detach().cpu()
    topics = [
        [reverse_vocab[item.item()] for item in topic]
        for topic in decoder_weight.topk(top_words, dim=0)[1].t()
    ]
    cm = CoherenceModel(
        topics=topics,
        corpus=corpus,
        dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
        coherence='u_mass'
    )
    coherence = cm.get_coherence()
    coherences = cm.get_coherence_per_topic()
    for index, topic in enumerate(topics):
        print(str(index) + ':' + str(coherences[index]) + ':' + ','.join(topic))
    print(coherence)
    if not testing_mode:
        writer.add_embedding(
            autoencoder.encoder.linear1.weight.detach().cpu().t(),
            metadata=indexed_vocab,
            tag='feature_embeddings',
        )
    writer.close()

    # fragment of a row loop defined elsewhere in this file: collects each
    # row's 'Content' text into docs for the vectorizer below
    if row['Content']:
        docs.append(
            str(row['Content']).encode(encoding='UTF-8', errors='strict'))


def vect2gensim(vectorizer, dtmatrix):
    # transform sparse matrix into gensim corpus and dictionary
    corpus_vect_gensim = gensim.matutils.Sparse2Corpus(dtmatrix,
                                                       documents_columns=False)
    dictionary = Dictionary.from_corpus(
        corpus_vect_gensim,
        id2word=dict(
            (id, word) for word, id in vectorizer.vocabulary_.items()))

    return (corpus_vect_gensim, dictionary)


# compute vector space with sklearn
vect = CountVectorizer(min_df=1, ngram_range=(1, 1), max_features=30000)
corpus_vect = vect.fit_transform(docs)

# transport to gensim
(corpus_vect_gensim, gensim_dict) = vect2gensim(vect, corpus_vect)

dictionary = Dictionary.from_corpus(
    corpus_vect_gensim,
    id2word=dict((id, word) for word, id in vect.vocabulary_.items()))

pd.to_pickle(vect, 'pickles/vocab_' + filename + '.pkl')
print(dictionary)