Example #1
from lda2vec import utils, b_model
import numpy as np
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Path to preprocessed data
data_path = "data/clean_data"
# Whether or not to load saved embeddings file
load_embeds = True

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids, target_ids, doc_ids, embed_matrix,
 bias_idxes) = utils.load_preprocessed_data(data_path,
                                            load_embed_matrix=load_embeds,
                                            load_bias_idxes=True)

# Replace the loaded bias indexes with a hand-picked list of seed words
bias_words = ['privacy', 'anonymity', 'confidentiality', 'disclosure']
bias_idxes = [word_to_idx[word] for word in bias_words]

# Number of unique documents
num_docs = len(np.unique(doc_ids))
# Number of unique words in vocabulary (int)
vocab_size = embed_matrix.shape[0]
# Embed layer dimension size
# If not loading embeds, change 128 to whatever size you want.
embed_size = embed_matrix.shape[1] if load_embeds else 128
# Number of topics to cluster into
num_topics = 20
# Number of topics to bias
# Directory in which to save / restore the model checkpoint
model_dir = "tests/webhose_50k/model/v3"

MODEL_RESTORE = False

if os.path.exists("{}/model.ckpt.meta".format(model_dir)):
    MODEL_RESTORE = True

if not os.path.exists(model_dir):
    os.makedirs(model_dir)

Example #2

# Whether or not to load saved embeddings file
load_embeds = True

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids, target_ids, doc_ids,
 embed_matrix) = utils.load_preprocessed_data(clean_data_dir,
                                              load_embed_matrix=load_embeds)

# Number of unique documents
num_docs = doc_ids.max() + 1
# Number of unique words in vocabulary (int)
vocab_size = len(freqs)
# Embed layer dimension size
# If not loading embeds, change 128 to whatever size you want.
embed_size = embed_matrix.shape[1] if load_embeds else 128
# Number of topics to cluster into
num_topics = 15
# Amount of iterations over entire dataset
num_epochs = 5
# Batch size - Increase/decrease depending on memory usage
batch_size = 8192
# Epoch that we want to "switch on" LDA loss
Example #3
from lda2vec import utils, s_model
import numpy as np
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Path to preprocessed data
data_path = "data/clean_data"
# Whether or not to load saved embeddings file
load_embeds = True

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids, target_ids, doc_ids,
 embed_matrix) = utils.load_preprocessed_data(data_path,
                                              load_embed_matrix=load_embeds)

seed_words = ['privacy', 'anonymity', 'confidentiality', 'disclosure']
seed_idxes = [word_to_idx[word] for word in seed_words if word in word_to_idx]

base_seed_idxes = [word_to_idx[word] for word in seed_words]
# Group the seed word indexes into seed sets
# (presumably one inner list per seeded topic)
seed_idxes = [[base_seed_idxes[0], base_seed_idxes[1]],
              [base_seed_idxes[0], base_seed_idxes[2]],
              [base_seed_idxes[0], base_seed_idxes[3]],
              [base_seed_idxes[0]],
              [base_seed_idxes[2]]]

# Number of unique documents
num_docs = len(np.unique(doc_ids))
# Number of unique words in vocabulary (int)
vocab_size = embed_matrix.shape[0]
# Embed layer dimension size
Example #4
from lda2vec import utils, s_model
import numpy as np
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Path to preprocessed data
data_path = "data/clean_data"
# Whether or not to load saved embeddings file
load_embeds = True

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids, target_ids, doc_ids,
 embed_matrix) = utils.load_preprocessed_data(data_path,
                                              load_embed_matrix=load_embeds)

seed_words = ['privacy', 'anonymity', 'confidentiality', 'disclosure']
seed_idxes = [word_to_idx[word] for word in seed_words if word in word_to_idx]

base_seed_idxes = [word_to_idx[word] for word in seed_words]
seed_idxes = [[base_seed_idxes[0], base_seed_idxes[1]],
              [base_seed_idxes[0], base_seed_idxes[2]],
              [base_seed_idxes[0], base_seed_idxes[3]],
              [base_seed_idxes[0]],
              [base_seed_idxes[2]]]

# Number of unique documents
num_docs = len(np.unique(doc_ids))
# Number of unique words in vocabulary (int)
vocab_size = embed_matrix.shape[0] 
# Embed layer dimension size
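The next snippet picks up after a preprocessing object P has already been constructed, which is not part of this excerpt. As a rough sketch of what that setup might look like, the block below is illustrative only: the Preprocessor import path, constructor arguments, input file name, and the paths/flags are assumptions, not confirmed API.

import pandas as pd
# NOTE: module path and constructor signature are assumptions, not confirmed API
from lda2vec.nlppipe import Preprocessor

# Illustrative paths/flags referenced further down in the snippet
EMBEDDING_DIR = "data/glove"          # where the GloVe vectors were unpacked (assumed)
clean_data_dir = "data/clean_data"    # where preprocessed data will be saved
load_embeds = True                    # whether to load pretrained GloVe embeddings

# Read the raw documents into a dataframe with a "texts" column (hypothetical file)
df = pd.read_csv("data/my_documents.txt", sep="\t")

# Build the preprocessor over the "texts" column; the limits here are illustrative
P = Preprocessor(df, "texts", max_features=30000, maxlen=10000, min_count=30)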
# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove(EMBEDDING_DIR + "/" + "glove.6B.100d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids, target_ids, doc_ids,
 embed_matrix) = utils.load_preprocessed_data(clean_data_dir,
                                              load_embed_matrix=load_embeds)

# Number of unique documents
num_docs = doc_ids.max() + 1
# Number of unique words in vocabulary (int)
vocab_size = len(freqs)
# Embed layer dimension size
# If not loading embeds, change 128 to whatever size you want.
embed_size = embed_matrix.shape[1] if load_embeds else 128
# Number of topics to cluster into
num_topics = 20
# Amount of iterations over entire dataset
num_epochs = 200
# Batch size - Increase/decrease depending on memory usage
batch_size = 4096
# Epoch that we want to "switch on" LDA loss
from lda2vec import utils, model

# Path to preprocessed data
data_path = "data"
# Name of this run (used to locate its preprocessed files)
run_name = "my_run"
# Number of topics to cluster into
num_topics = 20
# Amount of iterations over entire dataset
num_epochs = 20

# Load data from files
(idx_to_word, word_to_idx, freqs, embed_matrix, pivot_ids, target_ids, doc_ids,
 num_docs, vocab_size,
 embed_size) = utils.load_preprocessed_data(data_path, run_name)

# Initialize the model with the pretrained embedding matrix
m = model(num_docs,
          vocab_size,
          num_topics=num_topics,
          embedding_size=embed_size,
          load_embeds=True,
          pretrained_embeddings=embed_matrix,
          freqs=freqs)

# Train the model; the LDA loss is switched on at switch_loss_epoch
m.train(pivot_ids,
        target_ids,
        doc_ids,
        len(pivot_ids),
        num_epochs,
        idx_to_word=idx_to_word,
        switch_loss_epoch=5)

# Prepare the data needed to visualize the topics with pyLDAvis
utils.generate_ldavis_data(data_path, run_name, m, idx_to_word, freqs,
                           vocab_size)
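generate_ldavis_data presumably assembles the trained topic-word and document-topic distributions into a pyLDAvis visualization. If you ever need to build such a visualization by hand, the underlying pyLDAvis calls look roughly like the sketch below; the toy arrays are purely illustrative and are not produced by the model.

import numpy as np
import pyLDAvis

# Toy distributions for illustration only: 2 topics, 3 documents, 4-word vocabulary
topic_term_dists = np.array([[0.4, 0.3, 0.2, 0.1],
                             [0.1, 0.2, 0.3, 0.4]])   # rows sum to 1
doc_topic_dists = np.array([[0.7, 0.3],
                            [0.5, 0.5],
                            [0.2, 0.8]])              # rows sum to 1
doc_lengths = [10, 8, 12]
vocab = ["privacy", "anonymity", "confidentiality", "disclosure"]
term_frequency = [12, 9, 5, 4]

# Prepare the interactive visualization and write it to an HTML file
vis_data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists, doc_lengths,
                            vocab, term_frequency)
pyLDAvis.save_html(vis_data, "lda2vec_topics.html")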