import logging
import pickle

import numpy as np

from src.data.prepare_data import read_sample
from src.features.tokenize import tokenize_classes
# create_classes, create_dictionary and split_data are project helpers whose
# modules are not shown in this excerpt.


def train():
    df = read_sample()
    document_classes = create_classes(df)
    word_classes = tokenize_classes(document_classes, False)

    negative_words = [item for sublist in word_classes['NEG'] for item in sublist]
    positive_words = [item for sublist in word_classes['POS'] for item in sublist]
    dictionary = create_dictionary([negative_words, positive_words])

    # Use the full word lists for training (100% train, no dev/test split).
    negative_split = split_data(negative_words, (1, 0.0, 0.0))
    positive_split = split_data(positive_words, (1, 0.0, 0.0))

    negative_bow = dictionary.doc2bow(negative_split['train'])
    positive_bow = dictionary.doc2bow(positive_split['train'])

    # Add-one (Laplace) smoothing denominators: token count per class plus the
    # number of distinct words observed in that class.
    total_negative = len(negative_split['train']) + len(negative_bow)
    total_positive = len(positive_split['train']) + len(positive_bow)

    # Class log priors from the relative size of each class.
    negative_prob = np.log(
        len(negative_split['train'])
        / (len(negative_split['train']) + len(positive_split['train'])))
    positive_prob = np.log(
        len(positive_split['train'])
        / (len(negative_split['train']) + len(positive_split['train'])))

    negative_word_probs = {}
    for word_id, count in negative_bow:
        negative_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_negative),
        }
    # Fallback log probability for words never seen in the negative class.
    negative_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_negative)}

    positive_word_probs = {}
    for word_id, count in positive_bow:
        positive_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_positive),
        }
    # Fallback log probability for words never seen in the positive class.
    positive_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_positive)}

    model = {
        'POS_PROB': positive_prob,
        'NEG_PROB': negative_prob,
        'COND_POS_PROBS': positive_word_probs,
        'COND_NEG_PROBS': negative_word_probs,
    }
    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(model, output_file)
    logging.info('Model saved to artifact model.pkl')
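# A minimal scoring sketch for the model dictionary saved above, assuming a
# `tokenize` callable that turns raw text into the same tokens used during
# training. It sums the class log prior and per-token conditional log
# probabilities, falling back to the -1 smoothing entry for unseen words.
import pickle


def predict(text, tokenize):
    with open("models/model.pkl", "rb") as model_file:
        model = pickle.load(model_file)
    pos_score = model['POS_PROB']
    neg_score = model['NEG_PROB']
    for token in tokenize(text):
        pos_score += model['COND_POS_PROBS'].get(
            token, model['COND_POS_PROBS'][-1])['logprob']
        neg_score += model['COND_NEG_PROBS'].get(
            token, model['COND_NEG_PROBS'][-1])['logprob']
    return 'POS' if pos_score > neg_score else 'NEG'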
import pickle

import gensim

from src.data.prepare_data import read_sample
# df_to_list, tokenize, create_dictionary and term_document_matrix are project
# helpers whose modules are not shown in this excerpt.


def train():
    df = read_sample()
    data = df_to_list(df)
    data_lemmatized = tokenize(data)

    dictionary = create_dictionary(data_lemmatized)
    # Persist the dictionary so inference can rebuild bag-of-words vectors.
    with open('data/models/dictionary.pkl', 'wb') as output_file:
        pickle.dump(dictionary, output_file)

    corpus = term_document_matrix(data_lemmatized, dictionary)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=20,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    lda_model.save("data/models/lda_model.pkl")
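# A minimal inference sketch for the artifacts saved above: reload the
# dictionary and the LDA model, then query the topic mixture of a new
# document. `new_tokens` is a hypothetical, already-lemmatized token list.
import pickle

import gensim

with open('data/models/dictionary.pkl', 'rb') as input_file:
    dictionary = pickle.load(input_file)
lda_model = gensim.models.ldamodel.LdaModel.load("data/models/lda_model.pkl")

new_tokens = ['price', 'market', 'growth']
bow = dictionary.doc2bow(new_tokens)
for topic_id, weight in lda_model.get_document_topics(bow):
    print(topic_id, weight, lda_model.print_topic(topic_id))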
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from nltk.corpus import stopwords

from src.data.prepare_data import read_sample
from src.features.tokenize import tokenize_classes
from src.features.utils import sent_to_words

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Fetch the data and split the documents into word lists.
data = read_sample()
data_words = list(sent_to_words(data))

# Tokenize and lemmatize.
data_lemmatized = tokenize_classes(data_words)
print(data_lemmatized)

# Build the id-to-word dictionary.
id2word = corpora.Dictionary(data_lemmatized)
print(id2word)

# Create the corpus.
texts = data_lemmatized
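# A sketch of how the CoherenceModel import above would be used once an LDA
# model exists: build a bag-of-words corpus from `texts`, train a model, and
# score it with c_v coherence. The LdaModel hyperparameters mirror those in
# train() and are an assumption here, not fixed project code.
import gensim

corpus = [id2word.doc2bow(text) for text in texts]
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=20,
                                            random_state=100,
                                            passes=10)
coherence_model = CoherenceModel(model=lda_model,
                                 texts=data_lemmatized,
                                 dictionary=id2word,
                                 coherence='c_v')
print('Coherence (c_v):', coherence_model.get_coherence())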
import pandas as pd

from src.data.prepare_data import read_sample


def load_doc() -> pd.DataFrame:
    return read_sample()
import logging
import os
import pickle

import numpy as np

from src.data.prepare_data import read_sample
from src.features.tokenize import tokenize_classes
# create_classes, create_dictionary and split_data are project helpers whose
# modules are not shown in this excerpt.


def train():
    df = read_sample()
    logging.info('Source data file read successfully')

    document_classes = create_classes(df)
    logging.info('Documents split between the different classes.')

    logging.info('Tokenization started. This may take a while.')
    word_classes = tokenize_classes(document_classes, False)
    logging.info('Tokenization completed for all documents.')

    negative_words = [item for sublist in word_classes['NEG'] for item in sublist]
    positive_words = [item for sublist in word_classes['POS'] for item in sublist]

    dictionary = create_dictionary([negative_words, positive_words])
    logging.info('Dictionary generated from all document words.')

    negative_split = split_data(negative_words)
    positive_split = split_data(positive_words)

    negative_bow = dictionary.doc2bow(negative_split['train'])
    positive_bow = dictionary.doc2bow(positive_split['train'])
    logging.info('Bag-of-words counts done for documents in all classes')

    # Add-one (Laplace) smoothing denominators: token count per class plus the
    # number of distinct words observed in that class.
    total_negative = len(negative_split['train']) + len(negative_bow)
    total_positive = len(positive_split['train']) + len(positive_bow)

    # Class log priors from the relative size of each class.
    negative_prob = np.log(
        len(negative_split['train'])
        / (len(negative_split['train']) + len(positive_split['train'])))
    positive_prob = np.log(
        len(positive_split['train'])
        / (len(negative_split['train']) + len(positive_split['train'])))

    negative_word_probs = {}
    for word_id, count in negative_bow:
        negative_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_negative),
        }
    # Fallback log probability for words never seen in the negative class.
    negative_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_negative)}

    positive_word_probs = {}
    for word_id, count in positive_bow:
        positive_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_positive),
        }
    # Fallback log probability for words never seen in the positive class.
    positive_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_positive)}

    model = {
        'POS_PROB': positive_prob,
        'NEG_PROB': negative_prob,
        'COND_POS_PROBS': positive_word_probs,
        'COND_NEG_PROBS': negative_word_probs,
    }
    logging.info('Log probabilities computed for tokens in all classes')

    base_path = os.path.dirname(os.path.abspath(__file__))
    with open(base_path + "/../../models/model.pkl", "wb") as output_file:
        pickle.dump(model, output_file)
    logging.info('Model saved to artifact model.pkl')
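# A minimal entry point, assuming this file is run as a script. basicConfig
# (reusing the format string seen elsewhere in this repo) is needed for the
# logging.info calls above to be visible on the console.
if __name__ == '__main__':
    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
    train()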