Example #1
def import_dict_and_normalize(name_database, name_collection, n_documents):
    print("Getting body subtitles from the database started ...")
    dbAdapter = dBAdapter.Database(name_database, name_collection)
    dbAdapter.open()
    listado = dbAdapter.selectGenerator_normalize_limit(n_documents)
    dic_subtitles = dbAdapter.selectDic_subtitles_limit(n_documents)
    dbAdapter.close()
    print("finalizada consulta")

    dic_subtitles2 = dic_subtitles

    generator_normalize = []
    for i in range(len(listado)):
        try:
            generator_normalize.append(listado[i].split(","))
        except:
            dic_subtitles2.pop(list(dic_subtitles.keys())[i])
            print("generator NonType------>" + str(i))

    dic_subtitles = dic_subtitles2

    for gn in generator_normalize:
        while True:
            try:
                gn.remove("")
            except ValueError:
                break
    print("Getting body subtitles from the database finished ...")
    n_documents = len(generator_normalize)

    return dic_subtitles, generator_normalize, n_documents
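
A minimal usage sketch of the function above; the 'tfg_project' database and 'tv_storage' collection names are the ones used in the other examples on this page, and the document limit is arbitrary:

# Hypothetical call to import_dict_and_normalize (names and limit are placeholders).
dic_subtitles, generator_normalize, n_documents = import_dict_and_normalize(
    'tfg_project', 'tv_storage', 100)
print(str(n_documents) + " documents loaded")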
Example #2
def update_doc2vec():
    #------------------------------------------------------
    #UPDATE DDBB DOC2VEC
    #------------------------------------------------------
    [files, max_documents] = g.get_NameFiles()
    [dic_subtitles, data] = c.create_d2v_corpus(max_documents)
    subtitles = list(dic_subtitles.keys())
    data_s = [','.join(d) for d in data]
    print("updating the database")
    dbAdapter = dBAdapter.Database('tfg_project', 'tv_storage')
    dbAdapter.open()
    for subtitle, doc in zip(subtitles, data_s):
        dbAdapter.update_doc2vec(subtitle, doc)
    dbAdapter.close()
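
The examples on this page store each token list as a single comma-separated string and split it back when reading. A small, self-contained round trip of that convention, with illustrative values only:

# Round trip of the storage convention: token list -> "a,b,c" string -> token list.
tokens = ["news", "weather", "sports"]
stored = ",".join(tokens)                                 # what update_doc2vec writes
recovered = [t for t in stored.split(",") if t.strip()]   # what the readers rebuild
assert recovered == tokens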
Example #3
def import_doc2vec_list(name_database, name_collection, n_documents):
    dbAdapter = dBAdapter.Database(name_database, name_collection)
    dbAdapter.open()
    print("Getting doc2vec list started...")
    list_s = dbAdapter.select_dataDoc2Vec(n_documents)
    print("Getting doc2vec list finished...")
    dbAdapter.close()
    data = []
    for l in list_s:
        # Split the stored comma-separated string and drop empty/whitespace-only tokens
        data.append([token for token in l.split(",") if token.strip()])

    return data, n_documents
Example #4
def doc2vec_module(database,
                   collection,
                   n_documents=300,
                   vector_size=50,
                   max_clusters=200):

    #logs
    file_logs = config['LOGS']['doc2vec_logs']
    name_log_file = datetime.now().strftime(file_logs + '_%d_%m_%Y.log')
    logging.basicConfig(
        filename=name_log_file,
        level=logging.WARNING,
        format="%(asctime)s:%(filename)s:%(lineno)d:%(levelname)s:%(message)s")

    #end config variables-----------------------------------------------------------------------

    #import from DDBB dic_subtitles and data doc2vec --------------------------
    print("Getting body subtitles from the database started ...")
    dbAdapter = dBAdapter.Database(database, collection)
    dbAdapter.open()
    dic_subtitles = dbAdapter.selectDic_subtitles_limit(n_documents)
    subtitles = list(dic_subtitles.keys())
    list_s = dbAdapter.select_dataDoc2Vec(n_documents)
    dbAdapter.close()
    print("Getting body subtitles from the database finished ...")
    data = []
    for l in list_s:
        # Split the stored comma-separated string and drop empty/whitespace-only tokens
        data.append([token for token in l.split(",") if token.strip()])

    #--------------------------------------------------------------------------

    # Create the tagged document needed for Doc2Vec
    def create_tagged_document(list_of_list_of_words):
        for i, list_of_words in enumerate(list_of_list_of_words):
            yield gensim.models.doc2vec.TaggedDocument(list_of_words,
                                                       [subtitles[i]])

    train_data = list(create_tagged_document(data))

    print("starting with doc2vec....")
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_size,
                                          min_count=2,
                                          epochs=40)

    # Build the Vocabulary
    model.build_vocab(train_data)

    # Train the Doc2Vec model
    model.train(train_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)

    list_vec_doc2vec = [model.docvecs[subtitle] for subtitle in subtitles]

    arr_vec_doc2vec = np.stack(list_vec_doc2vec, axis=0)

    return list_vec_doc2vec, arr_vec_doc2vec, train_data, model
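
A short sketch of querying the returned model for an unseen document, assuming the gensim 3.x API used above (model.docvecs) and a made-up token list:

# Hypothetical query against the model returned by doc2vec_module.
new_tokens = ["noticias", "madrid", "gobierno"]      # made-up token list
new_vector = model.infer_vector(new_tokens)          # vector of length vector_size
# Most similar training documents; the tags are the subtitle names used above.
print(model.docvecs.most_similar([new_vector], topn=5))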
Example #5
"""
@author: cvicentm
"""
"""IN THIS PROGRAM THE CODE DOC2VEC WILL BE EXECUTED"""

from modules.doc2vec import doc2vec as d2v
from modules.classificator import k_means_doc2vec as k
from modules.sql import dBAdapter
import matplotlib.pyplot as plt
import timeit
import numpy as np
import pickle

database = 'tfg_project'
collection = 'tv_storage'
dbAdapter = dBAdapter.Database(database, collection)
dbAdapter.open()
max_documents = dbAdapter.get_maxDocuments()
dbAdapter.close()

max_clusters = 20
n_documents = 200

[list_vec_doc2vec, arr_vec_doc2vec, train_data,
 model] = d2v.doc2vec_module(database,
                             collection,
                             n_documents=n_documents,
                             vector_size=50,
                             max_clusters=max_clusters)

# To find the most similar words with the DM model
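
A minimal sketch of such a word-level query, assuming gensim 3.x and a hypothetical word that must exist in the trained vocabulary:

# Hypothetical word-similarity query on the word vectors learned by the DM model.
word = "madrid"                      # assumed to be in the vocabulary
if word in model.wv.vocab:           # gensim 3.x vocabulary check
    print(model.wv.most_similar(word, topn=10))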
Example #6
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 11:44:45 2020

@author: cvicentm
"""

#------------------------------------------------------
#GET DDBB update channels into database
#------------------------------------------------------
from modules.sql import dBAdapter
dbAdapter = dBAdapter.Database()
dbAdapter.open()
dic_subtitles = dict(dbAdapter.selectAll())
dbAdapter.close()
print("finalizada consulta")

import modules.variables as v
channels = v.CHANNELS

channel_column = [(subtitle, channel)
                  for subtitle in list(dic_subtitles.keys())
                  for channel in channels if subtitle.find(channel) != -1]

dbAdapter = dBAdapter.Database()
dbAdapter.open()
for ch in channel_column:
    dbAdapter.update_channel(ch[0], ch[1])
dbAdapter.close()
print("finalizada consulta")
Example #7
"""
Created on Wed Jul  8 18:25:36 2020

@author: cvicentm
"""

# pip install pymongo
from pymongo import MongoClient
from modules.sql import dBAdapter
import pandas as pd
import json
from tqdm import tqdm
import logging


dbAdapter = dBAdapter.Database('tfg_project', 'tv_storage')
dbAdapter.open()
result = list(dbAdapter.selectDict())
result2 = dbAdapter.get_maxDocuments()
result3 = list(dbAdapter.selectRowByName('antena3_2019 09 14_morning_new'))
result4 = list(dbAdapter.selectDic_subtitles_limit(10))
result5 = list(dbAdapter.select_dataDoc2Vec(40))
dbAdapter.update_doc2vec("1_spa_2019 07 21_morning_new",'hola')
dbAdapter.close()

def mongo_q_doc2vec_to_list(mongo_q_doc2vec):
    result = []
    for mq in mongo_q_doc2vec:
        result.append(mq['doc2vec'])
    return result
list5 = mongo_q_doc2vec_to_list(result5)
Example #8
def max_documents(name_database, name_collection):
    dbAdapter = dBAdapter.Database(name_database, name_collection)
    dbAdapter.open()
    max_documents = dbAdapter.get_maxDocuments()
    dbAdapter.close()
    return max_documents
Example #9
def LDAmodel(n_topics,
             n_documents,
             n_printedDocuments,
             name_database,
             name_collection,
             step=1,
             start=1):
    # TODO: document what each gensim step is for

    #import from DDBB dic_subtitles and generator normalize--------------------
    """
    print("Getting body subtitles from the database started ...")
    dbAdapter= dBAdapter.Database()
    dbAdapter.open()
    dic_subtitles = dict(dbAdapter.selectDic_subtitles_limit(n_documents))
    gn = dbAdapter.selectGenerator_normalize_limit(n_documents)
    generator_normalize = [ast.literal_eval(gni[0]) for gni in gn]
    dbAdapter.close()
    print("Getting body subtitles from the database finished ...")
    """
    print("Getting body subtitles from the database started ...")
    dbAdapter = dBAdapter.Database(name_database, name_collection)
    dbAdapter.open()
    listado = dbAdapter.selectGenerator_normalize_limit(n_documents)
    dic_subtitles = dbAdapter.selectDic_subtitles_limit(n_documents)
    dbAdapter.close()
    print("finalizada consulta")

    dic_subtitles2 = dic_subtitles

    generator_normalize = []
    for i in range(len(listado)):
        try:
            generator_normalize.append(listado[i].split(","))
        except:
            dic_subtitles2.pop(list(dic_subtitles.keys())[i])
            print("generator NonType------>" + str(i))

    dic_subtitles = dic_subtitles2

    for gn in generator_normalize:
        while True:
            try:
                gn.remove("")
            except ValueError:
                break
    print("Getting body subtitles from the database finished ...")
    n_documents = len(generator_normalize)
    #--------------------------------------------------------------------------
    coherencemodelArray = []

    if not os.path.exists('D:\\caleb\\pickle\\' + str(n_documents)):
        os.makedirs('D:\\caleb\\pickle\\' + str(n_documents))
    try:

        id2word = pickle.load(
            open(
                "D:\\caleb\\pickle\\" + str(n_documents) + "\\id2word_" +
                str(n_documents) + ".txt", "rb"))
        corpus = pickle.load(
            open(
                "D:\\caleb\\pickle\\" + str(n_documents) + "\\corpus_" +
                str(n_documents) + ".txt", "rb"))
        print("generator_normalize, id2word and corpus have been imported")

    except IOError:

        print("Proccess of creating corpus and the dictionary has started")
        #this is creating a dictionary with all de different words of the document
        id2word = corpora.Dictionary(generator_normalize)
        file_id2word = "D:\\caleb\\pickle\\" + str(
            n_documents) + "\id2word_" + str(n_documents) + '.txt'
        pickle.dump(id2word, open(file_id2word, 'wb'))
        # Create Corpus: Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in generator_normalize]
        file_corpus = "D:\\caleb\\pickle\\" + str(
            n_documents) + "\corpus_" + str(n_documents) + '.txt'
        pickle.dump(corpus, open(file_corpus, 'wb'))

        print("Proccess of creating corpus and the dictionary has ended")

    for n_topics in chain(range(1, 2), range(2, 18, 2), range(18, 200, 8)):
        file_lda_model = 'D:\\caleb\\pickle\\' + str(
            n_documents) + '\\lda_model_' + str(n_topics) + '_' + str(
                n_documents) + '.sav'
        try:

            f = open(file_lda_model, 'rb')
            lda = pickle.load(f)
            print("The model has been trained previously with..." +
                  str(n_topics) + " n_topics")
            coherencemodel = CoherenceModel(model=lda,
                                            corpus=corpus,
                                            dictionary=id2word,
                                            coherence='u_mass')
            coherencemodel_cv = CoherenceModel(model=lda,
                                               texts=list(generator_normalize),
                                               dictionary=id2word,
                                               coherence='c_v')
            coherencemodel_c_uci = CoherenceModel(
                model=lda,
                texts=list(generator_normalize),
                dictionary=id2word,
                coherence='c_uci')
            file_coherence_cv = 'D:\\caleb\\pickle\\' + str(
                n_documents) + '\\cv_' + str(n_topics) + '_' + str(
                    n_documents) + '.sav'
            pickle.dump(coherencemodel_cv, open(file_coherence_cv, 'wb'))
            file_coherence_c_uci = 'D:\\caleb\\pickle\\' + str(
                n_documents) + '\\c_uci_' + str(n_topics) + '_' + str(
                    n_documents) + '.sav'
            pickle.dump(coherencemodel_c_uci, open(file_coherence_c_uci, 'wb'))
            #CoherenceModel(model=goodLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')
            #coherencemodel = CoherenceModel(model=lda, texts=list(generator_normalize), dictionary=id2word, coherence='c_v')
            coherence_values = coherencemodel.get_coherence()
            coherencemodelArray.append(coherence_values)

        except IOError:

            print("FINALLY: the LDA model has to be trained for " +
                  str(n_documents) + " n_documents and " + str(n_topics) +
                  " n_topics, trained")

            tic_all_processing = timeit.default_timer()
            #function based on : https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#13viewthetopicsinldamodel
            [coherence_values, coherencemodel_cv,
             coherencemodel_c_uci] = training_model(n_documents, n_topics,
                                                    id2word, corpus,
                                                    generator_normalize)
            coherencemodelArray.append(coherence_values)
            toc_all_processing = timeit.default_timer()
            try:
                time_lda_fit = str(
                    datetime.timedelta(seconds=int(
                        float(toc_all_processing - tic_all_processing))))
                print("The process of training lda model with " +
                      str(n_topics) + " n_topics and " + str(n_documents) +
                      " n_documents, has taken " + time_lda_fit + " seconds")
            except AttributeError:
                print("The process of training lda model with " +
                      str(n_topics) + " n_topics and " + str(n_documents) +
                      " n_documents, has ended")

            file_coherence_cv = 'D:\\caleb\\pickle\\' + str(
                n_documents) + '\\cv_' + str(n_topics) + '_' + str(
                    n_documents) + '.sav'
            pickle.dump(coherencemodel_cv, open(file_coherence_cv, 'wb'))
            file_coherence_c_uci = 'D:\\caleb\\pickle\\' + str(
                n_documents) + '\\c_uci_' + str(n_topics) + '_' + str(
                    n_documents) + '.sav'
            pickle.dump(coherencemodel_c_uci, open(file_coherence_c_uci, 'wb'))

    coherencemodelArray = list(coherencemodelArray)
    file_coherence_umass = 'D:\\caleb\\pickle\\coherencemodelarray.sav'
    pickle.dump(coherencemodelArray, open(file_coherence_umass, 'wb'))
    x = list(chain(range(1, 2), range(2, 18, 2), range(18, 200, 8)))
    # x must have the same length as coherencemodelArray
    score = savgol_filter(coherencemodelArray, 11, 3)
    plt.plot(x, score)
    plt.xlabel("N_Topics")
    plt.ylabel("Coherence")
    plt.legend(["coherence_values"], loc='best')
    plt.show()

    best_n_topic = coherencemodelArray.index(min(coherencemodelArray)) + start
    print("el mejor modelo es: " + 'pickle' + str(n_documents) +
          '\lda_model_' + str(best_n_topic) + '_' + str(n_documents) + '.sav')
    f = open(
        'D:\\caleb\\pickle\\' + str(n_documents) + '\lda_model_' +
        str(best_n_topic) + '_' + str(n_documents) + '.sav', 'rb')
    lda = pickle.load(f)
    document_per_topic = list(lda.get_document_topics(corpus))
    """
    corp_cur = corpus[1]
    topic_percs, wordid_topics, wordid_phivalues = lda[corp_cur]
    print(wordid_topics)
    """
    array_topic_per_document = np.zeros(
        (len(document_per_topic), best_n_topic))

    for i in range(len(document_per_topic)):
        for j in range(len(document_per_topic[i])):
            try:
                array_topic_per_document[i][document_per_topic[i][j]
                                            [0]] = document_per_topic[i][j][1]
            except IndexError as index:
                #TODO: this log should also record which subtitle caused the problem and why
                logging.warning(
                    "array_topic_per_document out of range in position n_document: "
                    + str(i) + " and topic: " + str(j) + " \n")
    #NUMBER OF DOCUMENTS to print results on Word

    return array_topic_per_document, best_n_topic, dic_subtitles, lda, generator_normalize, corpus, id2word, coherencemodelArray
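
A short follow-up sketch: reload the selected model from disk and print its topics. The pickle path scheme is the one used inside LDAmodel, print_topics is gensim's standard LdaModel API, and the n_documents/best_n_topic values are placeholders:

import pickle

n_documents, best_n_topic = 200, 10   # placeholder values
path = ('D:\\caleb\\pickle\\' + str(n_documents) + '\\lda_model_' +
        str(best_n_topic) + '_' + str(n_documents) + '.sav')
with open(path, 'rb') as f:
    lda = pickle.load(f)
# print_topics returns (topic_id, "weight*word + ...") pairs
for topic_id, words in lda.print_topics(num_topics=best_n_topic, num_words=8):
    print(topic_id, words)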
Example #10
#summary of topics----------------------------------------------------
"""IN THIS CODE WE WILL EXECUTE THE CODE RELATED TO LDA"""

start_topics = 1
N_TOPICS = 2
step = 2

#this parameter cannot be added by hand
n_printedDocuments = 20
max_clusters = 200

from modules.sql import dBAdapter
name_database = 'tfg_project'
name_collection = 'tv_storage'
dbAdapter = dBAdapter.Database(name_database, name_collection)
dbAdapter.open()
max_documents = dbAdapter.get_maxDocuments()
dbAdapter.close()

#if we want to change the number of documents to analyze we can do it here
n_documents = max_documents

#PROGRAM-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[
    array_topic_per_document, best_n_topic, dic_subtitles, lda,
    generator_normalize, corpus, id2word, coherencemodelArray
] = LDAmodel(N_TOPICS,
             n_documents,
             n_printedDocuments,