示例#1
0
def job(*, current_year=2017, current_month=3):
    """Model the topics of one month of chat history and store them.

    Loads the chat message history for the requested month, cleans the text,
    builds a TF-IDF weighted bag-of-words corpus, trains one LDA model for
    every topic count from 1 to ``NUM_TOPICS``, keeps the model with the
    highest ``c_v`` coherence score, and inserts the words of each of its
    topics (20 words requested per topic) through
    ``repository.insert_into_online_shop``.

    NOTE(review): the defaults mirror the values that used to be hard-coded
    in the body (a ``datetime.now()`` lookup was commented out next to them).
    Confirm whether the scheduler should pass the real date instead.

    Args:
        current_year: Year of the chat history to process.
        current_month: Month of the chat history to process.

    Returns:
        None. Nothing is stored when there is no chat history for the period
        or when no model yields a usable coherence score.
    """
    import math  # local import: only needed for the NaN check below

    # NOTE(review): an ``is_last_month(current_year, current_month)`` guard
    # was commented out at this point; re-enable it if it is still wanted.
    message_history_list = Repository.get_chat_message_history(
        month=current_month, year=current_year)

    # Guard clause: without chat history there is nothing to model.
    if not message_history_list:
        return

    # The merchant name is taken from the first history record.
    merchant_name = message_history_list[0].name

    # cleaning chat text
    results = preprocessing.cleaning(message_history_list)
    logger.info(f'Preprocessing result {len(results)} items')

    # build documents: whitespace-tokenise, then let the phrase step rewrite
    # the token lists before the vocabulary is built
    documents = [result.content.split() for result in results]
    documents = Preprocessing.identify_phrase(documents)
    dictionary = Dictionary(documents)
    logger.info(f'Preprocessing unique tokens: {len(dictionary)}')

    # build bag of words
    bow_corpus = [dictionary.doc2bow(document) for document in documents]

    # calculate tf idf
    tf_idf = TfidfModel(bow_corpus)
    corpus_tf_idf = tf_idf[bow_corpus]

    # Find the model with the highest coherence score. Only the current best
    # (model, score) pair is kept: keying a dict by the float score would let
    # equal scores overwrite each other and would make max() order-dependent
    # if a score were NaN (NaN compares false against everything).
    best_model = None
    best_score = None
    for num_topics in range(1, NUM_TOPICS + 1):
        candidate = LdaModel(corpus=corpus_tf_idf,
                             num_topics=num_topics,
                             id2word=dictionary)

        coherence_score = CoherenceModel(model=candidate,
                                         texts=documents,
                                         corpus=corpus_tf_idf,
                                         coherence='c_v').get_coherence()
        logger.info(f'Coherence score: {coherence_score}')

        if math.isnan(coherence_score):
            # A NaN score cannot be ranked; never select such a model.
            continue
        # ``>=`` keeps the later model on ties, as the dict overwrite did.
        if best_score is None or coherence_score >= best_score:
            best_model, best_score = candidate, coherence_score

    if best_model is None:
        # Either NUM_TOPICS < 1 or every score was NaN.
        logger.warning('No LDA model produced a usable coherence score; '
                       'nothing was saved')
        return

    # save into DB: one row per (topic, word) pair of the best model
    for cluster, topic_terms in best_model.show_topics(-1,
                                                       num_words=20,
                                                       formatted=False):
        topic_cluster = cluster + 1  # stored cluster ids are 1-based
        for word, score in topic_terms:
            logger.info(f'Topic Cluster: {topic_cluster}, '
                        f'Word: {word}, '
                        f'Score: {score}, '
                        f'Merchant: {merchant_name}, '
                        f'Year: {current_year}, '
                        f'Month: {current_month}')
            repository.insert_into_online_shop(topic_cluster=topic_cluster,
                                               word=word,
                                               score=score,
                                               merchant_name=merchant_name,
                                               year=current_year,
                                               month=current_month)