Example #1
import os
import pathlib

import pclean  # project-local text-cleansing module (assumed importable as-is)
import pplsa   # project-local PLSA trainer (assumed importable as-is)


def test_preprocessing():

    root_folder = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/plsa/test/'

    pclean.file_parts_number = 10
    pplsa.file_parts_number = 10
    pclean.file_dict = root_folder + 'dict/test_dict'
    pclean.source_texts = root_folder + 'extracted/*.txt'
    pclean.output_dir = root_folder + 'cleaned/'


    # Clean the data and turn it into a bag-of-words model
    pclean.pre_pro()

    # Train using PLSA
    pplsa.folder = pclean.output_dir[:-1]
    pplsa.dict_path = pclean.file_dict
    pplsa.main()
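
For reference, the directory layout this test assumes under appData/plsa/test/, inferred from the paths above (a sketch, not confirmed by the source):

# appData/plsa/test/
# ├── extracted/      raw *.txt inputs consumed by pclean.pre_pro()
# ├── cleaned/        bag-of-words output of pclean, read back by pplsa.main()
# └── dict/test_dict  stem dictionary shared between pclean and pplsa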
Example #2
    def generate_topics(self):

        start_time_1 = time.time()

        pplsa.file_parts_number = 10
        pclean.file_parts_number = 10
        pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
        pclean.source_texts = self.source_texts + self.unique_folder_naming + '*.txt'
        pclean.output_dir = self.output_dir + self.unique_folder_naming

        os.mkdir(pclean.output_dir)


        # Clean the data and turn it into a bag-of-words model
        pclean.pre_pro()

        # Train using PLSA
        pplsa.topic_divider = 0
        pplsa.num_topics = 2
        pplsa.folder = pclean.output_dir[:-1]
        pplsa.dict_path = pclean.file_dict
        pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming
        pplsa.PATH = pplsa.PLSA_PARAMETERS_PATH + 'topic-by-doc-matirx'
        pplsa.PATH_word_by_topic_conditional = pplsa.PLSA_PARAMETERS_PATH + 'word_by_topic_conditional'
        pplsa.logL_pic = pplsa.PLSA_PARAMETERS_PATH + 'logL.png'
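        # The three paths above configure where pplsa.main() saves its
        # outputs: the topic-by-document matrix, the word-by-topic
        # conditionals, and a log-likelihood plot (inferred from the names).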

        # Folder paths to delete
        self.PLSA_PARAMETERS_PATH = pplsa.PLSA_PARAMETERS_PATH
        self.output_dir_stream = pclean.output_dir
        self.file_dict_stream = pclean.file_dict

        os.mkdir(pplsa.PLSA_PARAMETERS_PATH)

        pplsa.main()

        end_time_1 = time.time()

        print('Total training time (minutes):',
              round((end_time_1 - start_time_1) / 60, 4))
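
The attributes saved under "# Folder paths to delete" point at a later cleanup step; a minimal sketch of such a method, assuming only the attributes set above (the name delete_stream_files is hypothetical):

    # Hypothetical cleanup method on the same class; assumes
    # `import os, shutil` at module level.
    def delete_stream_files(self):
        shutil.rmtree(self.PLSA_PARAMETERS_PATH, ignore_errors=True)  # PLSA outputs
        shutil.rmtree(self.output_dir_stream, ignore_errors=True)     # cleaned texts
        if os.path.exists(self.file_dict_stream):                     # stem dictionary file
            os.remove(self.file_dict_stream)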
Example #3
    def generate_topics_gensim(self,
                               num_topics,
                               passes,
                               chunksize,
                               update_every=0,
                               alpha='auto',
                               eta='auto',
                               decay=0.5,
                               offset=1.0,
                               eval_every=1,
                               iterations=50,
                               gamma_threshold=0.001,
                               minimum_probability=0.01,
                               random_state=None,
                               minimum_phi_value=0.01,
                               per_word_topics=True,
                               callbacks=None):

        start_time_1 = time.time()

        pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
        pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json'
        pclean.output_dir = self.output_dir + self.unique_folder_naming

        os.mkdir(pclean.output_dir)

        # Clean the data and turn it into a bag-of-words model

        with open(
                self.lda_parameters_path + self.unique_folder_naming +
                'status.txt', 'w') as f:
            f.write('Preprocessing started.')

        pclean.pre_pro()

        with open(
                self.lda_parameters_path + self.unique_folder_naming +
                'status.txt', 'w') as f:
            f.write('Preprocessing finished. Topic analysis started.')

        with open(pclean.output_dir + 'cleaned.json', "r") as read_file:
            ret = json.load(read_file)

        data_lemmatized = []

        for k in ret:
            data_lemmatized.append(ret[k].splitlines())
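
        # Note (assumption): cleaned.json maps each document key to
        # newline-separated tokens, so each data_lemmatized entry is one
        # document's list of processed words.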

        # Create Dictionary
        id2word = corpora.Dictionary(data_lemmatized)

        # Create Corpus
        texts = data_lemmatized

        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]
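        # Each corpus entry is a bag-of-words vector of (token_id, count)
        # pairs, e.g. [(0, 2), (5, 1), ...]; id2word maps ids back to words.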

        # View
        # print(corpus[0:1])
        # print(id2word[1])

        self.lda_model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            random_state=random_state,
            update_every=update_every,
            chunksize=chunksize,
            passes=passes,
            alpha=alpha,
            eta=eta,
            per_word_topics=per_word_topics,
            decay=decay,
            offset=offset,
            eval_every=eval_every,
            iterations=iterations,
            gamma_threshold=gamma_threshold,
            minimum_probability=minimum_probability,
            minimum_phi_value=minimum_phi_value,
            callbacks=callbacks)

        port_dict = porter_dictionary.porter_dictionary()

        topics = self.lda_model.show_topics(num_topics=num_topics,
                                            num_words=300,
                                            formatted=False)

        extracted_topics = []

        for topic in topics:
            a_topic = []
            for item in topic[1]:
                a_topic.append(item[0])
            extracted_topics.append(a_topic)
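
        # extracted_topics now holds the top-300 stemmed words per topic;
        # the probabilities returned by show_topics are discarded here.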

        port_dict.load_dict(self.dict_path + self.unique_folder_naming[:-1] +
                            '_dict')

        self.topics_destemmed = []

        for i in extracted_topics:
            destemmed = []
            for j in i:
                try:
                    # Map each stem back to its first recorded original word.
                    destemmed.append(port_dict.dictionary[j][0])
                except (KeyError, IndexError):
                    logging.exception('No destemmed form for token: %s', j)
            self.topics_destemmed.append(destemmed)
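A minimal usage sketch for the method above, assuming an instance tm of the surrounding class (the class itself is not shown here, and the parameter values are illustrative only):

# Hypothetical driver code.
tm.generate_topics_gensim(num_topics=2, passes=10, chunksize=100)

# Print the first ten destemmed words of each discovered topic.
for n, words in enumerate(tm.topics_destemmed):
    print('Topic', n, ':', ', '.join(words[:10]))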
Example #4
    def generate_topics_json(self):

        start_time_1 = time.time()

        pplsa.file_parts_number = 10
        pclean.file_parts_number = 10
        pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
        pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json'
        pclean.output_dir = self.output_dir + self.unique_folder_naming

        os.mkdir(pclean.output_dir)

        # Clean the data and turn it into a bag-of-words model

        with open(
                self.plsa_parameters_path + self.unique_folder_naming +
                'status.txt', 'w') as f:
            f.write('Preprocessing started.')

        pclean.pre_pro()

        with open(
                self.plsa_parameters_path + self.unique_folder_naming +
                'status.txt', 'w') as f:
            f.write('Preprocessing finished. Topic analysis started.')

        # Train using PLSA
        pplsa.topic_divider = self.topic_divider
        pplsa.num_topics = self.num_topics
        pplsa.maxiter2 = self.max_iter
        pplsa.beta = self.beta
        pplsa.folder = pclean.output_dir[:-1]
        pplsa.dict_path = pclean.file_dict
        pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming
        pplsa.PATH = pplsa.PLSA_PARAMETERS_PATH + 'topic-by-doc-matirx'
        pplsa.PATH_word_by_topic_conditional = pplsa.PLSA_PARAMETERS_PATH + 'word_by_topic_conditional'
        pplsa.logL_pic = pplsa.PLSA_PARAMETERS_PATH + 'logL.png'

        # Folder paths to delete
        self.PLSA_PARAMETERS_PATH = pplsa.PLSA_PARAMETERS_PATH
        self.output_dir_stream = pclean.output_dir
        self.file_dict_stream = pclean.file_dict

        try:
            os.mkdir(pplsa.PLSA_PARAMETERS_PATH)
        except FileExistsError:
            print(
                '-----------------------Folder exists-------------------------'
            )

        pplsa.main()

        end_time_1 = time.time()

        print('Total training time (minutes):',
              round((end_time_1 - start_time_1) / 60, 4))

        with open(
                self.plsa_parameters_path + self.unique_folder_naming +
                'status.txt', 'w') as f:
            f.write('Topic analysis finished.\n')
            f.write(str(round((end_time_1 - start_time_1) / 60, 4)))
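
Examples #3 and #4 report progress by rewriting a status.txt file in the run's parameters folder; a minimal polling helper for the consuming side, assuming that layout (hypothetical, not part of the source):

import os
import time


def wait_for_status(status_path, target='Topic analysis finished.', poll_seconds=5):
    # Re-read status.txt until the target message appears; the writers above
    # replace the whole file at each stage, so simple polling suffices.
    while True:
        if os.path.exists(status_path):
            with open(status_path) as f:
                if target in f.read():
                    return
        time.sleep(poll_seconds)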