def get_topic_extraction(self, passed_args):
    """Run topic extraction for every requested model type and language,
    then plot the resulting metrics for each explored parameter."""
    passed_args, data_args, dictionary_args, model_args, version_args, \
        request_args = ExeParams.get_parameters(passed_args)
    Advisor.set_data_folder_path(data_args[DataParams.data_folder_path])
    processed_data = self._get_processed_data(data_args, version_args)
    all_models_args_values = ModelParams.get_possible_model_params_values()
    for requested_model_type in request_args[RequestParams.requested_models]:
        for lang in processed_data:
            if lang not in request_args[RequestParams.requested_langs]:
                continue
            # ModelParams.set_model_params_value(model_args)
            logging.info("- Get topic extraction for '%s' language" % lang)
            for test_param in all_models_args_values:
                metrics = self._get_requested_models_metrics(
                    requested_model_type, lang, processed_data[lang],
                    dictionary_args, all_models_args_values[test_param],
                    version_args, request_args, test_param)
                metrics = self._make_param_metrics_ready_to_plot(metrics)
                self.comparative_view.plot_metrics(
                    metrics, test_param, lang, requested_model_type,
                    version_args[VersionifyParams.data_version],
                    version_args[VersionifyParams.dictionary_version],
                    version_args[VersionifyParams.model_version])
    return
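# A minimal sketch (not from the source) of the shapes the loops above
# iterate over: `processed_data` maps a language code to that language's
# documents (token lists, as the gensim dictionary/corpus builders further
# down expect), and `request_args` narrows which model types and languages
# actually run. The keys here are illustrative stand-ins for the
# RequestParams constants.
example_processed_data = {"en": [["topic", "model"], ["another", "doc"]]}
example_request_args = {"requested_models": ["LDA"],
                        "requested_langs": ["en"]}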
def get_train_data_ready_to_work(
        self,
        data_file_name: str,
        data_file_extension: str,
        data_file_type: str,
) -> dict:
    """
    :param data_file_name: data file name ->
        {data_file_name}-train.{data_file_extension}
    :param data_file_extension: data file extension
    :param data_file_type: only three types are supported:
        ["CommonCrawl", "Json", "SemiJson"]
        (path-to-data-folder/(data-file-name)-(train/test).data-extension)
    :return: dict keyed by language; each value is a list of that
        language's documents, where each document is a single string
        holding all of a web page's text
    """
    data_file_path = Advisor.get_data_folders_file_path(
        data_file_name, data_file_extension)
    ready_to_train_data = dict()
    logging.info("--- Get data file content")
    text_data = self._get_raw_data_from_path_file(data_file_type,
                                                  data_file_path)
    logging.info("--- Getting each page's text and language")
    for page in text_data:
        page = BeautifulSoup(page, features="html.parser")
        text, lang = self._get_text_data_from_page(page)
        if lang not in ready_to_train_data:
            ready_to_train_data[lang] = list()
        ready_to_train_data[lang].append(text)
    logging.info("--- Got each page's text and language")
    logging.info("-- Data is ready for further processing")
    return ready_to_train_data
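# A self-contained sketch of the per-page step above. The source hides
# language detection behind the private _get_text_data_from_page helper;
# langdetect here is an assumption, and any detector with this shape works.
from bs4 import BeautifulSoup
from langdetect import detect


def extract_text_and_lang(raw_html: str):
    # Flatten the page to its visible text, then guess its language.
    soup = BeautifulSoup(raw_html, features="html.parser")
    text = soup.get_text(separator=" ", strip=True)
    return text, detect(text)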
@staticmethod
def __read_data_meta_file():
    meta_file_path = Advisor.get_data_file_meta_path()
    if path.exists(meta_file_path):
        with open(meta_file_path, "r") as json_file:
            meta = json.load(json_file)
        return meta
    return None
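# Expected shape of the meta file read above, as written by
# __write_meta_data further down (values are illustrative):
# {"languages": ["en", "de"], "lang_length": {"en": 120, "de": 80}}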
def write_model_evaluation_metrics(self, lang: str, data_version,
                                   dictionary_version, model_version,
                                   param_name: str, param_version: int,
                                   metrics: dict, model_parameters: dict):
    model_evaluation_file_path = Advisor.get_model_type_folders_file_path(
        lang, data_version, dictionary_version, model_version, param_name,
        param_version, self.model_type, "evaluation")
    model_version_file_path = Advisor.get_model_type_folders_file_path(
        lang, data_version, dictionary_version, model_version, param_name,
        param_version, self.model_type, "meta.json")
    with open(model_evaluation_file_path, "w") as json_file:
        json.dump(metrics, json_file, indent=4)
    with open(model_version_file_path, "w") as json_file:
        json.dump(model_parameters, json_file, indent=4)
    return
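# Illustrative call; names and values are hypothetical. `metrics` is
# whatever the evaluation step produced (e.g. coherence scores) and
# `model_parameters` echoes the training configuration:
# writer.write_model_evaluation_metrics(
#     lang="en", data_version=1, dictionary_version=1.0,
#     model_version="a", param_name="num_topics", param_version=1,
#     metrics={"c_v": 0.52, "u_mass": -1.30},
#     model_parameters={"num_topics": 10, "passes": 5})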
def get_topics_words_and_their_tfidf_of_them_over_docs(
        self, model, processed_data, model_type):
    topics = model.show_topics(formatted=False, num_topics=-1,
                               num_words=30)
    words_tfidf = self._get_words_tfidf(topics, processed_data)
    max_doc = len(processed_data)
    document_id_range = range(0, max_doc + 1, int(max_doc / 10)) \
        if max_doc > 10 else range(0, max_doc + 1)
    for topic_id, topic in topics:
        file_path = Advisor.get_topic_folders_file_path_from_topic_version(
            self.topic_version_path, model_type, topic_id,
            "Words TFiDF", "png")
        words_no = len(topic)
        # fig, axes = pyplot.subplots(words_no, 1, figsize=(2 * 2.2, min([(len(processed_data) + 1) * 5.0, 2 ** 16])))
        fig, axes = pyplot.subplots(words_no, 1)
        fig.suptitle("TFiDF of Topic '%d' over Docs" % topic_id)
        if words_no > 1:
            axes = axes.flatten()
        else:
            axes = [axes]
        ax_no = 0
        for word, _ in topic:
            ax = axes[ax_no]
            ax_no += 1
            data = list()
            word_data = words_tfidf[word]
            for doc_id in word_data:
                data.append([word, doc_id, word_data[doc_id]])
            data = DataFrame(data,
                             columns=["Word", "Document_No", "TFiDF"])
            ax.plot('Document_No', 'TFiDF', data=data, marker='o',
                    color=self._get_random_color(),
                    label='Word : %s' % word, dashes=[6, 2])
            ax.legend()
            ax.set_ylabel("TFiDF of Words")
            ax.set_xlabel("Documents")
            ax.set_title("Word %s" % word)
            ax.set_xticks(document_id_range)
            ax.set_xticklabels(
                ['Document %d' % document
                 for document in document_id_range],
                rotation=30, horizontalalignment='right', fontsize=8)
            ax.grid(True)
        fig.savefig(file_path)
        pyplot.close(fig)
    return
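# Shape returned by gensim's show_topics(formatted=False), which the loop
# above unpacks: a list of (topic_id, [(word, weight), ...]) pairs, e.g.
# [(0, [("model", 0.041), ("topic", 0.038)]), (1, [("doc", 0.035)])]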
@staticmethod
def __write_processed_data(data_file_name: str, processed_data: dict,
                           version: int):
    """
    :param data_file_name: processed data's file name
    :param processed_data: the data that we want to write out
    :param version: the version of the data
    """
    for lang in processed_data:
        processed_data_file_path = \
            Advisor.get_data_version_folders_file_path(
                lang, version, data_file_name, "json")
        with open(processed_data_file_path, 'w') as json_file:
            json.dump(processed_data[lang], json_file)
    return
@staticmethod
def __write_meta_data(language_list: list, tags: dict, data_version: int,
                      lang_length: dict):
    data_file_meta_path = Advisor.get_data_file_meta_path()
    data_file_meta_content = {"languages": language_list,
                              "lang_length": lang_length}
    data_process_version_meta_content = {
        "tags": tags,
        "token_validation": {
            "must": ["alpha"],
            "must_not": ["stop_word", "space", "bracket", "currency",
                         "url", "email", "number", "verb"]}}
    with open(data_file_meta_path, "w") as json_file:
        json.dump(data_file_meta_content, json_file, indent=4)
    for lang in language_list:
        data_version_meta_path = \
            Advisor.get_data_version_folders_file_path(
                lang, data_version, "data-process-meta", "json")
        with open(data_version_meta_path, "w") as json_file:
            json.dump(data_process_version_meta_content, json_file,
                      indent=4)
    return
def _contribution_of_dominate_topics_in_docs(
        self, contribution_of_dominate_topic_in_docs: dict,
        model_type: str, nth_topic: int, doc_no: int):
    file_path = Advisor.get_visualization_file_path_from_topic_version(
        self.topic_version_path, model_type,
        "Contribution_of_%d_dominate_topics_in_docs" % nth_topic, "png")
    plot_no = len(contribution_of_dominate_topic_in_docs)
    document_id_range = range(0, doc_no + 1, int(doc_no / 10)) \
        if doc_no > 10 else range(0, doc_no + 1, 1)
    fig, axes = pyplot.subplots(plot_no, 1,
                                figsize=(2 * 2.0, (plot_no + 1) * 5.0))
    if plot_no > 1:
        axes = axes.flatten()
    else:
        axes = [axes]
    ax_no = 0
    fig.suptitle("Contribution of '%d'th Dominant Topics in Docs"
                 % nth_topic)
    for topic_id in contribution_of_dominate_topic_in_docs:
        ax = axes[ax_no]
        ax_no += 1
        label = "%d" % topic_id
        data = DataFrame(
            contribution_of_dominate_topic_in_docs[topic_id],
            columns=["Document_No", "Percentage_of_Contribution"])
        ax.plot('Document_No', 'Percentage_of_Contribution', data=data,
                marker='o', color=self._get_random_color(), label=label,
                dashes=[6, 2])
        ax.legend()
        ax.set_ylabel("Contribution of Topic")
        ax.set_title("Topic %d" % topic_id)
        ax.set_xticks(document_id_range)
        ax.set_xticklabels(
            ['Document %d' % document for document in document_id_range],
            rotation=30, horizontalalignment='right', fontsize=8)
        ax.grid(True)
    fig.savefig(file_path)
    pyplot.close(fig)
    return
def _contribution_of_each_dominate_topic_in_docs(
        self, contribution_of_dominate_topic_in_docs: dict, doc_no,
        model_type: str, nth_topic: int):
    document_id_range = range(0, doc_no + 1, int(doc_no / 10)) \
        if doc_no > 10 else range(0, doc_no + 1, 1)
    for topic_id in contribution_of_dominate_topic_in_docs:
        file_path = Advisor.get_topic_folders_file_path_from_topic_version(
            self.topic_version_path, model_type, int(topic_id),
            "Contribution_of_'%d'th_dominate_topic_in_docs" % nth_topic,
            "png")
        fig, ax = pyplot.subplots()
        data = DataFrame(
            contribution_of_dominate_topic_in_docs[topic_id],
            columns=["Document_No", "Percentage_of_Contribution"])
        ax.plot('Document_No', 'Percentage_of_Contribution', data=data,
                marker='o', color=self._get_random_color(), dashes=[6, 2])
        ax.set_ylabel("Contribution")
        ax.set_xlabel("Documents")
        ax.set_title("Contribution of Topic %d Over Documents" % topic_id)
        ax.set_xticks(document_id_range)
        ax.set_xticklabels(
            ['Document %d' % document for document in document_id_range],
            rotation=45, horizontalalignment='right', fontsize=8)
        contributions = list(data.Percentage_of_Contribution)
        d_mean = round(mean(contributions))
        d_median = round(median(contributions))
        d_std = round(std(contributions))
        d_one_percent = round(quantile(contributions, q=0.01))
        d_ninety_nine_percent = round(quantile(contributions, q=0.99))
        d_text = ("Mean : {}\nMedian : {}\nStdev: {}\n"
                  "1%ile : {}\n99%ile : {}").format(
            d_mean, d_median, d_std, d_one_percent,
            d_ninety_nine_percent)
        ax.text(0.9, 0.98, d_text, transform=ax.transAxes,
                bbox=dict(fc="none"), color='purple')
        ax.grid(True)
        fig.savefig(file_path)
        pyplot.close(fig)
    return
def _plot_number_of_docs_in_each_dominate_topic(
        self, number_of_docs_in_each_dominate_topic: list, model_type,
        nth_topic):
    file_path = Advisor.get_visualization_file_path_from_topic_version(
        self.topic_version_path, model_type,
        "Number_of_docs_in_each_'%d'th_dominate_topic" % nth_topic,
        "png")
    divide_no = 10
    number_of_docs_in_each_dominate_topic = DataFrame(
        number_of_docs_in_each_dominate_topic,
        columns=["Topic_id", "No_of_Docs"])
    number_of_docs_in_each_dominate_topic = \
        number_of_docs_in_each_dominate_topic.sort_values("No_of_Docs")
    topic_no = len(number_of_docs_in_each_dominate_topic)
    plot_no = int(topic_no / divide_no) if topic_no % divide_no == 0 \
        else int(topic_no / divide_no) + 1
    fig, axes = pyplot.subplots(plot_no, 1)
    fig.set_size_inches(divide_no * 2.0, plot_no * 8.0)
    if plot_no > 1:
        axes = axes.flatten()
    else:
        axes = [axes]
    start = 0
    for i in range(plot_no):
        ax = axes[i]
        plot_data = number_of_docs_in_each_dominate_topic[
            start:start + divide_no]
        ax.bar(x='Topic_id', height="No_of_Docs", data=plot_data,
               width=0.5, alpha=0.3)
        dominate_topic_range = plot_data.Topic_id.to_list()
        ax.set_ylabel('Document Count Percentage')
        ax.set_xlabel('Topics')
        ax.set_xticks(dominate_topic_range)
        ax.set_xticklabels(
            ['Topic %d' % topic for topic in dominate_topic_range],
            rotation=30, horizontalalignment='right', fontsize=8)
        self._set_bar_plot_text(ax)
        start += divide_no
    # Set the title before saving so it actually appears in the file.
    fig.suptitle('Number of Documents in Percentage for each Topic',
                 fontsize=14)
    fig.savefig(file_path)
    pyplot.close(fig)
    return
def __init__(self, lang: str, data_version: int,
             dictionary_version: float, model_version: str,
             param_name: str, param_version: int,
             number_of_decimal_digits: int = 5, max_colwidth: int = 100):
    self.topic_version_path = Advisor.get_param_version_folder_path(
        lang, data_version, dictionary_version, model_version,
        param_name, param_version)
    self.number_of_decimal_digits = number_of_decimal_digits
    options.display.max_colwidth = max_colwidth
    return
def get_dictionary(self, lang, data_version, dictionary_version,
                   no_above, no_below, n_most_frequent,
                   language_processed_data: list):
    logging.info("--- Getting dictionary")
    if self.dictionary is None:
        dictionary_file_path = \
            Advisor.get_dictionary_version_folder_file_path(
                lang, data_version, dictionary_version,
                self.file_types[0][0], self.file_types[0][1])
        if path.exists(dictionary_file_path):
            logging.info("---- Dictionary was created before")
            self.dictionary = Dictionary.load(dictionary_file_path)
        else:
            self.set_dictionary(language_processed_data, no_below,
                                no_above, n_most_frequent,
                                dictionary_file_path)
    logging.info("--- Dictionary captured")
    return
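# A minimal sketch of what set_dictionary presumably does with the three
# pruning knobs above, using gensim's public API (the helper itself is
# not shown in the source; the path and values are hypothetical):
from gensim.corpora import Dictionary

docs = [["topic", "model", "demo"], ["another", "demo", "doc"]]
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=1, no_above=0.9)  # document-frequency cut-offs
dictionary.filter_n_most_frequent(0)  # drop the n most frequent tokens
dictionary.save("example.dict")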
def get_corpus(self, lang, data_version, dictionary_version,
               language_processed_data: list = None):
    logging.info("--- Getting corpus")
    if self.corpus is None:
        corpus_file_path = \
            Advisor.get_dictionary_version_folder_file_path(
                lang, data_version, dictionary_version,
                self.file_types[1][0], self.file_types[1][1])
        if path.exists(corpus_file_path):
            logging.info("---- Corpus was created before")
            self.corpus = list(MmCorpus(corpus_file_path))
        else:
            self.set_corpus(language_processed_data, corpus_file_path)
    logging.info("--- Corpus captured")
    return
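# Continuing the sketch above for set_corpus: a bag-of-words corpus is
# built from the dictionary and persisted in Matrix Market format, which
# is what the MmCorpus load path above expects (path is hypothetical):
from gensim.corpora import MmCorpus

corpus = [dictionary.doc2bow(doc) for doc in docs]
MmCorpus.serialize("example.mm", corpus)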
def get_model(self, lang, data_version: int, dictionary_version: float,
              model_version: str, param_name: str, param_version: int,
              language_processed_data: list, model_view: bool):
    logging.info("--- Getting LDA model")
    if self.model is None:
        model_file_path = Advisor.get_model_type_folders_file_path(
            lang, data_version, dictionary_version, model_version,
            param_name, param_version, self.model_type, "LDA-model")
        if path.exists(model_file_path):
            logging.info("---- LDA model was created before")
            self.model = LdaModel.load(model_file_path)
        else:
            self.set_model(lang, data_version, dictionary_version,
                           model_version, param_name, param_version,
                           model_file_path, language_processed_data)
    logging.info("--- LDA model captured")
    if model_view:
        self.visualization.get_model_visualizations(
            self.model_type, self.model, self.essentials.corpus,
            language_processed_data)
    return self.model
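# Training and round-tripping the model itself, as set_model and the
# LdaModel.load call above imply (num_topics and the path are
# illustrative values, not taken from the source):
from gensim.models import LdaModel

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
lda.save("example-LDA-model")
lda = LdaModel.load("example-LDA-model")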
@staticmethod
def __read_processed_data(data_file_name: str, version: int):
    """
    :param data_file_name: processed data's file name
    :param version: the version of the data we want to read
    :return: the processed data (as a dict) that was written before,
        or None if any part of it is missing
    """
    meta = TextPreprocessor.__read_data_meta_file()
    if meta is None:
        return None
    processed_data = dict()
    for lang in meta["languages"]:
        processed_data_file_name = \
            Advisor.get_data_version_folders_file_path(
                lang, version, data_file_name, "json")
        if not path.exists(processed_data_file_name):
            return None
        with open(processed_data_file_name) as json_file:
            processed_data[lang] = json.load(json_file)
    return processed_data
@classmethod
def plot_metrics(cls, metrics: dict, explore_param: str, lang,
                 model_name, data_version, dict_version, model_version):
    plot_no = len(metrics)
    fig, axes = pyplot.subplots(plot_no, 1,
                                figsize=(2 * 8.0, (plot_no + 1) * 5.0))
    if plot_no > 1:
        axes = axes.flatten()
    else:
        axes = [axes]
    for i, metric in enumerate(metrics):
        data = DataFrame(metrics[metric], columns=[explore_param, metric])
        ax = axes[i]
        ax.plot(data[explore_param], data[metric], marker='o')
        ax.set_title("Coherence of %s" % metric)
        ax.set_xlabel(explore_param)
        ax.set_xticks(data[explore_param])
    file_name = "%s-%s-%s" % (model_name, explore_param, model_version)
    fig_file_name = Advisor.get_model_version_folders_file_path(
        lang, data_version, dict_version, model_version, file_name,
        "png")
    fig.savefig(fig_file_name)
    pyplot.close(fig=fig)
    return
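# One plausible way (an assumption, not shown in the source) to build a
# metrics dict like the one plotted above, continuing the earlier
# sketches: one [param_value, score] row per explored value, scored with
# gensim's CoherenceModel.
from gensim.models import CoherenceModel

metrics = {"c_v": []}
for num_topics in (5, 10, 15):
    lda = LdaModel(corpus=corpus, id2word=dictionary,
                   num_topics=num_topics)
    score = CoherenceModel(model=lda, texts=docs, dictionary=dictionary,
                           coherence="c_v").get_coherence()
    metrics["c_v"].append([num_topics, score])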
def word_cloud_of_top_n_words_in_each_topic(self, top_n: int, lda_model,
                                            model_type):
    cloud = WordCloud(background_color='white', max_words=top_n,
                      max_font_size=5 * top_n, prefer_horizontal=1.0,
                      font_step=5)
    topics = lda_model.show_topics(formatted=False, num_topics=-1,
                                   num_words=30)
    for topic in topics:
        topic_words = dict(topic[1])
        try:
            data_file_path = \
                Advisor.get_topic_folders_file_path_from_topic_version(
                    self.topic_version_path, model_type, topic[0],
                    "WordCloud", 'png')
            cloud.generate_from_frequencies(topic_words,
                                            max_font_size=10 * top_n)
            cloud.to_file(data_file_path.format(topic[0]))
        except OSError as err:
            logging.error(err)
    return
def topic_words_and_its_joint(self, model, model_type, processed_data):
    topics = model.show_topics(formatted=False, num_topics=-1,
                               num_words=30)
    all_topics_words = self._get_topic_words_weight_and_words_counter(
        model, processed_data)
    for topic in topics:
        topic_and_its_words_in_whole_data = dict()
        for word, weight in topic[1]:
            group = all_topics_words.loc[
                all_topics_words.word == word].groupby("topic_id")
            for topic_id, out_line in group:
                topic_id = int(topic_id)
                if topic_and_its_words_in_whole_data.get(topic_id,
                                                         None) is None:
                    topic_and_its_words_in_whole_data[topic_id] = list()
                topic_and_its_words_in_whole_data[topic_id].append([
                    word, out_line.importance.values[0],
                    out_line.word_count.values[0]
                ])
        sub_plot_number = len(topic_and_its_words_in_whole_data)
        fig, axes = pyplot.subplots(
            sub_plot_number + 1, 1,
            figsize=(2 * 8.0, (sub_plot_number + 1) * 5.0))
        if len(topic_and_its_words_in_whole_data) > 1:
            axes = axes.flatten()
        word_count_ax = axes[0]
        topic_word_count = DataFrame(
            topic_and_its_words_in_whole_data[int(topic[0])],
            columns=['word', 'word_importance', 'word_count'])
        word_count_ax.bar(x='word', height="word_count",
                          data=topic_word_count, width=0.5, alpha=0.3)
        self._set_bar_plot_text(word_count_ax)
        word_count_ax.set_ylabel('Word Count')
        word_count_ax.set_title('Topic: %d Word Count' % topic[0],
                                fontsize=12)
        word_count_ax.set_xticklabels(topic_word_count.word, rotation=30,
                                      horizontalalignment='right',
                                      fontsize=8)
        ax_n = 1
        for i in topic_and_its_words_in_whole_data:
            tdf = DataFrame(
                topic_and_its_words_in_whole_data[i],
                columns=['word', 'word_importance', 'word_count'])
            topic_word_ax = axes[ax_n]
            topic_word_ax.bar(x='word', height="word_importance",
                              data=tdf, width=0.2)
            topic_word_ax.set_title('Topic: %d Word Weight' % i,
                                    fontsize=12)
            topic_word_ax.set_xticklabels(tdf.word, rotation=30,
                                          horizontalalignment='right',
                                          fontsize=8)
            self._set_bar_plot_text(topic_word_ax)
            ax_n += 1
        # fig.tight_layout(w_pad=2)
        fig.suptitle('Word Count and Importance of Topic Keywords',
                     fontsize=18, y=1.05)
        file_name = "WordCountsOfTopicKeywords"
        fig_file_name = \
            Advisor.get_topic_folders_file_path_from_topic_version(
                self.topic_version_path, model_type, topic[0],
                file_name, "png")
        fig.savefig(fig_file_name)
        pyplot.close(fig=fig)
    return