def make_request(page_name, section_id=None):
    # Fetch the wikitext of a Wikipedia page (optionally a single section)
    # through the MediaWiki API.
    session = requests.Session()
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",  # alternatives: "parse", "query"
        "page": page_name,
        "prop": "wikitext",  # alternatives: "wikitext", "parsetree", "text"
        "format": "json"
    }
    # Callers such as fetch_data() request whole pages, so only send the
    # section parameter when one is given.
    if section_id is not None:
        params["section"] = section_id

    result = session.get(url=url, params=params)
    data = result.json()
    text = data["parse"]["wikitext"]["*"]

    # A redirect page's wikitext has the form "#REDIRECT [[Target]]".
    redirect_page = text.lower().startswith("#redirect") and \
        "[[" in text and "]]" in text

    logger = LogManager.instance()
    if logger.debug_enabled():
        file_name = page_name + ".txt"
        full_path = util.get_full_output_path(file_name)
        with open(full_path, "w", encoding="utf-8") as text_file:
            text_file.write(text)

    return text, redirect_page
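# Minimal usage sketch (hypothetical page title and section index; mirrors how
# fetch_data() below consumes the return value):
#
#   text, is_redirect = make_request("List_of_most_visited_museums", 1)
#   if is_redirect:
#       target = text.split("[[")[1].split("]]")[0]
#       text, _ = make_request(target)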
def plot_results(analysis, silent_mode_enabled=True):
    import pandas as pd

    y_test = analysis.data_info.y_test
    y_pred = analysis.results_info.prediction
    if y_test is not None and y_pred is not None:
        df = pd.DataFrame({
            analysis.data_info.y_label + ' Actual': y_test.flatten(),
            analysis.data_info.y_label + ' Predicted': y_pred.flatten()
        })
        # Plot only the first 25 rows so the bar chart stays readable.
        df_head = df.head(25)
        axes_subplot = df_head.plot(title=analysis.data_info.x_label,
                                    kind='bar',
                                    figsize=(16, 10))
        plt.grid(which='major', linestyle='-', linewidth=0.5, color='green')
        plt.grid(which='minor', linestyle=':', linewidth=0.5, color='black')

        if not silent_mode_enabled:
            plt.show()

        fig = axes_subplot.get_figure()
        logger = LogManager.instance()
        if logger.debug_enabled():
            from my_package import util
            file_name = ('Result_' + analysis.data_info.x_label + '_' +
                         analysis.data_info.y_label + '_Actual_vs_Predicted.png')
            full_path = util.get_full_output_path(file_name)
            fig.savefig(full_path)
        plt.close(fig)
def scatter_plot_results(analysis, silent_mode_enabled=True):
    y_test = analysis.data_info.y_test
    y_pred = analysis.results_info.prediction
    x_test = analysis.data_info.x_test
    if y_test is not None and y_pred is not None:
        fig = plt.figure(figsize=(10, 10))
        plt.scatter(x_test, y_test, color='gray')
        plt.plot(x_test, y_pred, color='red', linewidth=2)
        plt.title(analysis.data_info.x_label + ' vs ' + analysis.data_info.y_label)
        plt.xlabel(analysis.data_info.x_label)
        plt.ylabel(analysis.data_info.y_label)

        if not silent_mode_enabled:
            plt.show()

        logger = LogManager.instance()
        if logger.debug_enabled():
            from my_package import util
            file_name = ('Scatter_Results_' + analysis.data_info.x_label + '_' +
                         analysis.data_info.y_label + '_Actual_vs_Predicted.png')
            full_path = util.get_full_output_path(file_name)
            fig.savefig(full_path, dpi=fig.dpi, bbox_inches='tight',
                        pad_inches=0.5)
        plt.close(fig)
def scatter_plots(analysis_list, plot_title, silent_mode_enabled=True):
    plot_size = get_plot_size(len(analysis_list))
    fig, axs = plt.subplots(plot_size["row_size"],
                            plot_size["col_size"],
                            figsize=(20, 10),
                            gridspec_kw={'wspace': 0.5, 'hspace': 0.55})
    fig.suptitle(plot_title, fontsize=15)
    axs = axs.flatten()

    for analysis, ax in zip(analysis_list, axs):
        x_value = analysis.data_info.x_values
        y_value = analysis.data_info.y_values
        x_title = analysis.data_info.x_label
        y_title = analysis.data_info.y_label
        intercept = analysis.results_info.intercept
        coef = analysis.results_info.coefficient
        analysis_type = analysis.type

        ax.scatter(x_value, y_value, color='r')
        ax.set_title(analysis_type, size=10, color="g")
        ax.set_xlabel(x_title, size=7, color="y")
        ax.set_ylabel(y_title, size=7, color="y")

        # The intercept and coefficient come back as arrays; pull out scalars.
        intercept = intercept[0]
        coef = coef[0][0]

        # Draw the fitted line y = coef * x + intercept from x = 0 to max(x).
        xx = np.array([0, np.max(x_value)])
        yy = xx * coef + intercept
        eq_line = "y = {:.3f}x {:+.3f}".format(coef, intercept)
        ax.plot(xx, yy, linestyle="solid", label=eq_line)

    if not silent_mode_enabled:
        plt.show()

    logger = LogManager.instance()
    if logger.debug_enabled():
        from my_package import util
        file_name = plot_title + ".png"
        full_path = util.get_full_output_path(file_name)
        fig.savefig(full_path)
    plt.close(fig)
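# get_plot_size() is defined elsewhere in this package; from the usage above it
# presumably maps a subplot count to a grid layout, e.g. returning something
# like {"row_size": 2, "col_size": 3} for six analyses (illustrative
# assumption, not shown in this file).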
def do_parsing(self, text):
    logger = LogManager.instance()
    logger.log("Parsing table text!", logger.Logging_Levels["DEBUG"])

    parsed = wtp.parse(text)
    table_info = parsed.tables[0].data()

    # Collect rows in a plain list and build the DataFrame once at the end;
    # DataFrame.append() is deprecated and removed in recent pandas releases.
    parsed_rows = []
    for row_index in range(1, len(table_info)):
        cells = table_info[row_index]
        if logger.debug_enabled():
            logger.log(str(cells), logger.Logging_Levels["DEBUG"])

        # extract data
        museum_info = cells[TableParser.column_type["museum"]]
        city_info = cells[TableParser.column_type["city"]]
        visitor_info = cells[TableParser.column_type["visitor"]]
        year_info = cells[TableParser.column_type["year"]]

        # perform cleanup only after headers
        museum_info_2 = None
        if row_index > 1:
            city_info = city_info.split("[[")[1].split(']]')[0]
            year_info = year_info.split('<ref')[0]

        # post-process museum information
        if "[[" in museum_info:
            museum_info = museum_info.split("[[")[1].split(']]')[0]
            if "|" in museum_info:
                # This can happen when we have two languages, as in the Mexico
                # City case: '[[Museo Nacional de Historia|National Museum of
                # History]]'. The best solution is to add two records.
                result = museum_info.split("|")
                museum_info = result[0]
                museum_info_2 = result[1]
        elif "|" in museum_info:
            museum_info = museum_info.split("|")[1]

        # save data
        parsed_rows.append([museum_info, city_info, visitor_info, year_info])
        if museum_info_2 is not None:
            parsed_rows.append([museum_info_2, city_info, visitor_info, year_info])

    df_parsed_table = pd.DataFrame(parsed_rows,
                                   columns=list(TableParser.column_type.keys()))

    if logger.debug_enabled():
        file_name = "List_of_most_visited_museums_table.csv"
        full_path = util.get_full_output_path(file_name)
        df_parsed_table.to_csv(full_path, index=False, header=True)

    return df_parsed_table
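# TableParser.column_type is defined on the class (not shown here); from the
# lookups above it is presumably an ordered mapping of column name to cell
# index, e.g. {"museum": 0, "city": 1, "visitor": 2, "year": 3} (assumption).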
def plot_data_distribution(data, file_name, silent_mode_enabled=True):
    if silent_mode_enabled:
        return

    plt.figure(figsize=(15, 10))
    plt.tight_layout()
    # seaborn's distplot() is deprecated; histplot(..., kde=True) is the
    # modern equivalent.
    seaborn_plot = seabornInstance.histplot(data, kde=True)
    plt.show()

    logger = LogManager.instance()
    if logger.debug_enabled():
        full_path = util.get_full_output_path(file_name)
        fig = seaborn_plot.get_figure()
        fig.savefig(full_path)
def make_request_csv():
    session = requests.Session()
    url = "http://www.worldcitiescultureforum.com/assets/city_data/Number_of_international_tourists_per_year_7112018.csv"
    result = session.get(url=url)
    text = result.text

    logger = LogManager.instance()
    if logger.debug_enabled():
        file_name = "international_tourists.txt"
        full_path = util.get_full_output_path(file_name)
        with open(full_path, "w", encoding="utf-8") as text_file:
            text_file.write(text)

    return text
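# Minimal usage sketch (hypothetical): the return value is raw CSV text, which
# fetch_data() below consumes line by line:
#
#   csv_text = make_request_csv()
#   rows = csv_text.splitlines()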
def plot_data(dataset, labels, silent_mode_enabled=True):
    axes_subplot = dataset.plot(x=labels["x"][0], y=labels["y"][0], style='o')
    plt.title(labels["x"][1] + ' vs ' + labels["y"][1])
    plt.xlabel(labels["x"][1])
    plt.ylabel(labels["y"][1])

    if not silent_mode_enabled:
        plt.show()

    fig = axes_subplot.get_figure()
    logger = LogManager.instance()
    if logger.debug_enabled():
        from my_package import util
        file_name = 'Scatter_Data_' + labels["x"][1] + '_vs_' + labels["y"][1] + ".png"
        full_path = util.get_full_output_path(file_name)
        fig.savefig(full_path)
    plt.close(fig)
def main():
    logger = LogManager.instance()
    logger.log("Starting the application!", logger.Logging_Levels["DEBUG"])

    database_manager = DatabaseManager.instance()
    database_manager.init(config)

    print_main_menu()
    general_action = input()
    if general_action == 'e':
        return
    if general_action == 'm':
        ml.MachineLearningManager.do_analysis(config)
        return
    if general_action == 'd':
        database_manager.delete_all_data()
        data_fetch.DataFetchManager.fetch_data(config)
        return
def missingdata_plot(dataframe, silent_mode_enabled=True):
    museum_name = dataframe["museum"].tolist()  # x-axis labels
    # keep only the independent features
    dataframe = dataframe.drop(columns=["museum", "id", "city_id"])
    features = dataframe.columns  # y-axis labels

    # 1 where a value is present, 0 where it is missing; transposed so that
    # features run along the y-axis and museums along the x-axis.
    missingdata_matrix = (~pd.isna(dataframe)).to_numpy().astype(int).T

    plt.matshow(missingdata_matrix)
    plt.xlabel('Museum')
    plt.ylabel('Features')
    x = list(range(missingdata_matrix.shape[1]))
    y = list(range(missingdata_matrix.shape[0]))
    plt.yticks(y, features)
    plt.xticks(x, museum_name, rotation=90)
    plt.colorbar()

    if not silent_mode_enabled:
        plt.show()

    logger = LogManager.instance()
    if logger.debug_enabled():
        from my_package import util
        file_name = "missingdata.png"
        full_path = util.get_full_output_path(file_name)
        plt.savefig(full_path)
    plt.close()
def residual_plot(analysis, silent_mode_enabled=True):
    x_train = analysis.data_info.x_train
    x_test = analysis.data_info.x_test
    y_train = analysis.data_info.y_train
    y_test = analysis.data_info.y_test
    model = analysis.results_info.model

    if x_train is not None and x_test is not None and \
            y_train is not None and y_test is not None and \
            model is not None:
        fig = plt.figure(figsize=(10, 10))
        plt.scatter(model.predict(x_train), model.predict(x_train) - y_train,
                    color='blue', s=40, alpha=0.5)
        plt.scatter(model.predict(x_test), model.predict(x_test) - y_test,
                    color='green', s=40)
        plt.hlines(y=0, xmin=0, xmax=9000000)
        plt.title("Residual plot using training (blue) and test (green) data; " +
                  analysis.data_info.x_label + ' vs ' + analysis.data_info.y_label)
        plt.ylabel("Residuals")

        if not silent_mode_enabled:
            plt.show()

        logger = LogManager.instance()
        if logger.debug_enabled():
            from my_package import util
            file_name = ('Residual_' + analysis.data_info.x_label + "_" +
                         analysis.data_info.y_label + ".png")
            full_path = util.get_full_output_path(file_name)
            fig.savefig(full_path, dpi=fig.dpi, bbox_inches='tight',
                        pad_inches=0.5)
        plt.close(fig)
def do_parsing(self, text):
    logger = LogManager.instance()
    logger.log("Parsing city page text!", logger.Logging_Levels["DEBUG"])

    parsed = wtp.parse(text)
    extracted_data = {}
    for template in parsed.templates:
        # Covers every infobox variant seen so far: "Infobox settlement",
        # "Infobox country", "Infobox Russian federal subject", etc.
        if 'infobox' in template.name.lower():
            city_info = template.string
            city_info_list = city_info.split("\n")
            for info in city_info_list:
                if "|" in info and "=" in info:
                    key = info.split("|")[1].split('=')[0]
                    key = key.strip()
                    value = info.split("=")[1]
                    if "]]" in value and "[[" in value:
                        value = value.split("[[")[1].split(']]')[0]
                    extracted_data[key] = value

            if logger.debug_enabled():
                print(city_info)
                file_name = "city_info.txt"
                full_path = util.get_full_output_path(file_name)
                with open(full_path, "w", encoding="utf-8") as file:
                    file.write(city_info)

            # Only the first infobox on the page is of interest.
            return extracted_data

    if len(extracted_data) == 0:
        logger.log("Invalid case, extracted_data is empty!",
                   logger.Logging_Levels["ERROR"])
    return extracted_data
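# Illustrative input for do_parsing() above (abridged, hypothetical values):
#
#   {{Infobox settlement
#   | name       = Paris
#   | population = 2,175,601
#   }}
#
# would populate extracted_data with the keys "name" and "population" (values
# keep their surrounding whitespace, since only the key is stripped).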
def fetch_data(config):
    logger = LogManager.instance()
    page_name = config.main_page_name
    parser_list = [ParserGenerator.parser_types['table'],
                   ParserGenerator.parser_types['infobox']]
    parser_instance = ParserGenerator(parser_list)

    # todo: move this to a parser for the CSV table
    table2 = extractor.make_request_csv()
    tables_txt = table2.splitlines()
    head = ["city", "city_visitor", "city_visitor_reported_year"]
    import pandas as pd
    # Each data row looks roughly like (hypothetical values):
    #   London,"19,842,000",,2016
    # so split on ",," first, then on the quotes around the visitor count.
    parsed_city_rows = []
    for i in range(1, len(tables_txt)):  # skip the header line
        city_info = tables_txt[i]
        left, right = city_info.split(",,", 1)
        parts = left.split('"')
        city_name = parts[0].replace(",", "")
        city_visitors = parts[1]
        values = right.split(",")
        parsed_city_rows.append([city_name, city_visitors, values[0]])
    df_parsed_table_2 = pd.DataFrame(parsed_city_rows, columns=head)

    text, _ = extractor.make_request(page_name)
    df_parsed_table = parser_instance.run_function(
        ParserGenerator.parser_types['table'], text)
    parsed_table = df_parsed_table.values.tolist()

    # todo: group by city so each city page is retrieved only once;
    # take advantage of pandas
    # start at 1: the table parser leaves its first parsed row uncleaned
    for i in range(1, len(parsed_table)):
        city_name = parsed_table[i][1]
        print(city_name)
        text, redirect_page = extractor.make_request(city_name)
        if redirect_page:
            city_name = text.split('[[')[1].split(']]')[0]
            text = extractor.make_request(city_name)[0]
        extracted_city_infos = parser_instance.run_function(
            ParserGenerator.parser_types['infobox'], text)

        if logger.debug_enabled():
            file_name = city_name + "_info.txt"
            full_path = util.get_full_output_path(file_name)
            if len(extracted_city_infos) > 0:
                with open(full_path, "w", encoding="utf-8") as file:
                    for key, value in extracted_city_infos.items():
                        file.write(key + ": " + value + "\n")

        museum_name = parsed_table[i][0]
        print(museum_name)

        # I might look at the category for "Tokyo Metropolitan Art Museum";
        # there I might have a link to the real website.
        # Category: National Museum of Nature and Science
        # Bad websites whose information cannot be extracted (missing-data
        # cases); skip them.
        if 'Zhejiang Museum' in museum_name or \
                'Chongqing Museum of Natural History' in museum_name or \
                "Mevlana Museum" in museum_name or \
                "Tokyo Metropolitan Art Museum" in museum_name or \
                "Chengdu Museum" in museum_name or \
                "Royal Museums Greenwich" in museum_name or \
                "National Museum of Nature and Science" in museum_name or \
                "Suzhou Museum" in museum_name or \
                "Three Gorges Museum" in museum_name or \
                "Russian Museum" in museum_name:
            continue

        # invalid case, page does not exist
        if "Reina Sofía" in museum_name or \
                "National Art Center" in museum_name or \
                "Museo Nacional de Historia" in museum_name or \
                "NGV International" in museum_name:
            continue

        text, redirect_page = extractor.make_request(museum_name)
        if redirect_page:
            museum_name = text.split('[[')[1].split(']]')[0]
            text = extractor.make_request(museum_name)[0]
        extracted_museum_infos = parser_instance.run_function(
            ParserGenerator.parser_types['infobox'], text)

        # Remove all special characters, punctuation, and spaces from the name
        new_name = re.sub('[^A-Za-z0-9]+', '', museum_name)
        if logger.debug_enabled():
            file_name = new_name + "_info.txt"
            full_path = util.get_full_output_path(file_name)
            if len(extracted_museum_infos) > 0:
                with open(full_path, "w", encoding="utf-8") as file:
                    for key, value in extracted_museum_infos.items():
                        file.write(key + ": " + value + "\n")

        # todo: move this to its own post-processing function;
        # save the city and one of its museums in the database
        extracted_city_infos["name"] = parsed_table[i][TableParser.column_type["city"]]
        extracted_museum_infos["name"] = parsed_table[i][TableParser.column_type["museum"]]
        extracted_museum_infos["visitors"] = parsed_table[i][TableParser.column_type["visitor"]]
        extracted_museum_infos["year"] = parsed_table[i][TableParser.column_type["year"]]

        city_visitor_info = df_parsed_table_2[
            df_parsed_table_2['city'] == extracted_city_infos["name"]]
        if len(city_visitor_info) > 0:
            extracted_city_infos["city_visitor"] = \
                city_visitor_info["city_visitor"].to_string(index=False)
            extracted_city_infos["city_visitor_reported_year"] = \
                city_visitor_info["city_visitor_reported_year"].to_string(index=False)

        argument_list = {'city': extracted_city_infos,
                         'museum': extracted_museum_infos}
        database_manager = DatabaseManager.instance()
        database_manager.save(**argument_list)