Пример #1
0
def make_request(page_name, section_id=None):
    """Fetch the wikitext of an English Wikipedia page through the MediaWiki API.

    Args:
        page_name: Title of the page, sent as the ``page`` parameter.
        section_id: Value sent as the ``section`` parameter. When ``None``
            the parameter is left out of the query string, because
            ``requests`` skips ``None``-valued params.

    Returns:
        A ``(text, redirect_page)`` tuple: the wikitext string and a flag
        that is ``True`` when the text looks like a redirect stub.

    Raises:
        KeyError: If the JSON response has no ``parse``/``wikitext`` entry.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",  # "parse","query",
        "page": page_name,
        "prop": "wikitext",  # "wikitext","parsetree", "text"
        "section": section_id,
        "format": "json"
    }

    # The context manager releases the pooled connections once we are done.
    with requests.Session() as session:
        result = session.get(url=url, params=params)
        data = result.json()

    text = data["parse"]["wikitext"]["*"]
    # NOTE(review): redirect stubs usually begin with "#REDIRECT"; the 'this'
    # prefix is kept as-is from the original heuristic -- confirm it is intended.
    redirect_page = (
        text.startswith('this') and "[[" in text and "]]" in text
    )

    logger = LogManager.instance()
    if logger.debug_enabled():
        # Debug runs keep a copy of the raw wikitext on disk.
        full_path = util.get_full_output_path(page_name + ".txt")
        with open(full_path, "w", encoding="utf-8") as text_file:
            text_file.write(text)
    return text, redirect_page
Пример #2
0
def plot_results(analysis, silent_mode_enabled=True):
    """Draw a bar chart comparing actual and predicted target values.

    Nothing happens when either the test targets or the predictions are
    ``None``. Only the first 25 rows are plotted. The chart is shown unless
    ``silent_mode_enabled`` is true, and it is written to the output folder
    when debug logging is enabled.

    Args:
        analysis: Object exposing ``data_info`` (``y_test``, ``x_label``,
            ``y_label``) and ``results_info`` (``prediction``).
        silent_mode_enabled: When true, the chart is not displayed.
    """
    import pandas as pd

    actual = analysis.data_info.y_test
    predicted = analysis.results_info.prediction
    if actual is None or predicted is None:
        return

    columns = {
        analysis.data_info.y_label + ' Actual': actual.flatten(),
        analysis.data_info.y_label + ' Predicted': predicted.flatten(),
    }
    preview = pd.DataFrame(columns).head(25)
    axes_subplot = preview.plot(
        title=analysis.data_info.x_label, kind='bar', figsize=(16, 10))
    plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
    plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
    if not silent_mode_enabled:
        plt.show()

    figure = axes_subplot.get_figure()
    if LogManager.instance().debug_enabled():
        from my_package import util as util
        name_parts = (
            'Result_',
            analysis.data_info.x_label,
            " _ ",
            analysis.data_info.y_label,
            ' Actual',
            'vs',
            analysis.data_info.y_label,
            ' Predicted',
            ".png",
        )
        figure.savefig(util.get_full_output_path("".join(name_parts)))

    plt.close(figure)
Пример #3
0
def scatter_plot_results(analysis, silent_mode_enabled=True):
    """Scatter the test samples and overlay the predicted line.

    Nothing happens when either the test targets or the predictions are
    ``None``. The figure is shown unless ``silent_mode_enabled`` is true and
    is written to the output folder when debug logging is enabled.

    Args:
        analysis: Object exposing ``data_info`` (``x_test``, ``y_test``,
            ``x_label``, ``y_label``) and ``results_info`` (``prediction``).
        silent_mode_enabled: When true, the figure is not displayed.
    """
    actual = analysis.data_info.y_test
    predicted = analysis.results_info.prediction
    samples = analysis.data_info.x_test
    if actual is None or predicted is None:
        return

    figure = plt.figure(figsize=(10, 10))
    plt.scatter(samples, actual, color='gray')
    plt.plot(samples, predicted, color='red', linewidth=2)
    heading = analysis.data_info.x_label + ' vs ' + analysis.data_info.y_label
    plt.title(heading)
    plt.xlabel(analysis.data_info.x_label)
    plt.ylabel(analysis.data_info.y_label)
    if not silent_mode_enabled:
        plt.show()

    if LogManager.instance().debug_enabled():
        from my_package import util as util
        name_parts = (
            'Scatter_Results_',
            analysis.data_info.x_label,
            " _ ",
            analysis.data_info.y_label,
            ' Actual',
            'vs',
            analysis.data_info.y_label,
            ' Predicted',
            ".png",
        )
        save_options = {
            "dpi": figure.dpi,
            "bbox_inches": 'tight',
            "pad_inches": 0.5,
        }
        figure.savefig(
            util.get_full_output_path("".join(name_parts)), **save_options)

    plt.close(figure)
Пример #4
0
def scatter_plots(analysis_list, plot_title, silen_mode_enabled=True):
    """Draw one scatter plot with its fitted line per analysis on a shared grid.

    The grid dimensions come from ``get_plot_size``. Each subplot shows the
    raw points plus the line ``y = coefficient * x + intercept`` drawn from
    ``x = 0`` to the largest x value, with the equation shown in a legend.
    The figure is shown unless ``silen_mode_enabled`` is true and is written
    to ``<plot_title>.png`` in the output folder when debug logging is enabled.

    Args:
        analysis_list: Analyses exposing ``type``, ``data_info`` (``x_values``,
            ``y_values``, ``x_label``, ``y_label``) and ``results_info``
            (``intercept`` indexed ``[0]``, ``coefficient`` indexed ``[0][0]``).
        plot_title: Figure title, also used as the output file name.
        silen_mode_enabled: When true, the figure is not displayed. (The
            parameter name is kept as-is for existing keyword callers.)
    """
    plot_size = get_plot_size(len(analysis_list))

    # squeeze=False always returns a 2-D array of Axes, so flatten() also
    # works when the grid is 1x1 (otherwise a bare Axes object is returned).
    fig, axs = plt.subplots(plot_size["row_size"],
                            plot_size["col_size"],
                            figsize=(20, 10),
                            squeeze=False,
                            gridspec_kw={
                                'wspace': 0.5,
                                'hspace': 0.55
                            })
    fig.suptitle(plot_title, fontsize=15)

    for analysis, ax in zip(analysis_list, axs.flatten()):
        data_info = analysis.data_info
        x_values = data_info.x_values
        intercept = analysis.results_info.intercept[0]
        coef = analysis.results_info.coefficient[0][0]

        ax.scatter(x_values, data_info.y_values, color='r')
        ax.set_title(analysis.type, size=10, color="g")
        ax.set_xlabel(data_info.x_label, size=7, color="y")
        ax.set_ylabel(data_info.y_label, size=7, color="y")

        # Two end points are enough to draw the fitted straight line.
        x_max = np.max(x_values)
        line_x = np.array([0.0, x_max], dtype=float)
        line_y = np.array([intercept, x_max * coef + intercept], dtype=float)

        # Explicit sign on the intercept keeps the equation readable.
        eq_line = "y = {:.3f}x {:+.3f}".format(coef, intercept)
        ax.plot(line_x, line_y, linestyle="solid", label=eq_line)
        # Without a legend the label attached to the line is never displayed.
        ax.legend()

    if not silen_mode_enabled:
        plt.show()

    logger = LogManager.instance()
    if logger.debug_enabled():
        from my_package import util as util

        full_path = util.get_full_output_path(plot_title + ".png")
        fig.savefig(full_path)

    plt.close(fig)
Пример #5
0
    def do_parsing(self, text):
        """Parse the first wikitext table in ``text`` into a DataFrame.

        Row 0 of the table data is skipped. Row 1 is stored as extracted,
        while later rows are cleaned up: the city is reduced to the target of
        its first ``[[...]]`` link, the year loses any trailing ``<ref``
        markup, and the museum name is taken from its link or pipe-separated
        text. A museum link of the form ``[[A|B]]`` produces two rows, one per
        name.

        Args:
            text: Wikitext containing at least one table.

        Returns:
            A DataFrame whose columns are the keys of
            ``TableParser.column_type``. When debug logging is enabled the
            frame is also written to a CSV file in the output folder.

        Raises:
            IndexError: If ``text`` has no table, or a data row's city cell
                has no ``[[...]]`` link.
        """
        logger = LogManager.instance()
        logger.log("Parsing table text!", logger.Logging_Levels["DEBUG"])

        parsed = wtp.parse(text)
        table_info = parsed.tables[0].data()

        columns = list(TableParser.column_type.keys())
        # Rows are collected in a plain list and turned into a frame once:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call.
        rows = []
        for row_index in range(1, len(table_info)):
            cells = table_info[row_index]
            if logger.debug_enabled():
                logger.log(str(cells), logger.Logging_Levels["DEBUG"])

            # extract data
            museum_info = cells[TableParser.column_type["museum"]]
            city_info = cells[TableParser.column_type["city"]]
            visitor_info = cells[TableParser.column_type["visitor"]]
            year_info = cells[TableParser.column_type["year"]]

            # perform cleanup only after headers
            museum_info_2 = None
            if row_index > 1:
                city_info = city_info.split("[[")[1].split(']]')[0]
                year_info = year_info.split('<ref')[0]
                # post-process museum information
                if "[[" in museum_info:
                    museum_info = museum_info.split("[[")[1].split(']]')[0]
                    if "|" in museum_info:
                        # This can happen when we have 2 languages, like the case of Mexico City:
                        # '[[Museo Nacional de Historia|National Museum of History]]'
                        # best solution is to add 2 records
                        names = museum_info.split("|")
                        museum_info = names[0]
                        museum_info_2 = names[1]
                elif "|" in museum_info:
                    museum_info = museum_info.split("|")[1]

            # save data
            rows.append([museum_info, city_info, visitor_info, year_info])
            if museum_info_2 is not None:
                rows.append([museum_info_2, city_info, visitor_info, year_info])

        df_parsed_table = pd.DataFrame(rows, columns=columns)

        if logger.debug_enabled():
            file_name = "List_of_most_visited_museums_table.csv"
            full_path = util.get_full_output_path(file_name)
            df_parsed_table.to_csv(full_path, index=None, header=True)
        return df_parsed_table
Пример #6
0
def plot_data_distribution(data, file_name, silent_mode_enabled=True):
    """Show the distribution plot of ``data`` and optionally save it.

    In silent mode the function returns immediately without plotting or
    saving anything. Otherwise the plot is displayed and, when debug logging
    is enabled, written to ``file_name`` in the output folder. The figure is
    closed before returning.

    Args:
        data: Values passed to seaborn's ``distplot``.
        file_name: Name of the image file written in debug mode.
        silent_mode_enabled: When true, nothing is plotted or saved.
    """
    if silent_mode_enabled:
        return

    fig = plt.figure(figsize=(15, 10))
    seabornInstance.distplot(data)
    # tight_layout can only arrange axes that already exist, so it has to
    # run after the plot has been drawn.
    plt.tight_layout()
    plt.show()

    logger = LogManager.instance()
    if logger.debug_enabled():
        full_path = util.get_full_output_path(file_name)
        fig.savefig(full_path)

    # Release the figure; pyplot keeps it alive until it is closed.
    plt.close(fig)
0
def make_request_csv():
    """Download the international-tourists CSV and return it as text.

    When debug logging is enabled, the downloaded text is also written to
    ``international_tourists.txt`` in the output folder.

    Returns:
        The response body as a string.
    """
    url = "http://www.worldcitiescultureforum.com/assets/city_data/Number_of_international_tourists_per_year_7112018.csv"

    # The context manager releases the pooled connections once we are done.
    with requests.Session() as session:
        text = session.get(url=url).text

    logger = LogManager.instance()
    if logger.debug_enabled():
        file_name = "international_tourists" + ".txt"
        full_path = util.get_full_output_path(file_name)
        # ``with`` guarantees the file is closed even if the write fails.
        with open(full_path, "w", encoding="utf-8") as text_file:
            text_file.write(text)
    return text
Пример #8
0
def plot_data(dataset, labels, silen_mode_enabled=True):
    """Plot one column of ``dataset`` against another as dots.

    The figure is shown unless ``silen_mode_enabled`` is true and is written
    to the output folder when debug logging is enabled.

    Args:
        dataset: Object whose ``plot`` method returns an axes object
            (``get_figure`` is called on the result).
        labels: Mapping with ``"x"`` and ``"y"`` entries; index 0 of each is
            the column passed to ``dataset.plot``, index 1 the display title.
        silen_mode_enabled: When true, the figure is not displayed.
    """
    x_spec = labels["x"]
    y_spec = labels["y"]
    axes_subplot = dataset.plot(x=x_spec[0], y=y_spec[0], style='o')

    x_title = labels["x"][1]
    y_title = labels["y"][1]
    plt.title(x_title + ' vs ' + y_title)
    plt.xlabel(x_title)
    plt.ylabel(y_title)

    if not silen_mode_enabled:
        plt.show()

    figure = axes_subplot.get_figure()
    if LogManager.instance().debug_enabled():
        from my_package import util as util
        file_name = "".join(('Scatter_Data_', x_title, 'vs', y_title, ".png"))
        figure.savefig(util.get_full_output_path(file_name))

    plt.close(figure)
Пример #9
0
def main():
    """Initialise the database, show the main menu and run the chosen action.

    The user's input selects the action: ``'m'`` runs the machine-learning
    analysis, ``'d'`` deletes all stored data and fetches it again. ``'e'``
    and any other input return without doing anything further.
    """
    log_manager = LogManager.instance()
    log_manager.log("Staring the application!", log_manager.Logging_Levels["DEBUG"])
    database_manager = DatabaseManager.instance()
    database_manager.init(config)

    print_main_menu()
    choice = input()

    if choice == 'm':
        ml.MachineLearningManager.do_analysis(config)
    elif choice == 'd':
        database_manager.delete_all_data()
        data_fetch.DataFetchManager.fetch_data(config)
Пример #10
0
def missingdata_plot(dataframe, silent_mode_enabled=True):
    """Plot a presence/absence matrix of the feature values per museum.

    The ``museum``, ``id`` and ``city_id`` columns are dropped; every
    remaining column becomes one row of the matrix and every museum one
    column. A cell is 1 where the value is present and 0 where it is missing.
    The figure is shown unless ``silent_mode_enabled`` is true and is written
    to ``missingdata.png`` in the output folder when debug logging is enabled.

    Args:
        dataframe: DataFrame with a ``museum`` column plus ``id`` and
            ``city_id`` columns and any number of feature columns.
        silent_mode_enabled: When true, the figure is not displayed.
    """
    museum_names = dataframe["museum"].tolist()  # x-axis labels
    feature_frame = dataframe.drop(columns=["museum", "id", "city_id"])
    feature_names = list(feature_frame.columns)  # y-axis labels

    # 1.0 where a value exists, 0.0 where it is missing; transposed so that
    # features run along the rows and museums along the columns.
    presence_matrix = feature_frame.notna().to_numpy().astype(float).T

    plt.matshow(presence_matrix)
    plt.xlabel('Museum')
    plt.ylabel('Features')
    # Tick positions follow the actual data size instead of fixed counts, so
    # the number of ticks always matches the number of labels.
    plt.yticks(list(range(len(feature_names))), feature_names)
    plt.xticks(list(range(len(museum_names))), museum_names, rotation=90)
    plt.colorbar()

    if not silent_mode_enabled:
        plt.show()

    logger = LogManager.instance()
    if logger.debug_enabled():
        from my_package import util as util
        file_name = "missingdata.png"
        full_path = util.get_full_output_path(file_name)
        plt.savefig(full_path)

    plt.close()
Пример #11
0
def residual_plot(analysis, silent_mode_enabled=True):
    """Plot the residuals (prediction minus target) for training and test data.

    Training residuals are drawn in blue and test residuals in green, with a
    horizontal reference line at zero. Nothing happens when any of the
    train/test inputs, targets or the model is ``None``. The figure is shown
    unless ``silent_mode_enabled`` is true and is written to the output
    folder when debug logging is enabled.

    Args:
        analysis: Object exposing ``data_info`` (``x_train``, ``x_test``,
            ``y_train``, ``y_test``, ``x_label``, ``y_label``) and
            ``results_info`` (``model`` with a ``predict`` method).
        silent_mode_enabled: When true, the figure is not displayed.
    """
    data_info = analysis.data_info
    x_train = data_info.x_train
    x_test = data_info.x_test
    # Residuals are measured against the targets, not against the features.
    y_train = data_info.y_train
    y_test = data_info.y_test
    model = analysis.results_info.model

    required = (x_train, x_test, y_train, y_test, model)
    if any(item is None for item in required):
        return

    # Predict once per data set and reuse the result for both axes.
    train_prediction = model.predict(x_train)
    test_prediction = model.predict(x_test)

    fig = plt.figure(figsize=(10, 10))
    plt.scatter(train_prediction,
                train_prediction - y_train,
                color='blue',
                s=40,
                alpha=0.5)
    plt.scatter(test_prediction,
                test_prediction - y_test,
                color='green',
                s=40)
    # NOTE(review): the extent of the zero line is a fixed constant; confirm
    # it covers the prediction range of every analysis.
    plt.hlines(y=0, xmin=0, xmax=9000000)
    plt.title("Residual plot using training(blue) and test(green) data; " +
              data_info.x_label + ' vs ' +
              data_info.y_label)
    plt.ylabel("Residuals")
    if not silent_mode_enabled:
        plt.show()

    logger = LogManager.instance()
    if logger.debug_enabled():
        from my_package import util as util
        file_name = 'Residual_' + data_info.x_label + "_" + data_info.y_label + ".png"
        full_path = util.get_full_output_path(file_name)
        fig.savefig(full_path,
                    dpi=fig.dpi,
                    bbox_inches='tight',
                    pad_inches=0.5)

    plt.close(fig)
Пример #12
0
    def do_parsing(self, text):
        """Extract ``key = value`` pairs from the first infobox template in ``text``.

        Templates are scanned in order and the first one whose name contains
        ``infobox`` (case-insensitive) is parsed line by line. Each line
        holding both ``|`` and ``=`` contributes one entry: the key is the
        stripped text between the first ``|`` and the next ``=``, the value is
        everything after the first ``=``. A value containing a ``[[...]]``
        link is reduced to the text of its first link.

        Args:
            text: Wikitext of a page.

        Returns:
            A dict of the extracted pairs; empty when no infobox template is
            found (an error is logged in that case).
        """
        logger = LogManager.instance()
        logger.log("Parsing city page text!", logger.Logging_Levels["DEBUG"])

        parsed = wtp.parse(text)
        extracted_data = {}
        for template in parsed.templates:
            # Every specific name that used to be listed here
            # ('infobox settlement', 'infobox country', ...) contains
            # 'infobox', so this single test covers all of them.
            if 'infobox' not in template.name.lower():
                continue

            city_info = template.string
            for info in city_info.split("\n"):
                if "|" not in info or "=" not in info:
                    continue
                key = info.split("|")[1].split('=')[0].strip()

                # Split on the first '=' only, so values that contain '='
                # themselves (e.g. URLs with a query string) stay whole.
                value = info.split("=", 1)[1]
                if "]]" in value and "[[" in value:
                    value = value.split("[[")[1].split(']]')[0]

                extracted_data[key] = value

            if logger.debug_enabled():
                print(city_info)
                file_name = "city_info.txt"
                full_path = util.get_full_output_path(file_name)
                with open(full_path, "w", encoding="utf-8") as file:
                    file.write(city_info)

            # Only the first matching template is used.
            return extracted_data

        # Reaching this point means no infobox template matched, so the
        # result is necessarily empty.
        logger.log("invalid case, extracted_data is empty!!", logger.Logging_Levels["ERROR"])
        return extracted_data
Пример #13
0
    def fetch_data(config):
        """Fetch museum and city data and store each pair in the database.

        Steps, in order:
          1. Download the tourists CSV and parse it into a frame of city,
             visitor count and reported year.
          2. Request the main page (``config.main_page_name``) and parse its
             museum table.
          3. For every table row after the first: request and parse the city
             page, then the museum page (following redirect stubs), merge in
             the table and CSV values, and save both records.

        Museums on the two skip lists below are not saved. When debug logging
        is enabled the extracted city and museum infos are written to text
        files in the output folder.

        Args:
            config: Configuration object; only ``main_page_name`` is read here.
        """
        logger = LogManager.instance()
        page_name = config.main_page_name
        parser_list = [ParserGenerator.parser_types['table'], ParserGenerator.parser_types['infobox']]
        parser_instance = ParserGenerator(parser_list)
        # todo: move this to a parser for csv table
        table2 = extractor.make_request_csv()
        tables_txt = table2.splitlines()
        head = ["city", "city_visitor", "city_visitor_reported_year"]
        import pandas as pd
        delimiter1 = ","
        delimiter2 = '"'
        delimiter3 = ",,"
        # Rows are collected in a list and turned into a frame once:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call.
        city_visitor_rows = []
        for city_info in tables_txt[1:]:  # the first line is skipped
            test = city_info.split(delimiter3)
            test2 = test[0].split(delimiter2)
            city_name = test2[0].replace(delimiter1, "")
            city_visitors = test2[1]
            values = test[1].split(delimiter1)
            city_visitor_rows.append([city_name, city_visitors, values[0]])
        df_parsed_table_2 = pd.DataFrame(city_visitor_rows, columns=head)

        # bad website, can not extract its information: missing data case
        # I might look at category for "Tokyo Metropolitan Art Museum"
        # there I might have link to real website
        # Category: National Museum of Nature and Science
        unparsable_museums = (
            'Zhejiang Museum',
            'Chongqing Museum of Natural History',
            "Mevlana Museum",
            "Tokyo Metropolitan Art Museum",
            "Chengdu Museum",
            "Royal Museums Greenwich",
            "National Museum of Nature and Science",
            "Suzhou Museum",
            "Three Gorges Museum",
            "Russian Museum",
        )
        # invalid case, page does not exist
        missing_pages = (
            "Reina Sofía",
            "National Art Center",
            "Museo Nacional de Historia",
            "NGV International",
        )

        text, _ = extractor.make_request(page_name)
        df_parsed_table = parser_instance.run_function(ParserGenerator.parser_types['table'], text)
        parsed_table = df_parsed_table.values.tolist()
        # todo: group by city so you retrieve city page only once
        #      take advantage of pandas
        for row in parsed_table[1:]:  # the first parsed row is skipped
            city_name = row[1]
            print(city_name)
            text, redirect_page = extractor.make_request(city_name)
            if redirect_page:
                # A redirect stub names its target in the first [[...]] link.
                city_name = text.split('[[')[1].split(']]')[0]
                text = extractor.make_request(city_name)[0]

            extracted_city_infos = parser_instance.run_function(ParserGenerator.parser_types['infobox'], text)

            if logger.debug_enabled():
                file_name = city_name + "_info.txt"
                full_path = util.get_full_output_path(file_name)
                if len(extracted_city_infos) > 0:
                    with open(full_path, "w", encoding="utf-8") as file:
                        for key, value in extracted_city_infos.items():
                            file.write(key + ": " + value + "\n")

            museum_name = row[0]
            print(museum_name)

            if any(name in museum_name for name in unparsable_museums):
                continue

            if any(name in museum_name for name in missing_pages):
                continue

            text, redirect_page = extractor.make_request(museum_name)
            if redirect_page:
                museum_name = text.split('[[')[1].split(']]')[0]
                text = extractor.make_request(museum_name)[0]

            extracted_museum_infos = parser_instance.run_function(ParserGenerator.parser_types['infobox'], text)
            # Remove all special characters, punctuation and spaces from string
            new_name = re.sub('[^A-Za-z0-9]+', '', museum_name)

            if logger.debug_enabled():
                file_name = new_name + "_info.txt"
                full_path = util.get_full_output_path(file_name)
                if len(extracted_museum_infos) > 0:
                    with open(full_path, "w", encoding="utf-8") as file:
                        for key, value in extracted_museum_infos.items():
                            file.write(key + ": " + value + "\n")
            #  todo: move this to its own function to post-process
            # save city and one of its museums in a database
            extracted_city_infos["name"] = row[TableParser.column_type["city"]]
            extracted_museum_infos["name"] = row[TableParser.column_type["museum"]]
            extracted_museum_infos["visitors"] = row[TableParser.column_type["visitor"]]
            extracted_museum_infos["year"] = row[TableParser.column_type["year"]]

            city_visitor_info = df_parsed_table_2[df_parsed_table_2['city'] == extracted_city_infos["name"]]
            if len(city_visitor_info) > 0:
                extracted_city_infos["city_visitor"] = city_visitor_info["city_visitor"].to_string(index=False)
                extracted_city_infos["city_visitor_reported_year"] = city_visitor_info[
                    "city_visitor_reported_year"].to_string(index=False)

            argument_list = {'city': extracted_city_infos, "museum": extracted_museum_infos}
            database_manager = DatabaseManager.instance()
            database_manager.save(**argument_list)