Python normalize_hebrew示例，utils.normalize_hebrew Python示例

示例#1

0

显示文件

def plot_histogram(x_values: List[float],
                   title: str,
                   x_label: str,
                   y_label: str,
                   bins: int,
                   _range: Tuple[int, int],
                   size_of_tick: int = None) -> str:
    """Draw a histogram of ``x_values`` and label its axes.

    The histogram is drawn through ``pandas.Series.plot(kind='hist')``; the
    caller is responsible for saving / showing the resulting figure.

    Args:
        x_values: samples to bin.
        title: plot title, used exactly as given (it is NOT passed through
            ``normalize_hebrew`` here, unlike the axis labels).
        x_label: x-axis label; passed through ``normalize_hebrew``.
        y_label: y-axis label; passed through ``normalize_hebrew``.
        bins: number of histogram bins.
        _range: ``(first_bin, last_bin)`` bounds of the histogram; also the
            bounds of the x ticks (``last_bin`` inclusive).
        size_of_tick: distance between x ticks. When ``None`` it defaults to
            ``(last_bin + 1) // bins``.

    Returns:
        ``title``, unchanged.
    """
    axes = pd.Series(x_values).plot(kind='hist',
                                    bins=bins,
                                    title=title,
                                    range=_range)

    # Set the real label text and its font size in a single call instead of
    # first writing placeholder text that is immediately replaced.
    axes.set_xlabel(normalize_hebrew(x_label), fontsize=10)
    axes.set_ylabel(normalize_hebrew(y_label), fontsize=10)

    first_bin, last_bin = _range
    if size_of_tick is None:
        size_of_tick = (last_bin + 1) // bins
    # `or 1`: a zero step (more bins than range units) would make range() raise.
    axes.set_xticks(list(range(first_bin, last_bin + 1, size_of_tick or 1)))
    return title

示例#2

0

显示文件

def speed_test_website_main(websites: List[str]):
    """Plot one speed-test ratio histogram per website plus a reference panel.

    The panels are laid out on a two-row grid. Each website gets a histogram
    of ``get_speed_test_ratios(website)`` (20 bins over ``(0, 2)``), and the
    last cell of the grid holds a histogram of 400000 samples drawn from a
    normal distribution (mean 1.0, standard deviation 0.33). The figure is
    saved under ``question_snapshots/website_comparison_histograms`` and then
    shown.

    Args:
        websites: website identifiers; each must be a key of
            ``WEBSITE_NAME_TRANSLATE``. The list is not modified.
    """
    number_of_panels = len(websites) + 1  # +1 for normal distribution
    number_of_rows = 2
    row_size = (number_of_panels // 2) + number_of_panels % 2
    # squeeze=False keeps `axs` two-dimensional even when the grid has a
    # single column, so `axs[row, column]` indexing always works.
    _, axs = plot.subplots(number_of_rows,
                           row_size,
                           sharey=True,
                           tight_layout=True,
                           squeeze=False)

    # Walk the list back to front (the order the earlier pop()-based loop
    # produced) without emptying the caller's list, and derive the grid cell
    # from the running index so the layout follows the computed row size.
    for index, website in enumerate(reversed(websites)):
        row, column = divmod(index, row_size)
        print(f"getting data for: '{website}'")
        ratios = get_speed_test_ratios(website)
        panel = axs[row, column]
        panel.hist(ratios, bins=20, range=(0, 2))
        panel.set_title(normalize_hebrew(WEBSITE_NAME_TRANSLATE[website]))

    # add normal distribution data: μ, σ = 1.0, 0.33
    # The grid has 2 * row_size >= len(websites) + 1 cells, so the last cell
    # is never occupied by a website panel.
    reference_panel = axs[-1, -1]
    reference_panel.hist(numpy.random.normal(1.0, 0.33, size=400000),
                         bins=20,
                         range=(0, 2))
    reference_panel.set_title(normalize_hebrew('התפלגות נורמלית אקראית'))

    x_label = normalize_hebrew('יחס בדיקת אתר למהירות בפועל')
    # NOTE(review): the 130 trailing spaces presumably shift the label left
    # so it sits under the middle of the figure -- confirm before changing.
    x_label += ' ' * 130
    y_label = normalize_hebrew('מספר בדיקות')
    # pyplot-level calls: these act on pyplot's current axes.
    plot.xlabel(x_label)
    plot.ylabel(y_label)
    snapshots_path = Path("question_snapshots") / Path(
        'website_comparison_histograms')
    snapshots_path.mkdir(parents=True, exist_ok=True)

    # NOTE(review): no artist above is given a `label`, so this legend is
    # likely empty -- confirm whether it is still wanted.
    plot.legend(loc='best')
    plot.savefig(snapshots_path / Path('השוואת אתרי בדיקה' + ".png"))
    plot.show(
    )  # `.show` has to be after `.savefig` or else all hell breaks loose

示例#3

0

显示文件

def ground_truth_main():
    """Plot ground-truth speed histograms per vendor and violin plots per plan.

    For every (vendor, plan speed) pair with at least 8 users, the measured
    ground-truth rates are plotted with ``plot_ground_truth_speeds`` and also
    collected, keyed by plan speed. Afterwards one violin plot per plan speed
    (vendor on the x axis, measured rate on the y axis) is saved under
    ``question_snapshots/ground_truth_violin_plots`` and shown.
    """
    violin_data = defaultdict(list)
    internet_speeds = [40, 100, 200]
    for users in [VendorUsers.Bezeq, VendorUsers.Hot, VendorUsers.Partner]:
        for speed in internet_speeds:
            num_users = count_users(users, speed)
            if num_users < 8:
                print(f'not enough users for {users}, {speed} skipping')
                continue
            ground_truth_rates = get_ground_truth_speeds_by_vendor(
                users, speed)
            # Nothing to collect or plot for this pair.
            if not ground_truth_rates:
                continue
            vendor_name_hebrew = normalize_hebrew(vendor_to_hebrew_name(users))

            # create violin data
            violin_data[speed].extend({
                VENDOR_HEBREW: vendor_name_hebrew,
                TEST_RESULT_HEBREW: rate
            } for rate in ground_truth_rates)
            plot_ground_truth_speeds(vendor_users=users,
                                     ratios=ground_truth_rates,
                                     speed=speed)

    snapshots_path = Path("question_snapshots") / Path(
        'ground_truth_violin_plots')
    for speed in internet_speeds:
        # `.get` avoids inserting an empty entry into the defaultdict.
        records = violin_data.get(speed)
        if not records:
            # An empty frame has no vendor / result columns, so the column
            # lookups below would raise KeyError.
            print(f'no ground truth data for {speed}, skipping violin plot')
            continue

        raw_title = "התפלגות מהירויות דגימה תכנית: " + str(speed) + " מגה-ביט"
        speed_violin_data = pd.DataFrame(records)
        sns_plot = sns.violinplot(x=speed_violin_data[VENDOR_HEBREW],
                                  y=speed_violin_data[TEST_RESULT_HEBREW])
        sns_plot.set(title=normalize_hebrew(raw_title))

        snapshots_path.mkdir(parents=True, exist_ok=True)

        # The file name uses the raw (non-normalized) title.
        fig_path = snapshots_path / Path(raw_title + ".png")
        plot.savefig(fig_path)
        print(f"saving {fig_path}")
        plot.show()

示例#4

0

显示文件

def plot_ground_truth_speeds(vendor_users: VendorUsers, ratios: List[float],
                             speed: int):
    """Plot, save and show the download-speed histogram of one vendor/plan.

    The histogram is drawn by ``plot_histogram`` and saved as
    ``question_snapshots/ground_truth_rate_histograms/<title>.png``, where
    ``<title>`` is the raw (non-normalized) Hebrew title.

    Args:
        vendor_users: vendor whose users were measured; its Hebrew name is
            looked up with ``vendor_to_hebrew_name``.
        ratios: values to plot on the histogram.
        speed: plan speed; determines the title, the number of bins and the
            x-axis range.
    """
    title = 'היסטוגרמה מהירות הורדה: ' + vendor_to_hebrew_name(vendor_users)
    title += ' תכנית ' + str(speed) + ' מגה-ביט'
    # Plans of 200 and above get an x axis running to 220; slower plans stop
    # at the plan speed itself.
    last_xtick = speed if speed < 200 else 220
    plot_histogram(x_values=ratios,
                   title=normalize_hebrew(title),
                   x_label='מהירות',
                   y_label='מספר בדיקות',
                   # At least one bin: `speed // 10` is 0 for speeds below 10,
                   # which plot_histogram would then divide by.
                   bins=max(speed // 10, 1),
                   _range=(0, last_xtick),
                   size_of_tick=20 if speed == 200 else None)

    snapshots_path = Path("question_snapshots") / Path(
        'ground_truth_rate_histograms')
    snapshots_path.mkdir(parents=True, exist_ok=True)

    fig_path = snapshots_path / Path(title + ".png")
    print(f"saving: {fig_path}")
    plot.savefig(fig_path)
    plot.show()

示例#5

0

显示文件

def speed_test_website_scatter():
    """Scatter-plot, per test website, the average actual/test speed ratio.

    Three queries are run against ``valid_tests`` -- all resources, rows
    passing ``is_classic_resource(file_name)``, and rows whose ``file_name``
    is ``'israel_cache'`` -- and each result set is drawn as one labelled
    scatter series (website on the x axis, average ratio on the y axis).
    A dotted horizontal line at ``y=1`` is added, and the figure is saved
    into the directory returned by
    ``create_snapshot_path("website_comparison_scatter")`` and then shown.
    """
    engine = get_engine()

    all_resources_query = """
          select website_to_hebrew(website) "אתר בדיקה",
          avg(ground_truth_rate / speed_test_rate) "יחס מהירות בפועל למהירות בדיקה"
          from valid_tests
          where (ground_truth_rate / speed_test_rate) between 0.01 and 100
          and  true_or_null(is_classic_test)
          and  ground_truth_rate > 0 and speed_test_rate > 0
          and not website = 'atnt'
          group by website_to_hebrew(website)
          order by  "יחס מהירות בפועל למהירות בדיקה"
          ;
      """

    public_servers_query = """
          select website_to_hebrew(website) "אתר בדיקה",
          avg(ground_truth_rate / speed_test_rate) "יחס מהירות בפועל למהירות בדיקה"
          from valid_tests
          where (ground_truth_rate / speed_test_rate) between 0.01 and 100
          and  true_or_null(is_classic_test)
          and  ground_truth_rate > 0 and speed_test_rate > 0
          and is_classic_resource(file_name)
          and not website = 'atnt'
          group by website
          order by  "יחס מהירות בפועל למהירות בדיקה"
          ;
      """

    israel_cache_query = """
    select website_to_hebrew(website) "אתר בדיקה",      
          avg(ground_truth_rate / speed_test_rate) "יחס מהירות בפועל למהירות בדיקה"
          from valid_tests
          where (ground_truth_rate / speed_test_rate) between 0.01 and 100
          and  true_or_null(is_classic_test)
          and  ground_truth_rate > 0 and speed_test_rate > 0
          and file_name = 'israel_cache'
          and not website = 'atnt'
          group by website
          order by  "יחס מהירות בפועל למהירות בדיקה"
      ;
      """

    labels = "כל המקורות", "קבצים ציבוריים", "שרת מטמון ישראל"
    queries = [all_resources_query, public_servers_query, israel_cache_query]

    cur = engine.cursor()
    try:
        for label, query in zip(labels, queries):
            print(f"handeling: {label}")
            cur.execute(query)
            # Each row is (website name, average ratio), per the select lists.
            rows = cur.fetchall()
            plot.scatter(x=[normalize_hebrew(row[0]) for row in rows],
                         y=[row[1] for row in rows],
                         label=normalize_hebrew(label))
    finally:
        # Release the cursor even when a query or the plotting call fails.
        cur.close()

    # NOTE(review): the line's end points are hard-coded website names;
    # presumably these are the first and last categories on the x axis --
    # confirm against the query results.
    first_on_x, last_on_x = normalize_hebrew('הוט'), normalize_hebrew('גוגל')
    plot.hlines(y=1,
                xmin=first_on_x,
                xmax=last_on_x,
                colors='aqua',
                linestyles='dotted',
                lw=2,
                label=normalize_hebrew('חיזוי מדויק'))

    plot.legend(loc="best")
    plot.ylabel(normalize_hebrew('יחס בדיקת אתר למהירות בפועל'))

    # NOTE(review): a copy-pasted block used to create the unrelated
    # 'question_snapshots/ground_truth_violin_plots' directory here before
    # this assignment replaced the path. It was dropped on the assumption
    # that create_snapshot_path creates its own directory -- confirm.
    snapshots_path = create_snapshot_path("website_comparison_scatter")

    fig_path = snapshots_path / Path("השוואת ממוצעי אתרי בדיקה" + ".png")
    plot.savefig(fig_path)
    print(f"saving {fig_path}")
    plot.show()

示例#6

0

显示文件

import math
import os
import numpy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot

from pathlib import Path
from collections import defaultdict
from typing import List, Tuple
from utils import get_engine
from psycopg2.extensions import cursor
from enum import Enum
from utils import normalize_hebrew

# Hebrew label for a measured test result, passed through normalize_hebrew
# once at import time. ground_truth_main uses it as a record key and as the
# DataFrame column selected for the violin plot's y axis.
TEST_RESULT_HEBREW = normalize_hebrew("תוצאת דגימה")

# Hebrew label for the vendor, passed through normalize_hebrew once at import
# time. ground_truth_main uses it as a record key and as the DataFrame column
# selected for the violin plot's x axis.
VENDOR_HEBREW = normalize_hebrew("ספק")


class Website(Enum):
    """Closed set of website identifiers; each member's value equals its name."""
    hot = "hot"
    google = "google"
    bezeq = "bezeq"
    ookla = "ookla"
    netflix = "netflix"


class VendorUsers(Enum):
    """Vendor user groups; each value is a ``<vendor>_users`` string."""
    # NOTE(review): ground_truth_main references VendorUsers.Partner, which is
    # not among the members visible here -- this listing looks truncated;
    # confirm against the full file.
    Bezeq = "bezeq_users"
    Hot = "hot_users"

示例#7

0

显示文件

    ax.legend(facecolor='white')
    plt.savefig(dump_path)
    plt.show()
    print(f"r: {r_value}, r squared: {r_value ** 2}, p-value: {p_value}")


if __name__ == "__main__":
    # Script entry point: fit and plot a linear regression of the CAPACITY
    # field against the NUM_USERS field of the household data.

    # Student's T-Test Logic
    #
    # vendor_capacity_data = get_capacity_data()
    # t_score, p_value = ttest_ind([x[1] for x in vendor_capacity_data["hot"]],
    #                              [x[1] for x in vendor_capacity_data["bezeq"]])
    #
    # print("t test score:", t_score, "p value:", p_value)

    # Linear Regression Logic
    household_stats = get_household_data()
    x_values = np.array([row[NUM_USERS] for row in household_stats])
    y_values = np.array([row[CAPACITY] for row in household_stats])

    linear_regression(
        x_values=x_values,
        y_values=y_values,
        data_point_label=normalize_hebrew('משתמשים'),
        line_label=normalize_hebrew('קו רגרסיה'),
        xlabel=normalize_hebrew('מספר נפשות בבית'),
        ylabel=normalize_hebrew('יחס מהירות לחבילה'),
        dump_path="household_size_and_internet_capacity_correlation",
    )
    # Explicit exit, as before, but without relying on `quit`, which is an
    # interactive helper installed by the `site` module and is absent when
    # Python is started with -S.
    raise SystemExit