Пример #1
0
def test_correlation_matrix():
    """
    Check that correlation_matrix agrees, entry by entry, with calling
    correlation directly on every ordered pair of the three vectors.
    """
    vectors = [xs, ys1, ys2]
    actual = correlation_matrix(vectors)

    # Row r, column c of the expected matrix is correlation(vectors[r], vectors[c]).
    expected = [
        [correlation(row_vec, col_vec) for col_vec in vectors]
        for row_vec in vectors
    ]
    assert actual == expected
def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]:
    """
    Compute the least-squares intercept (alpha) and slope (beta)
    for the two vectors x and y, returned as (alpha, beta).
    """
    r = correlation(x, y)
    sigma_y = standard_deviation(y)
    sigma_x = standard_deviation(x)

    slope = r * sigma_y / sigma_x
    # The intercept is chosen so that mean(y) == intercept + slope * mean(x).
    intercept = mean(y) - slope * mean(x)
    return intercept, slope
def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]:
    """
    Return the pair (alpha, beta) of least-squares values
    for the vectors x and y.
    """
    # Slope: correlation scaled by the ratio of the two standard deviations.
    slope = (correlation(x, y) * standard_deviation(y)) / standard_deviation(x)
    return mean(y) - slope * mean(x), slope
def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]:
    """
    Given the training values x and y, use the method of least squares
    to find the optimal values of alpha and beta.
    """
    numerator = correlation(x, y) * standard_deviation(y)
    beta = numerator / standard_deviation(x)

    y_mean = mean(y)
    x_mean = mean(x)
    alpha = y_mean - beta * x_mean
    return (alpha, beta)
Пример #5
0
def plot_working_scatter():
    """
    Generate 1000 random_normal() draws and two noisy series derived from
    them, scatter both series against the draws, save the figure to
    im/working_scatter.png, and assert the two correlations fall near
    +0.9 and -0.9.
    """
    xs = [random_normal() for _ in range(1000)]
    ys1 = [x + random_normal() / 2 for x in xs]
    ys2 = [-x + random_normal() / 2 for x in xs]

    # (series, marker colour, legend label) for each scatter layer, in draw order.
    layers = (
        (ys1, 'black', 'ys1'),
        (ys2, 'gray', 'ys2'),
    )
    for series, shade, name in layers:
        plt.scatter(xs, series, marker='.', color=shade, label=name)

    plt.xlabel('xs')
    plt.ylabel('ys')
    plt.legend(loc=9)
    plt.title("Very Different Joint Distributions")

    # Persist the figure, then wipe the current axes.
    plt.savefig('im/working_scatter.png')
    current_axes = plt.gca()
    current_axes.clear()

    from scratch.statistics import correlation

    # NOTE(review): no seed is set inside this function, so these bounds are
    # checked against whatever random_normal() produced on this call.
    positive_r = correlation(xs, ys1)
    assert 0.89 < positive_r < 0.91
    negative_r = correlation(xs, ys2)
    assert -0.91 < negative_r < -0.89
Пример #6
0
 def correlation_ij(i: int, j: int) -> float:
     """Return the correlation between data[i] and data[j]."""
     vector_i, vector_j = data[i], data[j]
     return correlation(vector_i, vector_j)
Пример #7
0
def main():
    """
    Run a sequence of demos in order: a two-series scatter plot, a
    correlation-matrix check, two histograms, a scatterplot matrix,
    stock-price parsing and analysis, distance examples, a prime search
    with a progress bar, and a check on first_principal_component.

    Side effects: writes PNG files under im/, prints to stdout, reseeds the
    global random module with 0, and reads comma_delimited_stock_prices.csv
    and stocks.csv from the working directory.

    The following names are not defined in this function and must be
    provided by the surrounding module: random_normal, plt,
    correlation_matrix, plot_histogram, try_parse_row, StockPrice,
    day_over_day_changes, DailyChange, Dict, datetime, tqdm, de_mean,
    pca_data and first_principal_component.
    """
    # Imported up front. A function-level import makes `correlation` a local
    # name for the WHOLE function, so importing it only after the first use
    # (as before) raised UnboundLocalError at the first correlation check.
    from scratch.statistics import correlation

    xs = [random_normal() for _ in range(1000)]
    ys1 = [x + random_normal() / 2 for x in xs]
    ys2 = [-x + random_normal() / 2 for x in xs]

    plt.scatter(xs, ys1, marker='.', color='black', label='ys1')
    plt.scatter(xs, ys2, marker='.', color='gray', label='ys2')
    plt.xlabel('xs')
    plt.ylabel('ys')
    plt.legend(loc=9)
    plt.title("Very Different Joint Distributions")
    # plt.show()

    plt.savefig('im/working_scatter.png')
    plt.gca().clear()

    # I don't know why this is necessary
    plt.gca().clear()
    plt.close()

    assert 0.89 < correlation(xs, ys1) < 0.91
    assert -0.91 < correlation(xs, ys2) < -0.89

    # correlation_matrix must agree with pairwise correlation calls.
    vectors = [xs, ys1, ys2]
    assert correlation_matrix(vectors) == [
        [correlation(xs, xs),
         correlation(xs, ys1),
         correlation(xs, ys2)],
        [correlation(ys1, xs),
         correlation(ys1, ys1),
         correlation(ys1, ys2)],
        [correlation(ys2, xs),
         correlation(ys2, ys1),
         correlation(ys2, ys2)],
    ]

    import random
    from scratch.probability import inverse_normal_cdf

    random.seed(0)

    # uniform between -100 and 100
    uniform = [200 * random.random() - 100 for _ in range(10000)]

    # normal distribution with mean 0, standard deviation 57
    normal = [57 * inverse_normal_cdf(random.random()) for _ in range(10000)]

    plot_histogram(uniform, 10, "Uniform Histogram")

    plt.savefig('im/working_histogram_uniform.png')
    plt.gca().clear()
    plt.close()

    plot_histogram(normal, 10, "Normal Histogram")

    plt.savefig('im/working_histogram_normal.png')
    plt.gca().clear()

    print(correlation(xs, ys1))  # about 0.9
    print(correlation(xs, ys2))  # about -0.9

    from typing import List

    # Just some random data to show off correlation scatterplots
    num_points = 100

    def random_row() -> List[float]:
        """Build one 4-element row; each entry after the first depends on earlier ones."""
        row = [0.0, 0, 0, 0]
        row[0] = random_normal()
        row[1] = -5 * row[0] + random_normal()
        row[2] = row[0] + row[1] + 5 * random_normal()
        row[3] = 6 if row[2] > -2 else 0
        return row

    random.seed(0)
    # each row has 4 points, but really we want the columns
    corr_rows = [random_row() for _ in range(num_points)]

    # Transpose the rows into columns.
    corr_data = [list(col) for col in zip(*corr_rows)]

    # corr_data is a list of four 100-d vectors
    num_vectors = len(corr_data)
    fig, ax = plt.subplots(num_vectors, num_vectors)

    for i in range(num_vectors):
        for j in range(num_vectors):

            # Scatter column_j on the x-axis vs column_i on the y-axis,
            # unless i == j, in which case show the series name.
            if i != j:
                ax[i][j].scatter(corr_data[j], corr_data[i])
            else:
                ax[i][j].annotate("series " + str(i), (0.5, 0.5),
                                  xycoords='axes fraction',
                                  ha="center",
                                  va="center")

            # Then hide axis labels except left and bottom charts
            if i < num_vectors - 1:
                ax[i][j].xaxis.set_visible(False)
            if j > 0:
                ax[i][j].yaxis.set_visible(False)

    # Fix the bottom right and top left axis labels, which are wrong because
    # their charts only have text in them
    ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
    ax[0][0].set_ylim(ax[0][1].get_ylim())

    # plt.show()

    plt.savefig('im/working_scatterplot_matrix.png')
    plt.gca().clear()
    plt.close()
    plt.clf()

    import csv

    data: List[StockPrice] = []

    # First pass: a file that may contain bad rows; report and skip them.
    with open("comma_delimited_stock_prices.csv") as f:
        reader = csv.reader(f)
        for row in reader:
            maybe_stock = try_parse_row(row)
            if maybe_stock is None:
                print(f"skipping invalid row: {row}")
            else:
                data.append(maybe_stock)

    # NOTE(review): `parse` is not used anywhere in this function.
    from dateutil.parser import parse

    with open("stocks.csv", "r") as f:
        reader = csv.DictReader(f)
        rows = [[row['Symbol'], row['Date'], row['Close']] for row in reader]

    # DictReader has already consumed the header line, so every entry in
    # `rows` is a data row.
    maybe_data = [try_parse_row(row) for row in rows]

    # Make sure they all loaded successfully:
    assert maybe_data
    assert all(sp is not None for sp in maybe_data)

    # This is just to make mypy happy
    # (it also replaces the `data` list built from the first file above).
    data = [sp for sp in maybe_data if sp is not None]

    max_aapl_price = max(stock_price.closing_price for stock_price in data
                         if stock_price.symbol == "AAPL")

    from collections import defaultdict

    # Highest closing price seen per symbol; unseen symbols start at -inf.
    max_prices: Dict[str, float] = defaultdict(lambda: float('-inf'))

    for sp in data:
        symbol, closing_price = sp.symbol, sp.closing_price
        if closing_price > max_prices[symbol]:
            max_prices[symbol] = closing_price

    # Collect the prices by symbol
    prices: Dict[str, List[StockPrice]] = defaultdict(list)

    for sp in data:
        prices[sp.symbol].append(sp)

    # Order the prices by date
    # NOTE(review): sorted() uses StockPrice's own ordering; this presumably
    # sorts by date within one symbol - confirm against StockPrice.
    prices = {
        symbol: sorted(symbol_prices)
        for symbol, symbol_prices in prices.items()
    }

    all_changes = [
        change for symbol_prices in prices.values()
        for change in day_over_day_changes(symbol_prices)
    ]

    max_change = max(all_changes, key=lambda change: change.pct_change)
    # see, e.g. http://news.cnet.com/2100-1001-202143.html
    assert max_change.symbol == 'AAPL'
    assert max_change.date == datetime.date(1997, 8, 6)
    assert 0.33 < max_change.pct_change < 0.34

    min_change = min(all_changes, key=lambda change: change.pct_change)
    # see, e.g. http://money.cnn.com/2000/09/29/markets/techwrap/
    assert min_change.symbol == 'AAPL'
    assert min_change.date == datetime.date(2000, 9, 29)
    assert -0.52 < min_change.pct_change < -0.51

    # Month number (1-12) -> every daily change that fell in that month.
    # (The annotation previously said List[DailyChange], but this is a dict.)
    changes_by_month: Dict[int, List[DailyChange]] = {
        month: [] for month in range(1, 13)
    }

    for change in all_changes:
        changes_by_month[change.date.month].append(change)

    avg_daily_change = {
        month: sum(change.pct_change for change in changes) / len(changes)
        for month, changes in changes_by_month.items()
    }

    # October is the best month
    assert avg_daily_change[10] == max(avg_daily_change.values())

    from scratch.linear_algebra import distance

    a_to_b = distance([63, 150], [67, 160])  # 10.77
    a_to_c = distance([63, 150], [70, 171])  # 22.14
    b_to_c = distance([67, 160], [70, 171])  # 11.40

    # Same three comparisons with a different first coordinate per point.
    a_to_b = distance([160, 150], [170.2, 160])  # 14.28
    a_to_c = distance([160, 150], [177.8, 171])  # 27.53
    b_to_c = distance([170.2, 160], [177.8, 171])  # 13.37

    def primes_up_to(n: int) -> List[int]:
        """Return the primes below n, updating a tqdm progress description as it goes."""
        primes = [2]

        with tqdm.trange(3, n) as t:
            for i in t:
                # i is prime if no smaller prime divides it.
                i_is_prime = not any(i % p == 0 for p in primes)
                if i_is_prime:
                    primes.append(i)

                t.set_description(f"{len(primes)} primes")

        return primes

    my_primes = primes_up_to(100_000)

    de_meaned = de_mean(pca_data)
    fpc = first_principal_component(de_meaned)
    assert 0.923 < fpc[0] < 0.925
    assert 0.382 < fpc[1] < 0.384
def least_squares_fit(x, y):
    """Return (alpha, beta): the intercept and slope computed from x and y."""
    slope = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    y_bar = mean(y)
    x_bar = mean(x)
    return y_bar - slope * x_bar, slope
def least_squares_fit(x, y):
    """From training values x and y, compute the least-squares alpha and beta."""
    r, sd_y, sd_x = correlation(x, y), standard_deviation(y), standard_deviation(x)
    gradient = r * sd_y / sd_x
    offset = mean(y) - gradient * mean(x)
    return offset, gradient
Пример #10
0
def main():
    """
    Run a sequence of demos in order: two histograms, a scatterplot matrix,
    parsing of a CSV of stock prices, a prime search with a progress bar,
    and a check on the output of first_principal_component.

    Side effects visible here: saves three PNG files under im/, prints two
    correlation values and any skipped CSV rows, calls random.seed(0) twice,
    and reads comma_delimited_stock_prices.csv from the working directory.

    xs, ys1, ys2, plt, plot_histogram, random_normal, try_parse_row,
    StockPrice, tqdm, de_mean, pca_data and first_principal_component are
    not defined in this function and must come from the surrounding module.
    """

    # I don't know why this is necessary
    plt.gca().clear()
    plt.close()

    import random
    from scratch.probability import inverse_normal_cdf

    random.seed(0)

    # uniform between -100 and 100
    uniform = [200 * random.random() - 100 for _ in range(10000)]

    # normal distribution with mean 0, standard deviation 57
    normal = [57 * inverse_normal_cdf(random.random()) for _ in range(10000)]

    plot_histogram(uniform, 10, "Uniform Histogram")

    # Save the figure, then clear the axes and close it before the next plot.
    plt.savefig('im/working_histogram_uniform.png')
    plt.gca().clear()
    plt.close()

    plot_histogram(normal, 10, "Normal Histogram")

    plt.savefig('im/working_histogram_normal.png')
    plt.gca().clear()

    from scratch.statistics import correlation

    # xs, ys1 and ys2 are read from outside this function.
    print(correlation(xs, ys1))  # about 0.9
    print(correlation(xs, ys2))  # about -0.9

    from typing import List

    # Just some random data to show off correlation scatterplots
    num_points = 100

    def random_row() -> List[float]:
        # Each entry after the first is computed from earlier entries of the row.
        row = [0.0, 0, 0, 0]
        row[0] = random_normal()
        row[1] = -5 * row[0] + random_normal()
        row[2] = row[0] + row[1] + 5 * random_normal()
        row[3] = 6 if row[2] > -2 else 0
        return row

    random.seed(0)
    # each row has 4 points, but really we want the columns
    corr_rows = [random_row() for _ in range(num_points)]

    # Transpose: zip(*rows) yields one tuple per column.
    corr_data = [list(col) for col in zip(*corr_rows)]

    # corr_data is a list of four 100-d vectors
    num_vectors = len(corr_data)
    fig, ax = plt.subplots(num_vectors, num_vectors)

    for i in range(num_vectors):
        for j in range(num_vectors):

            # Scatter column_j on the x-axis vs column_i on the y-axis,
            if i != j:
                ax[i][j].scatter(corr_data[j], corr_data[i])

                # unless i == j, in which case show the series name.
            else:
                ax[i][j].annotate("series " + str(i), (0.5, 0.5),
                                  xycoords='axes fraction',
                                  ha="center",
                                  va="center")

            # Then hide axis labels except left and bottom charts
            if i < num_vectors - 1: ax[i][j].xaxis.set_visible(False)
            if j > 0: ax[i][j].yaxis.set_visible(False)

    # Fix the bottom right and top left axis labels, which are wrong because
    # their charts only have text in them
    ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
    ax[0][0].set_ylim(ax[0][1].get_ylim())

    # plt.show()

    plt.savefig('im/working_scatterplot_matrix.png')
    plt.gca().clear()
    plt.close()
    plt.clf()

    import csv

    # Parsed rows are collected here; `data` is not read again in this function.
    data: List[StockPrice] = []

    with open("comma_delimited_stock_prices.csv") as f:
        reader = csv.reader(f)
        for row in reader:
            maybe_stock = try_parse_row(row)
            # try_parse_row signals a bad row by returning None.
            if maybe_stock is None:
                print(f"skipping invalid row: {row}")
            else:
                data.append(maybe_stock)

    from typing import List

    def primes_up_to(n: int) -> List[int]:
        # Trial division of every i in [3, n) by the primes found so far,
        # with the tqdm description showing the running count.
        primes = [2]

        with tqdm.trange(3, n) as t:
            for i in t:
                # i is prime if no smaller prime divides it.
                i_is_prime = not any(i % p == 0 for p in primes)
                if i_is_prime:
                    primes.append(i)

                t.set_description(f"{len(primes)} primes")

        return primes

    my_primes = primes_up_to(100_000)

    # The first two components of the result must fall in these narrow ranges.
    de_meaned = de_mean(pca_data)
    fpc = first_principal_component(de_meaned)
    assert 0.923 < fpc[0] < 0.925
    assert 0.382 < fpc[1] < 0.384
Пример #11
0
# Second noisy series: each value is -x plus half of a random_normal() draw.
# xs, ys1, random_normal and plt are not defined in the lines shown here.
ys2 = [-x + random_normal() / 2 for x in xs]

# Plot both series against the same xs, with a legend entry for each.
plt.scatter(xs, ys1, marker='.', color='black', label='ys1')
plt.scatter(xs, ys2, marker='.', color='gray', label='ys2')
plt.xlabel('xs')
plt.ylabel('ys')
plt.legend(loc=9)
plt.title("Very Different Joint Distributions")
# plt.show()

# Write the figure to disk, then clear the current axes.
plt.savefig('im/working_scatter.png')
plt.gca().clear()

from scratch.statistics import correlation

# ys1 should correlate with xs at about +0.9 and ys2 at about -0.9.
assert 0.89 < correlation(xs, ys1) < 0.91
assert -0.91 < correlation(xs, ys2) < -0.89

from scratch.linear_algebra import Matrix, Vector, make_matrix


def correlation_matrix(data: List[Vector]) -> Matrix:
    """
    Build the square matrix of pairwise correlations: it has len(data) rows
    and columns, and entry (i, j) is correlation(data[i], data[j]).
    """
    size = len(data)
    return make_matrix(size, size, lambda i, j: correlation(data[i], data[j]))
# Checks of the stat module's summary statistics against known values.
# stat, num_friends, daily_minutes and daily_hours are not defined in the
# lines shown here.
assert stat.quantile(num_friends, 0.10) == 1
assert stat.quantile(num_friends, 0.25) == 3
assert stat.quantile(num_friends, 0.75) == 9
assert stat.quantile(num_friends, 0.90) == 13

# Compared as a set, so the order in which mode() returns values is ignored.
assert set(stat.mode(num_friends)) == {1, 6}

assert stat.data_range(num_friends) == 99

assert 81.54 < stat.variance(num_friends) < 81.55

assert 9.02 < stat.standard_deviation(num_friends) < 9.04

assert stat.interquartile_range(num_friends) == 6

# The daily_hours bounds are the daily_minutes bounds divided by 60.
# presumably daily_hours is daily_minutes / 60 - TODO confirm at its definition
assert 22.42 < stat.covariance(num_friends, daily_minutes) < 22.43
assert 22.42 / 60 < stat.covariance(num_friends, daily_hours) < 22.43 / 60

# The same correlation bounds are asserted for both the minutes and hours series.
assert 0.24 < stat.correlation(num_friends, daily_minutes) < 0.25
assert 0.24 < stat.correlation(num_friends, daily_hours) < 0.25

# Position of the first entry equal to 100; it is dropped from both series below.
outlier = num_friends.index(100)

num_friends_good = [x for i, x in enumerate(num_friends) if i != outlier]

daily_minutes_good = [x for i, x in enumerate(daily_minutes) if i != outlier]

# Convert the filtered minutes to hours.
daily_hours_good = [m / 60 for m in daily_minutes_good]

# With that one position removed, the asserted correlation range is higher.
assert 0.57 < stat.correlation(num_friends_good, daily_hours_good) < 0.58
 def correlation_ij(i: int, j: int) -> float:
     # Entry (i, j): correlate the i-th vector of data with the j-th.
     pair = (data[i], data[j])
     return correlation(*pair)
def main():
    """
    Run a sequence of demos in order: two histograms, a scatterplot matrix,
    parsing of a CSV of stock prices, a prime search with a progress bar,
    and a check on the output of first_principal_component.

    Side effects visible here: saves three PNG files under im/, prints two
    correlation values and any skipped CSV rows, calls random.seed(0) twice,
    and reads comma_delimited_stock_prices.csv from the working directory.

    xs, ys1, ys2, plt, plot_histogram, random_normal, try_parse_row,
    StockPrice, tqdm, de_mean, pca_data and first_principal_component are
    not defined in this function and must come from the surrounding module.
    """

    # I don't know why this is necessary
    plt.gca().clear()
    plt.close()

    import random
    from scratch.probability import inverse_normal_cdf

    random.seed(0)

    # uniform between -100 and 100
    uniform = [200 * random.random() - 100 for _ in range(10000)]

    # normal distribution with mean 0, standard deviation 57
    normal = [57 * inverse_normal_cdf(random.random())
              for _ in range(10000)]

    plot_histogram(uniform, 10, "Uniform Histogram")



    # Save the figure, then clear the axes and close it before the next plot.
    plt.savefig('im/working_histogram_uniform.png')
    plt.gca().clear()
    plt.close()

    plot_histogram(normal, 10, "Normal Histogram")


    plt.savefig('im/working_histogram_normal.png')
    plt.gca().clear()

    from scratch.statistics import correlation

    # xs, ys1 and ys2 are read from outside this function.
    print(correlation(xs, ys1))      # about 0.9
    print(correlation(xs, ys2))      # about -0.9



    from typing import List

    # Just some random data to show off correlation scatterplots
    num_points = 100

    def random_row() -> List[float]:
       # Each entry after the first is computed from earlier entries of the row.
       row = [0.0, 0, 0, 0]
       row[0] = random_normal()
       row[1] = -5 * row[0] + random_normal()
       row[2] = row[0] + row[1] + 5 * random_normal()
       row[3] = 6 if row[2] > -2 else 0
       return row

    random.seed(0)
    # each row has 4 points, but really we want the columns
    corr_rows = [random_row() for _ in range(num_points)]

    # Transpose: zip(*rows) yields one tuple per column.
    corr_data = [list(col) for col in zip(*corr_rows)]

    # corr_data is a list of four 100-d vectors
    num_vectors = len(corr_data)
    fig, ax = plt.subplots(num_vectors, num_vectors)

    for i in range(num_vectors):
        for j in range(num_vectors):

            # Scatter column_j on the x-axis vs column_i on the y-axis,
            if i != j: ax[i][j].scatter(corr_data[j], corr_data[i])

            # unless i == j, in which case show the series name.
            else: ax[i][j].annotate("series " + str(i), (0.5, 0.5),
                                    xycoords='axes fraction',
                                    ha="center", va="center")

            # Then hide axis labels except left and bottom charts
            if i < num_vectors - 1: ax[i][j].xaxis.set_visible(False)
            if j > 0: ax[i][j].yaxis.set_visible(False)

    # Fix the bottom right and top left axis labels, which are wrong because
    # their charts only have text in them
    ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
    ax[0][0].set_ylim(ax[0][1].get_ylim())

    # plt.show()



    plt.savefig('im/working_scatterplot_matrix.png')
    plt.gca().clear()
    plt.close()
    plt.clf()

    import csv

    # Parsed rows are collected here; `data` is not read again in this function.
    data: List[StockPrice] = []

    with open("comma_delimited_stock_prices.csv") as f:
        reader = csv.reader(f)
        for row in reader:
            maybe_stock = try_parse_row(row)
            # try_parse_row signals a bad row by returning None.
            if maybe_stock is None:
                print(f"skipping invalid row: {row}")
            else:
                data.append(maybe_stock)

    from typing import List

    def primes_up_to(n: int) -> List[int]:
        # Trial division of every i in [3, n) by the primes found so far,
        # with the tqdm description showing the running count.
        primes = [2]

        with tqdm.trange(3, n) as t:
            for i in t:
                # i is prime if no smaller prime divides it.
                i_is_prime = not any(i % p == 0 for p in primes)
                if i_is_prime:
                    primes.append(i)

                t.set_description(f"{len(primes)} primes")

        return primes

    my_primes = primes_up_to(100_000)



    # The first two components of the result must fall in these narrow ranges.
    de_meaned = de_mean(pca_data)
    fpc = first_principal_component(de_meaned)
    assert 0.923 < fpc[0] < 0.925
    assert 0.382 < fpc[1] < 0.384
# Scatter the ys2 series against xs; xs, ys1, ys2 and plt are not defined in
# the lines shown here.
plt.scatter(xs, ys2, marker='.', color='gray',  label='ys2')
plt.xlabel('xs')
plt.ylabel('ys')
plt.legend(loc=9)
plt.title("Very Different Joint Distributions")
# plt.show()


# Write the figure to disk, then clear the current axes.
plt.savefig('im/working_scatter.png')
plt.gca().clear()


from scratch.statistics import correlation


# ys1 should correlate with xs at about +0.9 and ys2 at about -0.9.
assert 0.89 < correlation(xs, ys1) < 0.91
assert -0.91 < correlation(xs, ys2) < -0.89

from scratch.linear_algebra import Matrix, Vector, make_matrix

def correlation_matrix(data: List[Vector]) -> Matrix:
    """
    Return a len(data)-by-len(data) matrix in which the entry at row i,
    column j is the correlation of data[i] with data[j].
    """
    dimension = len(data)

    def correlation_ij(i: int, j: int) -> float:
        # Look up both vectors first, then correlate them.
        first, second = data[i], data[j]
        return correlation(first, second)

    return make_matrix(dimension, dimension, correlation_ij)