def plot_encoding_B_functionality(all_files, changing_value, scale):
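    """Plot the expected number of links needed per encoded word as the
    varied parameter (``changing_value``: 'F' or 'L') changes.

    all_files: sequence of (X_val, X_files) pairs, where X_files is a list
    of pickled words_stats file names and X_val selects the entry used from
    lines_styles_params.
    """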
    plot_params = []
    for X_val, X_files in all_files:
        current_X_points = [[], []]  # [x values, y values]
        for stats_file in X_files:
            stats = words_stats.load_stats(stats_file)
            num_words = stats.L + stats.D + stats.F
            if changing_value == 'F':
                current_X_points[0].append(stats.F)
            elif changing_value == 'L':
                current_X_points[0].append(stats.L)

            # Keep all encodings except those for messages like
            # ['wow', 'wow', ...]: attempts whose fifth field falls below
            # half the word count are dropped.
            flow = [x for x in stats.encoding_flow if x[4] >= num_words / 2]

            # Cap the expectation so one pathological flow cannot dominate
            # the plot.
            expected_links_needed_per_word = min(find_expected_num_of_links(stats, flow), 1000000)
            current_X_points[1].append(expected_links_needed_per_word)

        print 'current_X_points[0]:', current_X_points[0]
        print 'current_X_points[1]:', current_X_points[1]
        plot_params.append((current_X_points[0], current_X_points[1], X_val))

    plt.figure(figsize=figsize)
    for x_axis_values, y_axis_values, X_val in plot_params:
        line_properties = lines_styles_params[X_val]

        # Lightly smooth each curve with piecewise-linear interpolation
        # sampled at 40 evenly spaced points.
        x_narray = np.array(x_axis_values)
        y_narray = np.array(y_axis_values)
        x_line_space = np.linspace(x_narray.min(), x_narray.max(), 40)
        f_smooth = interp1d(x_narray, y_narray, kind='slinear')

        plt.plot(x_line_space, f_smooth(x_line_space), line_properties[0],
                 linewidth=line_properties[1], label=line_properties[2])

    plt.ylabel('Expected Number of Links')
    plt.legend(loc='lower right', frameon=False, prop={'size': 14})
    plt.axis(scale)
    if changing_value == 'F':
        plt.xlabel('F (Number of Function Words)')
        plt.savefig('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_plots/plot_B.pdf', bbox_inches='tight')
    elif changing_value == 'L':
        plt.xlabel('L (Number of Link Words)')
        plt.savefig('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_plots/plot_C.pdf', bbox_inches='tight')
    plt.show()
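
# Example usage (hypothetical X values and file names; the pickled stats
# files are assumed to exist in the default words_stats location):
#
#     all_files = [(100, ['stats_100_... .pkl']),
#                  (250, ['stats_256_... .pkl'])]
#     plot_encoding_B_functionality(all_files, changing_value='F',
#                                   scale=[0, 100, 0, 50])
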
def create_list_of_links_to_be_manually_inspected(path='/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_a_results/'):
    file_names = listdir(path)
    for stats_file in file_names:
        stats = words_stats.load_stats(stats_file, path)
        # Skip the first collected entry; fields 1 and 2 of each remaining
        # entry are the link URL and its associated value.
        csv_str_list = [x[1] + ',' + str(x[2]) for x in stats.collected_words[1:]]
        file_name = '/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_a_results_csv/' + stats_file + '.csv'

        with open(file_name, 'w') as f:
            for link_line in csv_str_list:
                f.write('%s\n' % link_line)
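
# Each CSV written above holds one "<url>,<value>" row per collected word
# (fields 1 and 2 of the collected_words entries).
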
def compute_all_download_times(path='/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_a_results/'):
    file_names = listdir(path)
    times_of_all_files = []
    for i, file_name in enumerate(file_names):
        print 'starting file number:', i, 'file name:', file_name
        stats = words_stats.load_stats(file_name, path)
        urls = [x[1] for x in stats.collected_words[1:]]

        current_file_times_sum = 0.
        for j, url in enumerate(urls):  # j, so the outer index i is not shadowed
            print 'currently at link:', j
            download_time = time_page(url)
            current_file_times_sum += download_time
        print 'sum is:', current_file_times_sum
        times_of_all_files.append(current_file_times_sum)

    timing_file_name = '/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/timing.txt'
    with open(timing_file_name, 'w') as f:
        for timing in times_of_all_files:
            f.write('%s\n' % timing)
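
# time_page is defined elsewhere in the project; a minimal sketch of the
# assumed behavior (wall-clock time to fetch a URL), not the actual helper:
#
#     import time, urllib2
#
#     def time_page(url):
#         start = time.time()
#         urllib2.urlopen(url).read()
#         return time.time() - start
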
def plot_encoding_D():
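    """Plot the CDF of the number of links required per encoded word for
    four stats runs (X = 100, 250, 500 and 750)."""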

    X_100_file = 'stats_100_1_1_2_0_tweet_1_2015-01-17 21:27:54.405155.pkl'
    X_250_file = 'stats_256_1_1_2_0_tweet_1_2015-01-17 22:10:22.532923.pkl'
    X_500_file = 'stats_512_1_1_2_0_tweet_1_2015-01-17 22:56:16.653616.pkl'
    X_750_file = 'stats_749_1_1_2_0_tweet_1_2015-01-14 20:40:46.187508.pkl'

    all_files = [(100, X_100_file), (250, X_250_file), (500, X_500_file), (750, X_750_file)]

    plt.figure(figsize=figsize)
    for X, stats_file in all_files:
        stats = words_stats.load_stats(stats_file)

        # Split encoding_flow into chunks, one per encoded word: a new chunk
        # starts whenever the word identifier (first field) changes.
        chunks_starts = [0]
        for i in range(1, len(stats.encoding_flow)):
            if stats.encoding_flow[i][0] != stats.encoding_flow[i - 1][0]:
                chunks_starts.append(i)

        chunks = []
        for i in range(len(chunks_starts) - 1):
            chunks.append(stats.encoding_flow[chunks_starts[i]:chunks_starts[i + 1]])
        chunks.append(stats.encoding_flow[chunks_starts[-1]:])

        num_words = stats.L + stats.D + stats.F
        # Drop degenerate chunks (messages like ['wow', 'wow', ...]).
        good_chunks = [chunk for chunk in chunks if chunk[0][4] >= num_words / 2]
        # For chunks that ended in success, record the number of links used.
        num_links_required_in_successful_chunks = [x[-1][1] for x in good_chunks if x[-1][2]]
        if not num_links_required_in_successful_chunks:
            # No chunk succeeded outright; fall back to the expected number
            # of links per chunk.
            num_links_required_in_successful_chunks = [
                find_expected_num_of_links(stats, chunk) for chunk in good_chunks]

        # Empirical CDF: fraction of words encoded within i links or fewer.
        total_number_of_attempts = float(len(good_chunks))
        cdf = []
        for i in range(int(max(num_links_required_in_successful_chunks)) + 30):
            cdf.append(len([x for x in num_links_required_in_successful_chunks if x <= i])
                       / total_number_of_attempts)

        plt.plot(cdf, lines_styles_params[X][0], linewidth=lines_styles_params[X][1], label=lines_styles_params[X][2])
    plt.axis([0, 20, 0., 1.1])
    plt.xlabel('Number of Links')
    plt.ylabel('CDF')
    plt.legend(loc='best', frameon=False)
    plt.savefig('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_plots/plot_D.pdf', bbox_inches='tight')
    plt.show()
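
# A vectorized equivalent of the CDF loop in plot_encoding_D; a sketch (not
# called by the code above) that yields the same values for the same inputs:
def empirical_cdf(samples, n_attempts, pad=30):
    # Fraction of samples that are <= i, for i = 0 .. max(samples) + pad - 1.
    xs = np.arange(int(max(samples)) + pad)
    sorted_samples = np.sort(np.asarray(samples, dtype=float))
    return np.searchsorted(sorted_samples, xs, side='right') / float(n_attempts)
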
def plot():
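    """Exploratory plots for a single stats file: CDF and histogram of the
    number of links passed per successful search, plus essence scatter
    plots."""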

    stats = load_stats('/Users/uriklarman/Development/PycharmProjects/no_git/jumping_the_net/resources/a_stats/stats_7_750_2015-01-01 23:43:10.180217.pkl')

    values = stats.search_stats.values()

    # For each word's search trace, count the links passed between
    # consecutive successful searches (entries whose third field equals 1.).
    num_links_required = []
    total_number_of_attempts = 0
    for words_search in values:
        search_end = [x for x in words_search if x[2] == 1.]
        total_number_of_attempts += max(1, len(search_end))
        start_i = 0
        for end in search_end:
            # Absolute index of this success, searching from start_i onward.
            end_i = words_search.index(end, start_i)
            num_links_required.append(end_i - start_i + 1)
            start_i = end_i + 1

    total_number_of_attempts = float(total_number_of_attempts)
    # Empirical CDF: fraction of attempts that succeeded within i links or fewer.
    cdf = []
    for i in range(131):
        cdf.append(len([x for x in num_links_required if x <= i]) / total_number_of_attempts)

    plt.axis([0, 130, 0., 1.1])
    plt.plot(cdf, '-r')
    plt.xlabel('# of links passed to find feasible essence')
    plt.ylabel('CDF')
    plt.grid(True)
    plt.show()


    # Histogram of the same data; each entry is weighted by
    # 100 / total_number_of_attempts so bar heights are percentages.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(num_links_required,
            weights=np.zeros_like(num_links_required) + 100. / total_number_of_attempts,
            bins=range(0, 132, 2))
    plt.axis([0, 130, 0., 100])
    plt.xlabel('# of links passed to find feasible essence')
    plt.ylabel('% of links')

    plt.show()


    # Scatter data: each search point holds (uncut essence size, fraction of
    # words in the uncut essence, fraction of words in the cut essence).
    uncut_essence_size = []
    percent_of_words_in_uncut_essence = []
    percent_of_words_in_essence = []
    for point_list in stats.search_stats.values():
        for point in point_list:
            uncut_essence_size.append(point[0])
            percent_of_words_in_uncut_essence.append(point[1])
            percent_of_words_in_essence.append(point[2])

    plt.xlabel('size of uncut essence')
    plt.ylabel('fraction of words in uncut essence')
    plt.plot(uncut_essence_size, percent_of_words_in_uncut_essence, 'or')
    plt.axis([0, 101, 0, 1.1])
    plt.show()

    plt.xlabel('size of uncut essence')
    plt.ylabel('fraction of words in essence')
    plt.plot(uncut_essence_size, percent_of_words_in_essence, 'ob')
    plt.axis([0, 101, 0, 1.1])
    plt.show()
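

# The weights idiom in plot() turns raw counts into percentages in a single
# hist call; a self-contained sketch of the same trick (hypothetical helper,
# not used above):
def percent_hist(ax, samples, total, bins):
    # Each sample contributes 100/total to its bin, so bar heights are the
    # percentage of `total` that falls in each bin.
    weights = np.zeros_like(samples, dtype=float) + 100. / total
    return ax.hist(samples, weights=weights, bins=bins)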