def plot_views_cits_corr():
    """Plot a histogram of per-paper Pearson correlations between monthly
    view increments and citation increments, saved to 'corr_views_cits.pdf'.

    Reads the citation series from data/series_cits.json and the view
    series via read_file; papers whose series cannot be aligned or whose
    increments are constant are skipped.
    """
    # Context manager ensures the JSON file is closed even on error.
    with open('data/series_cits.json', 'r') as input_cits:
        series_cits = json.load(input_cits)
    # Convert 'YYYY-MM' labels to fractional years (e.g. '2015-06' -> 2015.5)
    # so they can be matched against the xs time axis below.
    months_cits = [
        int(m.split('-')[0]) + int(m.split('-')[1]) / 12
        for m in series_cits['months']
    ]

    data = read_file.load_data()
    data = read_file.filter_outliers(data)

    corrs = []
    for i, _, _, xs, ys, _ in data:
        try:
            # Align the citation series with the time span of the view series.
            idx_begin = months_cits.index(xs[0])
            idx_end = months_cits.index(xs[-1])
            x = series_cits['data'][i]['citations'][idx_begin:idx_end + 1]
            x = np.diff(x)  # per-interval citation increments
            y = np.diff(ys)  # per-interval view increments
            # Constant series have zero variance; Pearson r is undefined there.
            if np.count_nonzero(x[0] == x) == len(x) or np.count_nonzero(
                    y[0] == y) == len(y):
                continue

            # Require more than half of the increments to be non-zero in both
            # series before trusting the correlation.
            if np.count_nonzero(x) > len(x) / 2 and np.count_nonzero(
                    y) > len(y) / 2:
                corrs.append(pearsonr(x, y)[0])
        except (ValueError, KeyError, IndexError, TypeError):
            # ValueError: month not found in months_cits (or pearsonr failure);
            # KeyError/IndexError/TypeError: paper i absent or malformed in the
            # citation series.  Skip such papers, as the bare except did before.
            continue

    plt.hist(corrs, bins=100)
    plt.title('correlação entre número de visualizações e número de citações')
    plt.savefig('corr_views_cits.pdf')
def get_dois(n, filename):
    """Return the DOIs of every paper in *filename* whose segment list has
    exactly *n* entries, after outlier filtering."""
    samples = read_file.filter_outliers(read_file.load_data(filename))
    return [doi for doi, segments, *_ in samples if len(segments) == n]
def get_data_by_number_segm():
    """Group time spans and final view counts by number of segments.

    Returns:
        (freq_delta_t, freq_views): two defaultdicts keyed by segment count N.
        freq_delta_t[N] lists each paper's observed time span (last minus
        first time point) and freq_views[N] lists its final cumulative view
        count.
    """
    data = read_file.load_data()
    data = read_file.filter_outliers(data)
    # defaultdict(list) is the idiomatic grouping container (and, unlike a
    # lambda default factory, picklable).
    freq_delta_t = defaultdict(list)
    freq_views = defaultdict(list)
    for sample in data:
        n_segments = len(sample[1])
        delta_t = sample[3][-1] - sample[3][0]  # total observed time span
        final_views = sample[-2][-1]  # last value of the views series
        freq_delta_t[n_segments].append(delta_t)
        freq_views[n_segments].append(final_views)

    return freq_delta_t, freq_views
def get_all_data():
    """Return (delta_t, views) pairs for every paper with a non-zero final
    view count.

    delta_t is the observed time span (last minus first time point) and
    views is the final cumulative view count.
    """
    data = read_file.load_data()
    data = read_file.filter_outliers(data)
    freq = []
    for sample in data:
        delta_t = sample[3][-1] - sample[3][0]  # total observed time span
        views = sample[-2][-1]  # final cumulative view count
        if views == 0:
            # Papers that never accumulated views carry no signal here.
            continue
        freq.append((delta_t, views))
    return freq
def plt_views(dois, labels):
    """Histogram the final view counts of *dois*, one subplot per cluster
    label, saved to 'views_dist_labels.pdf'.

    Args:
        dois: collection of DOIs to include.
        labels: one cluster label per selected DOI; label -1 marks
            unclustered papers and is skipped.  NOTE(review): assumes the
            label order matches the order papers appear in load_data() —
            confirm against the caller.
    """
    data = read_file.load_data()
    data = read_file.filter_outliers(data)

    doi_set = set(dois)  # O(1) membership tests instead of O(n) list scans
    views = np.asarray(
        [ys[-1] for i, s, b, xs, ys, p in data if i in doi_set])
    # Boolean indexing below (labels == u) needs an ndarray; the call sites
    # build `labels` as a plain Python list, where `labels == u` is a scalar.
    labels = np.asarray(labels)

    unique = np.unique(labels)
    fig, axes = plt.subplots(1, 3, sharex=True, sharey=True, figsize=(9, 3))

    for u in unique:
        if u == -1:
            continue  # -1 marks papers outside the three largest clusters
        axes[u].hist(views[labels == u], label=u, bins=30)
        axes[u].legend()

    plt.savefig('views_dist_labels.pdf')
import json
import numpy as np
from read_file import load_data, filter_outliers
# Cluster-label export: for each segment count N, map each DOI whose series
# has N segments to the index of one of its three largest clusters (or -1),
# and write the mapping to doi2cluster_<N>_3.json.
sources = ['clusters\\clusters\\clusters_ind_single_0.50_2.txt',
           'clusters\\clusters\\clusters_ind_single_0.35_3.txt',
           'clusters\\clusters\\clusters_ind_single_0.47_4.txt',
           'clusters\\clusters\\clusters_ind_single_0.56_5.txt']

data = load_data()
data = filter_outliers(data)
for N, source in zip([2, 3, 4, 5], sources):
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int is the supported equivalent and yields the same dtype from loadtxt.
    labels = np.loadtxt(source, dtype=int).tolist()
    unique, counts = np.unique(labels, return_counts=True)
    # Keep only clusters with at least 10 members, then the three largest.
    unique = unique[counts >= 10]
    counts = counts[counts >= 10]
    unique_idxs = np.argsort(counts)[-3:]
    unique = unique[unique_idxs].tolist()
    # Re-map raw cluster ids to 0..2 (by size); everything else becomes -1.
    labels = [unique.index(l) if l in unique else -1 for l in labels]

    # DOIs of the papers with exactly N segments, in load_data() order.
    # NOTE(review): assumes the label file rows follow this same order.
    dois = [i for i, s, b, xs, ys, p in data if len(s) == N]
    # Sanity check: the label file should line up one-to-one with the DOIs.
    print(len(dois), len(labels))
    doi2cluster = dict(zip(dois, labels))
    with open('doi2cluster_%d_3.json' % N, 'w') as out:
        out.write(json.dumps(doi2cluster))

    # NOTE(review): the remainder of the loop body looks like leftover scratch
    # work — `labels3` and the reloaded `data` are not used by any visible
    # code.  A redundant re-assignment of `sources` (rebuilding the identical
    # list every iteration) was dropped; the rest is kept in case code past
    # this chunk depends on it.
    labels3 = np.loadtxt(sources[1], dtype=int)
    unique, count = np.unique(labels3, return_counts=True)
    unique = unique[count >= 10]
    count = count[count >= 10]
    unique_idxs = np.argsort(count)[-3:]
    unique = unique[unique_idxs].tolist()
    labels3 = [unique.index(l) if l in unique else -1 for l in labels3]

    # Use the names imported at the top of this section (read_file itself is
    # not imported by the visible import block, so read_file.load_data()
    # would raise NameError here).
    data = load_data()
    data = filter_outliers(data)