def fitSkewedNormal(rand, ax, label, alpha_hist=0.4, color_line='r'):
    """Fit a skew-normal distribution to a sample and draw it on ``ax``.

    The sample is drawn as a normalised histogram (25 equally spaced bin
    edges between the smallest and largest support value) and the fitted
    density is plotted on top of it. The legend label of the curve reports
    the mean of the fitted distribution, the support value at which the
    fitted density is largest, and the two values returned by the KS test.

    Args:
        rand: Sample passed to ``stats.skewnorm.fit``.
        ax: Matplotlib axes to draw on.
        label: Text used as the axes title.
        alpha_hist: Opacity of the histogram bars.
        color_line: Colour of the fitted density curve.

    Returns:
        Tuple ``(mean, D, p, shape, loc, scale)``: the mean of the fitted
        distribution, the first and second element of the
        ``stats.kstest`` result, and the three fitted parameters.
    """
    ax.set_title(label, fontsize=18)

    shape, loc, scale = stats.skewnorm.fit(rand)
    # Only the first value returned by getDistribution is used here
    # (presumably the support of the sample -- TODO confirm).
    support, _ = getDistribution(rand)
    density = stats.skewnorm.pdf(support, shape, loc=loc, scale=scale)

    fitted_mean = stats.skewnorm.mean(shape, loc=loc, scale=scale)
    # Support value where the fitted density peaks; the first maximum wins.
    peak_position = density.tolist().index(max(density))
    peak_x = support[peak_position]

    edges = np.linspace(min(support), max(support), 25)
    ax.hist(rand, normed=True, bins=edges, alpha=alpha_hist)

    # NOTE(review): the callable ignores its argument and always returns the
    # fitted CDF at the support points, while the "sample" handed to kstest
    # is the fitted density. Confirm that this is the intended statistic.
    def fitted_cdf(_ignored):
        return stats.skewnorm.cdf(support, shape, loc=loc, scale=scale)

    ks_result = stats.kstest(np.asarray(density), fitted_cdf)

    legend_text = ''.join([
        '$\\mu$=', str(round(fitted_mean, 2)),
        ', $\\mu^{*}$=', str(peak_x),
        '\n$D$=', str(round(ks_result[0], 2)),
        ', $p$=', str(round(ks_result[1], 2)),
    ])
    ax.plot(support,
            density,
            '-',
            color=color_line,
            linewidth=3,
            label=legend_text)

    return fitted_mean, ks_result[0], ks_result[1], shape, loc, scale
def fitPowerLaw(rand, ax, label):
    """Draw a log-binned histogram of ``rand`` with lognormal and power-law fits.

    Three things are drawn on ``ax``: the normalised histogram of the sample
    on 15 logarithmically spaced bin edges (as a marker line through the bin
    centres; the bars themselves are fully transparent), the fitted
    lognormal density, and the fitted power-law density.

    Args:
        rand: Sample to fit.
        ax: Matplotlib axes to draw on.
        label: Text used as the axes title.

    Returns:
        Tuple ``(alpha, xmin, D)`` taken from the power-law fit: its
        ``alpha`` and ``xmin`` attributes and the value of its ``KS()`` call.
    """
    ax.set_title(label, fontsize=18)

    # Log-binned histogram of the sample.
    print('Fitting lognormal...')
    support, _ = getDistribution(rand)
    log_edges = 10**np.linspace(np.log10(min(support)),
                                np.log10(max(support)), 15)
    counts, edges, _ = ax.hist(rand,
                               normed=True,
                               bins=log_edges,
                               log=True,
                               alpha=0.0)
    centres = (edges[1:] + edges[:-1]) / 2
    ax.plot(centres,
            counts,
            's-',
            color='royalblue',
            alpha=0.5,
            markersize=12,
            linewidth=2)

    # Lognormal fit; mu is the log of the fitted scale, sigma the shape.
    sigma, loc, scale = stats.lognorm.fit(rand)
    density = stats.lognorm.pdf(support, sigma, loc=loc, scale=scale)
    mu = np.log(scale)

    # NOTE(review): the callable ignores its argument and the fitted density
    # is passed as the "sample" -- confirm this is the intended KS statistic.
    def fitted_cdf(_ignored):
        return stats.lognorm.cdf(support, sigma, loc=loc, scale=scale)

    ks_result = stats.kstest(np.asarray(density), fitted_cdf)
    lognormal_text = ''.join([
        '$\\mu$=', str(round(mu, 2)),
        ' $\\sigma$=', str(round(sigma, 2)),
        ', $D$=', str(round(ks_result[0], 2)),
    ])
    ax.plot(support, density, 'k-', linewidth=4, label=lognormal_text)

    # Power-law fit with xmin pinned to the smallest support value.
    fit = powerlaw.Fit(rand, xmin=min(support), fit_method='KS')
    power_law = fit.power_law
    alpha = power_law.alpha
    xmin = power_law.xmin
    D = power_law.KS()

    power_law_text = ''.join([
        '$\\alpha$= ', str(round(alpha, 2)),
        ', $x_{min}$=', str(round(xmin, 2)),
        '\n$D$=', str(round(D, 2)),
    ])
    power_law.plot_pdf(color='r',
                       ax=ax,
                       linestyle='-',
                       linewidth=4,
                       label=power_law_text)

    ax.set_ylim([min(counts), 1.1])
    ax.set_xlim([min(support), max(edges)])

    return alpha, xmin, D
def plot_exponent(series, ax, num_of_bins, color, label):
    """Plot the distribution of ``series`` together with its binned version.

    The raw distribution is drawn as faint circles and the binned
    distribution as a labelled error-bar line located at the midpoints of
    consecutive bin edges.

    Args:
        series: Data passed to ``getDistribution`` (with ``True`` as the
            second argument).
        ax: Matplotlib axes to draw on.
        num_of_bins: Forwarded to ``getBinnedDistribution``.
        color: Matplotlib colour; also used as the prefix of the ``'o'``
            format string of the raw points.
        label: Legend label of the binned curve.
    """
    values, probabilities = getDistribution(series, True)
    edges, binned, binned_err = getBinnedDistribution(values, probabilities,
                                                      num_of_bins)

    # Raw distribution as faint markers.
    raw_format = color + 'o'
    ax.plot(values, probabilities, raw_format, alpha=0.1, markersize=7)

    # Binned distribution at the midpoints of consecutive edges.
    midpoints = (edges[1:] + edges[:-1]) / 2
    ax.errorbar(midpoints,
                binned,
                yerr=binned_err,
                color=color,
                fmt='o-',
                alpha=0.9,
                capsize=3,
                elinewidth=1,
                linewidth=3,
                label=label)
def fitPowerLaw(rand, ax, label):
    """Draw a log-binned histogram of ``rand`` and a power-law fit on ``ax``.

    The normalised histogram uses 15 logarithmically spaced bin edges and is
    shown as a marker line through the bin centres (the bars themselves are
    fully transparent). The fitted power-law density is drawn on top.

    Args:
        rand: Sample to fit.
        ax: Matplotlib axes to draw on.
        label: Text used as the axes title.

    Returns:
        Tuple ``(alpha, xmin, D)`` taken from the power-law fit: its
        ``alpha`` and ``xmin`` attributes and the value of its ``KS()`` call.
    """
    ax.set_title(label, fontsize=18)

    support, _ = getDistribution(rand)

    # Log-binned histogram of the sample (the printed message is kept as is
    # although no lognormal is fitted in this function).
    print('Fitting lognormal...')
    log_edges = 10**np.linspace(np.log10(min(support)),
                                np.log10(max(support)), 15)
    counts, edges, _ = ax.hist(rand,
                               normed=True,
                               bins=log_edges,
                               log=True,
                               alpha=0.0)
    centres = (edges[1:] + edges[:-1]) / 2
    ax.plot(centres,
            counts,
            's-',
            color='royalblue',
            alpha=0.5,
            markersize=12,
            linewidth=2)

    # Power-law fit with xmin pinned to the smallest support value.
    fit = powerlaw.Fit(rand, xmin=min(support), fit_method='KS')
    power_law = fit.power_law
    alpha = power_law.alpha
    xmin = power_law.xmin
    D = power_law.KS()

    legend_text = ''.join([
        '$\\alpha$= ', str(round(alpha, 2)),
        ', $x_{min}$=', str(round(xmin, 2)),
        '\n$D$=', str(round(D, 2)),
    ])
    power_law.plot_pdf(marker='o',
                       color='r',
                       ax=ax,
                       linestyle='-',
                       linewidth=3,
                       label=legend_text)

    ax.set_ylim([min(counts), 1.1])
    ax.set_xlim([min(support), max(edges)])

    return alpha, xmin, D
Exemplo n.º 5
0
def fitSkewedNormal(filename, ax, label, alpha_hist=0.2, color_line='r'):
    """Fit a skew-normal distribution to the values in a file and plot it.

    The file is read as one float per line. The sample is drawn as a
    normalised histogram (25 equally spaced bin edges) with the fitted
    density on top, and the y tick labels are rewritten as percentages.

    Args:
        filename: Path of a text file holding one number per line.
        ax: Matplotlib axes to draw on.
        label: Text used as the axes title.
        alpha_hist: Opacity of the histogram bars.
        color_line: Colour of the fitted density curve.

    Raises:
        ValueError: If a line of the file cannot be parsed as a float.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(filename) as handle:
        rand = np.asarray([float(line.strip()) for line in handle])

    print('Fitting normal...')
    param = stats.skewnorm.fit(rand)
    x_rand, p_rand = getDistribution(rand)
    pdf_fitted = stats.skewnorm.pdf(x_rand,
                                    param[0],
                                    loc=param[1],
                                    scale=param[2])

    mean = stats.skewnorm.mean(param[0], loc=param[1], scale=param[2])
    # Support value at which the fitted density peaks (first maximum wins).
    maxx = str(x_rand[pdf_fitted.tolist().index(max(pdf_fitted))])
    counts, bins, bars = ax.hist(rand,
                                 bins=np.linspace(min(x_rand), max(x_rand),
                                                  25),
                                 normed=True,
                                 alpha=alpha_hist)

    # NOTE(review): the callable ignores its argument and the fitted density
    # is passed as the "sample" -- confirm this is the intended KS statistic.
    sk_results = stats.kstest(
        np.asarray(pdf_fitted), lambda x: stats.skewnorm.cdf(
            x_rand, param[0], loc=param[1], scale=param[2]))
    ax.plot(x_rand,
            pdf_fitted,
            '-',
            color=color_line,
            linewidth=3,
            label='$\\mu$=' + str(round(mean, 2)) + ', $\\mu^{*}$=' + maxx +
            '\n$D$=' + str(round(sk_results[0], 2)) + ', $p$=' +
            str(round(sk_results[1], 2)))
    ax.set_title(label, fontsize=18)

    # Five ticks from 0 to the tallest bar, labelled as percentages.
    # NOTE(review): tick positions stop at max(counts) while the labels run
    # up to 1.05 * max(counts) / sum(counts) -- confirm the scaling.
    ax.set_yticks(np.linspace(0, max(counts), 5))
    ax.set_yticklabels([
        str(int(100 * y)) + '%'
        for y in np.linspace(0, 1.05 * max(counts) / (sum(counts)), 5)
    ])
Exemplo n.º 6
0
def fitAndStatsTransformedNormal(filename,
                                 ax,
                                 label,
                                 outfolder,
                                 name,
                                 statfile,
                                 filterparam,
                                 alpha_hist=0.2,
                                 color_line='r'):

    if 'log_Q' in name:
        rand = np.asarray([
            float(line.strip().split('\t')[1]) for line in open(filename)
            if len(line.strip().split('\t')) > 1
        ])
    else:
        rand = np.asarray([float(line.strip()) for line in open(filename)])

    mmin = min(rand)
    rand = [math.log(rr - mmin + 1) for rr in rand]

    print 'Fitting normal...'
    param = stats.norm.fit(rand)

    print param
    x_rand, p_rand = getDistribution(rand)
    pdf_fitted = stats.norm.pdf(x_rand, loc=param[0], scale=param[1])

    mean, variance = stats.norm.stats(loc=param[0],
                                      scale=param[1],
                                      moments='mv')
    maxx = x_rand[pdf_fitted.tolist().index(max(pdf_fitted))]
    counts, bins, bars = ax.hist(rand,
                                 bins=np.linspace(min(x_rand), max(x_rand),
                                                  25),
                                 normed=True,
                                 alpha=alpha_hist)
    D = stats.kstest(
        np.asarray(pdf_fitted),
        lambda x: stats.norm.cdf(x_rand, loc=param[0], scale=param[1]))[0]

    counts, bins, bars = ax.hist(rand,
                                 bins=np.linspace(min(x_rand), max(x_rand),
                                                  25),
                                 normed=True,
                                 alpha=alpha_hist)
    ax.plot(x_rand,
            pdf_fitted,
            '-',
            color=color_line,
            linewidth=3,
            label='D = ' + str(D) + '\nvarQ = ' + str(variance))
    ax.set_title('Normal fit')
    legend = ax.legend(loc='left', shadow=True, fontsize=20)

    print 'NORM', mean, variance

    extra = ''
    if 'release' in filename:
        extra = '_release-max'

    write_row(
        outfolder + '/' + label + '_' + name + '_tnorm_original_fit' + extra +
        '.dat', [
            str(x_rand[i]) + '\t' + str(pdf_fitted[i])
            for i in range(len(x_rand))
        ])
    write_row(
        outfolder + '/' + label + '_' + name + '_tnorm_mean_centered_fit' +
        extra + '.dat', [
            str(x_rand[i] - mean) + '\t' + str(pdf_fitted[i])
            for i in range(len(x_rand))
        ])
    write_row(
        outfolder + '/' + label + '_' + name + '_tnorm_peak_centered_fit' +
        extra + '.dat', [
            str(x_rand[i] - maxx) + '\t' + str(pdf_fitted[i])
            for i in range(len(x_rand))
        ])
    write_row(
        outfolder + '/' + label + '_' + name + '_tnorm_mean_centered_fit' +
        extra + 'sample' + '.dat', [
            str(x_rand[i] - mean) + '\t' + str(pdf_fitted[i])
            for i in range(len(x_rand))[0::filterparam]
        ])
    write_row(
        outfolder + '/' + label + '_' + name + '_tnorm_peak_centered_fit' +
        extra + 'sample' + '.dat', [
            str(x_rand[i] - maxx) + '\t' + str(pdf_fitted[i])
            for i in range(len(x_rand))[0::filterparam]
        ])
    write_row(
        outfolder + '/' + label + '_' + name + '_tnorm_original_hist_' +
        extra + '.dat', rand)

    fout = open(statfile, 'a')
    fout.write(label + '\t' + name + '\t' + str(D) + '\t' + str(mean) + '\t' +
               str(variance) + '\n')
    fout.close()
Exemplo n.º 7
0
def fitLognormal(filename,
                 ax,
                 label='',
                 out_folder='',
                 name='',
                 cutoff=-sys.maxint,
                 writeout=True,
                 noise=False,
                 norm='no'):

    rand = []

    if 'log_p' in name:
        rand = np.asarray([
            math.exp(float(line.strip())) + noise for line in open(filename)
            if float(line.strip()) > cutoff
        ])
    elif 'log_Q' in name:
        rand = np.asarray([
            math.exp(float(line.strip().split('\t')[1]))
            for line in open(filename) if len(line.strip().split('\t')) > 1
            and float(line.strip().split('\t')[1]) > cutoff
        ])
    elif noise:
        rand = np.asarray([
            float(line.strip()) + random.random() for line in open(filename)
            if float(line.strip()) > cutoff
        ])
    else:
        rand = np.asarray([
            float(line.strip()) + random.random() for line in open(filename)
            if float(line.strip()) > cutoff
        ])

    x_rand, p_rand = getDistribution(rand)

    # histogram
    nbins = 20
    counts, bins, bars = ax.hist(rand,
                                 normed=True,
                                 bins=10**np.linspace(np.log10(min(x_rand)),
                                                      np.log10(max(x_rand)),
                                                      nbins),
                                 log=True,
                                 alpha=0.0,
                                 cumulative=1)
    ax.plot((bins[1:] + bins[:-1]) / 2,
            counts,
            's-',
            color='royalblue',
            alpha=0.0,
            markersize=0,
            linewidth=5)
    bins = (bins[1:] + bins[:-1]) / 2

    # fit and plot the powerlaw
    print 'Fit and plot the lognormal...' + label
    p0 = stats.lognorm._fitstart(rand)
    p1 = stats.lognorm.fit(rand, p0[0], loc=p0[1], scale=p0[2])
    param = stats.lognorm.fit(rand, p1[0], loc=p1[1], scale=p1[2])

    ppdf_fitted = stats.lognorm.cdf(x_rand,
                                    param[0],
                                    loc=param[1],
                                    scale=param[2])
    mu = np.log(param[2])
    sigma = param[0]

    ax.set_xlabel(label, fontsize=20)
    ax.set_ylabel('CDF of ' + label, fontsize=20)

    cdf_fitted = stats.lognorm.cdf(x_rand,
                                   param[0],
                                   loc=param[1],
                                   scale=param[2])
    sk_results_norm = stats.ks_2samp(
        cdf_fitted,
        np.cumsum(p_rand))  # stats.ks_2samp(np.cumsum(p_rand), np.cumsu

    print label, '\t', norm, '\t', sk_results_norm[0]

    extra = ''
    if 'release' in filename:
        extra = '_release-max'

    if writeout:
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)

        write_row(
            out_folder + '/' + label + '_' + name + '_lognormal_pdf_' + norm +
            extra + '.dat', [
                str(x_rand[i]) + '\t' + str(ppdf_fitted[i])
                for i in range(len(x_rand))
            ])
        write_row(
            out_folder + '/' + label + '_' + name + '_lognormal_hist_pdf_' +
            str(nbins) + '_' + norm + extra + '.dat',
            [str(bins[i]) + '\t' + str(counts[i]) for i in range(len(counts))])

    return
Exemplo n.º 8
0
def fitPowerLaw(filename,
                ax,
                label='',
                out_folder='',
                name='',
                cutoff=-sys.maxint,
                writeout=True,
                noise=False,
                distancefile=''):

    rand = []

    if 'log_p' in name:
        rand = np.asarray([
            math.exp(float(line.strip())) + noise for line in open(filename)
            if float(line.strip()) > cutoff
        ])
    elif 'log_Q' in name:
        rand = np.asarray([
            math.exp(float(line.strip().split('\t')[1]))
            for line in open(filename) if len(line.strip().split('\t')) > 1
            and float(line.strip().split('\t')[1]) > cutoff
        ])
    elif noise:
        rand = np.asarray([
            float(line.strip()) + random.random() for line in open(filename)
            if float(line.strip()) > cutoff
        ])
    else:
        rand = np.asarray([
            float(line.strip()) + random.random() for line in open(filename)
            if float(line.strip()) > cutoff
        ])

    x_rand, p_rand = getDistribution(rand)
    ax.set_title(label, fontsize=18)

    # histogram
    counts, bins, bars = ax.hist(rand,
                                 normed=True,
                                 bins=10**np.linspace(np.log10(min(x_rand)),
                                                      np.log10(max(x_rand)),
                                                      1000),
                                 log=True,
                                 alpha=0.0,
                                 cumulative=1)
    ax.plot((bins[1:] + bins[:-1]) / 2,
            counts,
            's-',
            color='royalblue',
            alpha=0.7,
            markersize=0,
            linewidth=5)
    bins = (bins[1:] + bins[:-1]) / 2
    ax.set_ylim([min(counts), 1.05 * max(counts)])
    ax.set_xlim([min(x_rand), max(bins)])

    # fit and plot the powerlaw
    print 'Fit and plot the powerlaw...'
    results = powerlaw.Fit(rand, xmin=min(x_rand), fit_method='KS')
    alpha = results.power_law.alpha
    D = results.power_law.KS()
    parassms = results.power_law.plot_cdf(color='r',
                                          ax=ax,
                                          linestyle='-',
                                          linewidth=3,
                                          label='$\\alpha$= ' +
                                          str(round(alpha, 2)) + ', $D$=' +
                                          str(round(D, 2)))

    # fit and plot the powerlaw
    print 'Fit and plot the lognormal...' + label
    p0 = stats.lognorm._fitstart(rand)
    p1 = stats.lognorm.fit(rand, p0[0], loc=p0[1], scale=p0[2])
    param = stats.lognorm.fit(rand, p1[0], loc=p1[1], scale=p1[2])

    pdf_fitted = stats.lognorm.cdf(x_rand,
                                   param[0],
                                   loc=param[1],
                                   scale=param[2])
    ppdf_fitted = stats.lognorm.pdf(x_rand,
                                    param[0],
                                    loc=param[1],
                                    scale=param[2])
    mu = np.log(param[2])
    sigma = param[0]

    sk_results_norm = stats.ks_2samp(
        pdf_fitted,
        np.cumsum(p_rand))  # stats.ks_2samp(np.cumsum(p_rand), np.cumsu
    ax.plot(x_rand,
            pdf_fitted,
            'k-',
            linewidth=4,
            label='Lognormal fit, $\\mu$=' + str(round(mu, 2)) +
            '\n$\\sigma$=' + str(round(sigma, 2)) + ', $D$=' +
            str(round(sk_results_norm[0], 2)))

    print sk_results_norm, D

    ax.set_xlabel(label, fontsize=20)
    ax.set_ylabel('CDF of ' + label, fontsize=20)

    extra = ''
    if 'release' in filename:
        extra = '_release-max'

    if writeout:
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)

        xfit = parassms.lines[1].get_xdata()

        yfit = parassms.lines[1].get_ydata()
        write_row(
            out_folder + '/' + label + '_' + name + '_powerlaw_hist_' + extra +
            '.dat',
            [str(bins[i]) + '\t' + str(counts[i]) for i in range(len(counts))])
        write_row(
            out_folder + '/' + label + '_' + name + '_powerlaw_fit_' + extra +
            '.dat',
            [str(xfit[i]) + '\t' + str(yfit[i]) for i in range(len(xfit))])
        write_row(
            out_folder + '/' + label + '_' + name + '_lognormal_' + extra +
            '.dat', [
                str(x_rand[i]) + '\t' + str(pdf_fitted[i])
                for i in range(len(x_rand))
            ])
        write_row(
            out_folder + '/' + label + '_' + name + '_lognormal_pdf_' + extra +
            '.dat', [
                str(x_rand[i]) + '\t' + str(ppdf_fitted[i])
                for i in range(len(x_rand))
            ])

        f_Ddata = open(distancefile, 'a')
        f_Ddata.write(label + '\t' + str(D) + '\t' + str(sk_results_norm[0]) +
                      '\n')
        f_Ddata.close()

    return sk_results_norm[0], D
def get_imapct_distr():
    """Plot the normalised impact distributions for film and music careers.

    A 2x3 figure is filled as follows and saved to
    ``impact_distributions_normalized.png``:

    * ``[0, 0]`` average rating, ``[0, 1]`` rating count, ``[0, 2]``
      metascore, ``[1, 0]`` critic review count and ``[1, 1]`` user review
      count, one series per film profession;
    * ``[1, 2]`` play count, one series per music genre.

    Input files are read from ``ProcessedDataNormalized/
    1_impact_distributions`` (one number per line). The number of careers
    shown in some legend labels is the number of entries in the matching
    ``Data/Film`` or ``Data/Music`` directory, in thousands.
    """

    def load_rounded(path, ndigits):
        # One float per line, rounded; the handle is closed on exit.
        with open(path) as handle:
            return [round(float(line.strip()), ndigits) for line in handle]

    # ---------------------------------------------- #
    #      Film                                      #

    professions = [('director', 'k'), ('producer', 'b'), ('writer', 'r'),
                   ('composer', 'g'), ('art-director', 'y')]

    num_of_bins = 20
    title_font = 25
    seaborn.set_style('white')
    f, ax = plt.subplots(2, 3, figsize=(25, 15))
    f.suptitle("IMDb normalized impact distributions", fontsize=title_font)

    FOLDER = 'ProcessedDataNormalized'

    for (label, color) in professions:

        # Number of careers, in thousands, e.g. '12k'.
        num_car = str(
            int(
                round(
                    len(
                        os.listdir('Data/Film/film-' + label +
                                   '-simple-careers')) / 1000.0))) + 'k'

        file_avg = FOLDER + '/1_impact_distributions/film_average_ratings_dist_' + label + '.dat'
        file_cnt = FOLDER + '/1_impact_distributions/film_rating_counts_dist_' + label + '.dat'
        file_mets = FOLDER + '/1_impact_distributions/film_metascores_dist_' + label + '.dat'
        file_crit = FOLDER + '/1_impact_distributions/film_critic_review_dist_' + label + '.dat'
        file_user = FOLDER + '/1_impact_distributions/film_user_review_dist_' + label + '.dat'

        average_ratings = np.asarray(load_rounded(file_avg, 2))
        rating_counts = load_rounded(file_cnt, 2)
        metascores = load_rounded(file_mets, 1)
        critic_review = load_rounded(file_crit, 2)
        user_review = load_rounded(file_user, 2)

        # Average ratings: raw points plus binned error-bar line.
        x_average_ratings, p_average_ratings = getDistribution(
            average_ratings, True)
        bx_average_ratings, bp_average_ratings, bperr_average_ratings = getBinnedDistribution(
            x_average_ratings, p_average_ratings, num_of_bins)

        ax[0, 0].set_title('IMDb - average rating', fontsize=20)
        ax[0, 0].plot(x_average_ratings,
                      p_average_ratings,
                      color,
                      marker='o',
                      alpha=0.1,
                      linewidth=0,
                      label=label + ', ' + str(num_car))
        ax[0, 0].errorbar(
            (bx_average_ratings[1:] + bx_average_ratings[:-1]) / 2,
            bp_average_ratings,
            yerr=bperr_average_ratings,
            fmt=color + '-',
            linewidth=2)

        # Rating counts on log-log axes. The binned distribution is still
        # computed, as before, but it is not drawn.
        x_rating_counts, p_rating_counts = getDistribution(rating_counts, True)
        getBinnedDistribution(x_rating_counts, p_rating_counts, num_of_bins)

        ax[0, 1].set_title('IMDb - rating count', fontsize=20)
        ax[0, 1].set_xscale('log')
        ax[0, 1].set_yscale('log')
        ax[0, 1].plot(x_rating_counts,
                      p_rating_counts,
                      color + 'o',
                      alpha=0.8,
                      label=label)

        # Metascores: raw points plus binned error-bar line.
        x_metascores, p_metascores = getDistribution(metascores, True)
        bx_metascores, bp_metascores, bperr_metascores = getBinnedDistribution(
            x_metascores, p_metascores, num_of_bins)
        ax[0, 2].set_title('IMDb - metascores', fontsize=20)
        ax[0, 2].plot(x_metascores,
                      p_metascores,
                      color + 'o',
                      alpha=0.2,
                      label=label)
        ax[0, 2].errorbar((bx_metascores[1:] + bx_metascores[:-1]) / 2,
                          bp_metascores,
                          yerr=bperr_metascores,
                          fmt=color + '-',
                          linewidth=2)

        # Critic review counts on log-log axes.
        x_critic_review, p_critic_review = getDistribution(critic_review, True)
        ax[1, 0].set_title('IMDb - critic_review', fontsize=20)
        ax[1, 0].set_xscale('log')
        ax[1, 0].set_yscale('log')
        ax[1, 0].plot(x_critic_review,
                      p_critic_review,
                      color + 'o',
                      alpha=0.8,
                      label=label)

        # User review counts on log-log axes.
        x_user_review, p_user_review = getDistribution(user_review, True)
        ax[1, 1].set_title('IMDb - user_review', fontsize=20)
        ax[1, 1].set_xscale('log')
        ax[1, 1].set_yscale('log')
        ax[1, 1].plot(x_user_review,
                      p_user_review,
                      color + 'o',
                      alpha=0.8,
                      label=label)

    # ---------------------------------------------- #
    #      Music                                     #

    genres = [('electro', 'k'), ('pop', 'b')]

    for (genre, color) in genres:

        # Number of careers, in thousands, e.g. '12k'.
        num_mus = str(
            int(
                round(
                    len(
                        os.listdir('Data/Music/music-' + genre +
                                   '-simple-careers')) / 1000.0))) + 'k'

        file_music = FOLDER + '/1_impact_distributions/music_rating_counts_dist_' + genre + '.dat'
        with open(file_music) as handle:
            average_ratings = np.asarray(
                [round(float(line.strip())) for line in handle])
        x_rating_counts, p_rating_counts = getDistribution(
            average_ratings, True)

        print(len(average_ratings))

        ax[1, 2].set_title('Music - playcount', fontsize=20)
        ax[1, 2].set_xscale('log')
        ax[1, 2].set_yscale('log')
        ax[1, 2].plot(x_rating_counts,
                      p_rating_counts,
                      color + 'o',
                      alpha=0.2,
                      label=genre + ', ' + num_mus)

    align_plot(ax)
    plt.savefig('impact_distributions_normalized.png')
    plt.close()
Exemplo n.º 10
0
def get_career_length():
    """Plot power-law fits of career lengths for directors and DJs.

    A 1x2 figure is produced and saved to ``career_length.png``. The left
    panel uses the film career lengths of the first listed profession
    (``director``), the right panel the music career lengths of the second
    listed genre (``electro``). In both cases only careers longer than 10
    are handed to ``fitPowerLaw``. Input files hold one number per line and
    are read from ``ProcessedData/5_career_length``.
    """

    def fit_careers(path, axis, label):
        # Read one float per line; the handle is closed on exit.
        with open(path) as handle:
            career_length = [float(line.strip()) for line in handle]
        # Result unused; the call is kept so behaviour stays unchanged.
        getDistribution(career_length)
        fitPowerLaw([c for c in career_length if c > 10], axis, label)

    title_font = 25
    seaborn.set_style('white')

    f, ax = plt.subplots(1, 2, figsize=(22, 9))
    f.suptitle("Career length distributions", fontsize=title_font)

    professions = [
        ('director', 'k'),
        ('producer', 'b'),
        ('writer', 'r'),
        ('composer', 'g'),
        ('art-director', 'y'),
    ]

    # Only the first profession is plotted.
    for (label, color) in professions[0:1]:
        fit_careers(
            'ProcessedData/5_career_length/film_career_length_' + label +
            '.dat', ax[0], label)

    ax[0].set_title('Length of director careers', fontsize=18)
    # NOTE(review): this title is set before the music fit below; if
    # fitPowerLaw sets the axes title itself it will replace this one --
    # confirm which title is wanted on the right panel.
    ax[1].set_title('Length of DJ careers', fontsize=18)
    professions = [('pop', 'k'), ('electro', 'b')]

    # Only the second genre is plotted.
    for (label, color) in professions[1:2]:
        fit_careers(
            'ProcessedData/5_career_length/music_career_length_' + label +
            '.dat', ax[1], label)

    align_plot1D(ax)
    plt.savefig('career_length.png')
    plt.close()