def generate_bootstrap_histograms(data, title):
    """
    Generate histograms for the bootstrapped values.

    Parameters
    ----------
    data: dict, ex. {
                        'expert1': [ 1, 2, 1, 0, 0.5 ],
                        'expert2': [ 4, 5.5, 6, 4, 5 ]
                    }
    title: string, a short description of the distribution, used in the
        chart title and the output filename.
    """
    for expert, values in data.items():
        ex_name = "".join(char for char in expert if char not in ".,")
        filename = title + "-" + ex_name
        filename = filename.strip().lower().replace(" ", "-")
        utils.histogram(
            data=values,
            filename="charts/fantasypros/{}.png".format(filename),
            title="{} - {}".format(title, expert),
            figsize=(10, 5),
            titlesize=26,
            xsize=26,
            xlim=(-3, 3),
            small=True,
        )
        confidence = np.percentile(values, q=[2.5, 50, 97.5])
        lower, mid, upper = [round(i, 2) for i in sorted(confidence)]
        msg = "95% {}: {} +/- {} (Lower: {} Mid: {} Upper: {})"
        print(msg.format(expert, mid, (mid - lower), lower, mid, upper))
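
A hedged example call, reusing the sample data from the docstring (the title string is illustrative):

generate_bootstrap_histograms(
    data={'expert1': [1, 2, 1, 0, 0.5],
          'expert2': [4, 5.5, 6, 4, 5]},
    title='Rank Error',
)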
def histogram_matching(img, ref, bins=256):
    assert img.shape == ref.shape

    result = img.copy()
    h, w = img.shape
    pixels = h * w

    # histogram
    hist_img = histogram(img)
    hist_ref = histogram(ref)
    # cumulative histogram
    cum_img = cumulative_histogram(hist_img)
    cum_ref = cumulative_histogram(hist_ref)
    # normalization
    prob_img = cum_img / pixels
    prob_ref = cum_ref / pixels

    new_values = np.zeros(bins)
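    # Build the gray-level mapping: for each input level a, walk the
    # reference CDF down from the top and keep the lowest level whose
    # cumulative probability still exceeds prob_img[a].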
    for a in range(bins):
        j = bins - 1
        while True:
            new_values[a] = j
            j = j - 1

            if j < 0 or prob_img[a] >= prob_ref[j]:
                break

    for i in range(h):
        for j in range(w):
            a = img.item(i, j)
            b = new_values[a]
            result.itemset((i, j), b)

    return result
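
A minimal driver for histogram_matching, assuming two equal-sized 8-bit grayscale images; cv2 and the file names are illustrative, and any loader that yields 2-D uint8 arrays works:

import cv2

img = cv2.imread('input.png', cv2.IMREAD_GRAYSCALE)
ref = cv2.imread('reference.png', cv2.IMREAD_GRAYSCALE)
matched = histogram_matching(img, ref)
cv2.imwrite('matched.png', matched)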
Example #4
    def encode(self, source):
        """Koduje wejściowy ciąg danych przy pomocy wykładniczego kodu Golomba.

        Argumenty:
            source (List[int]): ciąg liczb naturalnych do zakodowania

        Zwraca:
            BitStream: strumień bitowy zawierający ciąg słów kodowych oraz opcjonalnie
                nagłówek (przy pośrednim trybie pracy kodera).
        """
        stream = BitStream()
        self._source = source
        self._hist = histogram(source)
        # Build the codebook and write it into the header (if indirect mode was selected)
        if not self._direct:
            self._codebook = self._make_codebook(stream)
        header_len = len(stream)
        # Encode the source data
        for word in source:
            self._encode_word(word, stream)
        # Compute statistics
        self._stream_len = len(stream)
        self._stream_data_len = len(stream) - header_len
        self._stats = Statistics(self)
        return stream
def generate_error_histograms(df, column, title):
    """
    Generate actual error distributions for each expert.
    Plots the distribution of the given column.
    """
    for expert in df.EXPERT.unique().tolist():
        ex_name = "".join(char for char in expert if char not in ".,")
        filename = title + "-" + ex_name
        filename = filename.strip().lower().replace(" ", "-")
        utils.histogram(
            data=df[df.EXPERT == expert][column],
            filename="charts/fantasypros/{}.png".format(filename),
            title="{} - {}".format(title, expert),
            figsize=(10, 5),
            titlesize=26,
            xsize=26,
            xlim=(-40, 40),
            small=True,
        )
Example #7
def frequency(data, column, n):
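    # Horizontal bar chart of the n most frequent values in `column`; assumes
    # histogram() returns (label, count) pairs sorted by ascending count, so
    # the last n entries are the most frequent.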
    counts = histogram(data[column].values)
    if len(counts) < n:
        n = len(counts)
    labels, x = unzip(counts)
    _, ax = plt.subplots(figsize=(10, 4))
    y = list(range(n))
    ax.barh(y, x[-n:])
    plt.yticks(y, tuple(labels[-n:]), fontsize=7)
    plt.tight_layout()
    plt.savefig("pngs/{}_frequency.png".format(column))
    plt.close()
    return counts, n
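
A hedged call, assuming data is a pandas DataFrame (the column name and n are illustrative):

counts, n = frequency(data, 'genre', 20)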
Example #8
def main():
    logging.basicConfig(level=logging.DEBUG)
    # Parser and args
    parser = create_parser()
    args = parser.parse_args()
    # Setup resources and dirs
    dest = open(args.out, 'w')
    res_dir = os.path.split(os.path.abspath(__file__))[0]
    with open(os.path.join(res_dir, 'template.html'), 'r') as f:
        template = f.read()
    output = HTMLOutput(dest, template)
    cache_dir = os.path.split(args.out)[0]
    # Use cache
    dbs = [FilmwebDatabase()]
    if not args.force:
        cache = load_cache(cache_dir, args.out)
        if cache:
            logging.info("using cache file")
            dbs = cache

    # Get movies
    movies = find_movies_info(args.dirs, dbs, output, '-rating')

    # Histogram?
    if args.histogram:
        path = os.path.join(cache_dir, '.movierank-histogram.png')
        histogram(movies, path)
        output.add_extra('histogram', path)

    # Finish
    store_cache(cache_dir, dbs, suffix=args.out)
    output.flush()

    # Run browser?
    if args.run:
        subprocess.Popen(["xdg-open", args.out],
                         stderr=subprocess.STDOUT,
                         stdout=subprocess.PIPE)
Example #10
def main():
    args = parser.parse_args()

    with open(args.data, 'r') as f:
        data_serial = f.read()

    data_json = json.loads(data_serial)

    # tag => concatenated articles
    tagged_corpus_by_articles = defaultdict(list)

    for example in data_json:
        tag = re.sub(r'\s', '_', example['tag']).lower()
        tagged_corpus_by_articles[tag].append(example['content'])

    tagged_corpus = {
        tag: histogram(' '.join(articles).split())
        for tag, articles in tagged_corpus_by_articles.items()
    }

    if not args.output:
        args.output = os.path.join(os.path.dirname(args.data), 'classifier')

    if not os.path.isdir(args.output):
        os.makedirs(args.output)

    vocab = set()

    for tag, card in tagged_corpus.items():
        filepath = os.path.join(args.output, tag)
        vocab |= set(card.keys())
        with open(filepath, 'w+') as f:
            for w, c in sorted(card.items(),
                               key=lambda wc: wc[1],
                               reverse=True):
                print("{} {}".format(c, w), file=f)
            print(file=f)

    with open(os.path.join(args.output, parameters.PRIORS_FILE), 'w+') as f:
        for tag, articles in tagged_corpus_by_articles.items():
            print("{} {}".format(len(articles), tag), file=f)

    with open(os.path.join(args.output, parameters.VOCAB_FILE), 'w+') as f:
        for w in sorted(vocab):
            print(w, file=f)
Example #11
    def __init__(self, codec=None):
        if codec:
            self._source_len = len(codec._source)
            self._entropy = entropy(codec._source)
            self._hist = codec._hist if codec._hist else histogram(codec._source)
            self._symbol_size = int(math.ceil(math.log(max(self._hist.keys()) or 1, 2)))
            self._cr = float(self._source_len) * self._symbol_size / codec._stream_len
            self._mean_code_len = float(codec._stream_data_len) / self._source_len
            self._source_size = self._symbol_size * self._source_len
            self._stream_size = codec._stream_len
        else:
            self._source_len = 0
            self._entropy = 0
            self._hist = {}
            self._cr = 0
            self._mean_code_len = 0
            self._symbol_size = 0
            self._source_size = 0
            self._stream_size = 0
    def get_shrunk_channels(self, src):
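        # Build per-pixel feature channels at 1/shrink resolution: LUV color
        # plus gradient magnitude and oriented-gradient histograms at full and
        # half scale, then two smoothed copies (regular and self-similarity).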
        shrink = self.options["shrink"]
        n_orient = self.options["n_orient"]
        grd_smooth_rad = self.options["grd_smooth_rad"]
        grd_norm_rad = self.options["grd_norm_rad"]

        luv = rgb2luv(src)
        size = (luv.shape[0] // shrink, luv.shape[1] // shrink)
        channels = [resize(luv, size)]

        for scale in [1.0, 0.5]:
            img = resize(luv, (int(luv.shape[0] * scale), int(luv.shape[1] * scale)))
            img = conv_tri(img, grd_smooth_rad)

            magnitude, orientation = gradient(img, grd_norm_rad)

            downscale = max(1, int(shrink * scale))
            hist = histogram(magnitude, orientation, downscale, n_orient)

            channels.append(resize(magnitude, size)[:, :, None])
            channels.append(resize(hist, size))

        channels = N.concatenate(channels, axis=2)

        reg_smooth_rad = self.options["reg_smooth_rad"] / float(shrink)
        ss_smooth_rad = self.options["ss_smooth_rad"] / float(shrink)

        if reg_smooth_rad > 1.0:
            reg_ch = conv_tri(channels, int(round(reg_smooth_rad)))
        else:
            reg_ch = conv_tri(channels, reg_smooth_rad)

        if ss_smooth_rad > 1.0:
            ss_ch = conv_tri(channels, int(round(ss_smooth_rad)))
        else:
            ss_ch = conv_tri(channels, ss_smooth_rad)

        return reg_ch, ss_ch
Example #14
    def train(self, data):
        self._processor.process_examples(data)

        articles_per_tag = defaultdict(list)
        for example in data:
            tag = self.normalize_tag_label(example['tag'])
            if tag in self.IGNORE_TAGS:
                continue
            articles_per_tag[tag].append(example['tokens'])

        self._ntokens_per_tag = {
            tag: histogram(token for article in articles for token in article)
            for tag, articles in articles_per_tag.items()
        }
        self._ndocs_per_tag = {
            tag: len(articles)
            for tag, articles in articles_per_tag.items()
        }
        self._ndocs = sum(self._ndocs_per_tag.values())
        self._vocab = set(t
                          for tag, tokens in self._ntokens_per_tag.items()
                          for t in tokens.keys())
        self._tags = list(self._ntokens_per_tag.keys())
        self._weights = self._compute_weights()

        for tag, tokens in self._ntokens_per_tag.items():
            total = sum(tokens.values())
            with open(
                    '/Users/bernardorufino/pastebin/classifier/{}.dat'.format(
                        tag), 'w') as f:
                for token, n in sorted(tokens.items(),
                                       key=lambda tn: tn[1],
                                       reverse=True):
                    f.write("{:<14} {:<5} {:<5.2f} {:<5.2f}\n".format(
                        token, n,
                        float(n) / total, self._weights[token]))
                f.write('\n')
Example #16
def draw_multiple_contours(img, contours, max_contours=10, approximate=False):
    # draw all of the contours that were found
    image_entropy = img.copy()
    cv2.drawContours(img, contours, -1, 255, 3)

    # sort the contours by area, largest first
    c = sorted(contours, key=cv2.contourArea, reverse=True)

    # draw a green bounding box around each large, high-entropy,
    # non-overlapping contour
    overlap_area = np.zeros((max_contours, 4))
    for i in range(max_contours):
        x, y, w, h = cv2.boundingRect(c[i])

        entropy_computed = (entropy(
            histogram(crop_image(image_entropy, (x, y, w, h)))))
        print(overlap_area)

        if entropy_computed > 7:
            if not overlap(overlap_area, (x, y, w, h), i):
                print(overlap(overlap_area, (x, y, w, h), i))
                cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

                print(x, y, w, h)
                overlap_area[i, :] = x, y, w, h
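
A hedged driver for the routine above, assuming an 8-bit grayscale image and OpenCV 4.x (the threshold value is illustrative):

_, thresh = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
draw_multiple_contours(img, contours)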
Example #17
    for C in C_list_log:
        clf = SVM.gaussian_kernel(label, data, 0.0, 80, 10**C)
        free_SV, free_SV_coef = SVM.free_SV(clf, 10**C)
        SV = SVM.get_SV(clf)
        SV_coef = SVM.get_dual_coef(clf)
        dis = SVM.cal_dis(SV, SV_coef[0], free_SV[0])
        dis_list.append(dis)
    utils.curve(C_list_log, dis_list, '14.png', 'log(C)', 'dis')
    
    # question 15
    gamma_list = [0, 1, 2, 3, 4]
    C = 0.1
    E_out_list = []
    for gamma in gamma_list:
        clf = SVM.gaussian_kernel(label, data, 0.0, 10**gamma, C)
        E_out_list.append(SVM.error_0_1(utils.which_binary(test_label, 0), test_data, clf))
    utils.curve(gamma_list, E_out_list, '15.png', 'log(gamma)', 'E_out')

    # question 16
    C = 0.1
    gamma_list = [-1, 0, 1, 2, 3]
    gamma_pick = [0, 0, 0, 0, 0]
    for i in range(100):
        val_label, val_data, train_label, train_data = utils.split_data(label, data, 1000)
        E_val_list = []
        for gamma in gamma_list:
            clf = SVM.gaussian_kernel(train_label, train_data, 0.0, 10**gamma, C)
            E_val_list.append(SVM.error_0_1(utils.which_binary(val_label, 0), val_data, clf))
        # select the gamma with the smallest validation error
        gamma_pick[E_val_list.index(min(E_val_list))] += 1
    utils.histogram(gamma_list, gamma_pick, '16.png', 'log(gamma)', '#selected')
Example #18
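    # Augment each (image, label) pair produced by the parent model with the
    # image's histogram.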
    def processInputData(self, *args):
        image, label = super(GramHistoResizeModel,
                             self).processInputData(*args)
        return image, histogram(image), label
train_lh, train_prior = naive_bayes.naive_bayes(train_data)

#=====================================
# 2j. plot and predict the movies
#
movies = [
    'Finding Nemo', 'The Matrix', 'Gone with the Wind',
    'Harry Potter and the Goblet of Fire', 'Avatar'
]
test_movies = findMovie(all_movies, movies)
for tm in test_movies:
    predicted_y = naive_bayes.predict(train_lh, train_prior, utils.bags(tm['summary']))
    minY, maxY = naive_bayes.findMinMaxY(predicted_y)
    x = []
    y = []
    for year in predicted_y:
        x.append(year)
        y.append(predicted_y[year] + abs(predicted_y[minY]))
    utils.histogram(x, y, 'Decade', 'Posterior Probability',
                    tm['title'] + ' (' + str(tm['year']) +
                    ') Histogram of Posterior Probability for each decade')
    print(tm['title'] + ' is done.', 'Predicted decade ' + str(maxY),
          'Real decade ' + str(tm['year']))

#======================================
# 2k. Accuracy measurement
#
accuracy = 0
for d in test_data:
    predicted_y = naive_bayes.predict(train_lh, train_prior, utils.bags(d['summary']))
    minY, maxY = naive_bayes.findMinMaxY(predicted_y)
    if maxY == d['year']:
        accuracy += 1

accuracy /= float(len(test_data))
print('The accuracy of the model on test data is', accuracy)
Example #20
        3.03, 1.79, 0.78, 0.82, 0.00, 0.92, 0.69, 1.07, 2.26, 0.61, 0.62, 0.00, 1.10, 0.86, 1.17, 0.48, 1.09, 0.53,
        0.94, 0.63, 0.63, 0.86, 0.68, 0.63, 0.49, 0.44, 0.33, 0.28, 0.36, 0.99, 0.49, 0.53, 0.65, 0.49, 0.73, 0.48,
        0.40, 0.90, 0.80, 0.52, 0.67, 0.94, 0.89, 0.69, 0.62, 0.84, 0.29, 0.51, 0.75, 0.52, 0.99, 0.30, 0.36, 0.48,
        0.48, 0.31, 0.38, 0.33, 0.35, 0.50, 1.31, 0.34, 0.43, 0.52, 0.32, 0.56, 0.62, 0.56, 0.79, 0.30, 0.53, 0.36,
        0.47, 0.33, 0.50, 0.63, 0.65, 0.49, 0.42, 0.34, 0.45, 0.53, 5.17, 0.63, 0.61, 0.65, 0.39, 0.53, 0.73, 0.39,
        0.39, 0.29, 0.29, 0.28, 0.47, 0.36, 0.86, 0.53, 0.50, 0.29, 0.45, 0.49, 0.44, 0.25, 0.31, 0.40, 0.63, 0.26,
        0.71, 0.58, 0.57, 0.41, 0.53, 1.16, 0.32, 0.14, 0.15, 0.23, 0.10, 0.15, 1.20, 0.52
)

# %%
# Campaign 0 (Stage 1) Stamps
# Campaign 1 (Stage 2) Pages and stamps <<<
# Campaign 2 (Stage 3) Only pages
# Campaign 4 (Stage 5) Pages and stamps <<<

utils.histogram(data=stage2b, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Stage2 - Time per image - Reproduced results', bins='auto', counter=0)

utils.histogram(data=stage3b, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Stage3 - Time per image - Reproduced results', bins='auto', counter=0)

utils.histogram(data=stage4b, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Campaign4 - Time per image', bins='auto', counter=0)

# %%

utils.histogram(data=stage2_labels, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Stage2 - Time per label', bins='auto', counter=0)

utils.histogram(data=stage3_labels, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Stage3 - Time per label', bins='auto', counter=0)
Example #21
all_movies = list(
    parse_movies.load_all_movies(os.path.join(config.baseDir,
                                              config.data_file)))

#==============================================
# 2a. PMF of P(Y)
#==============================================
pmf, data_year = cal_pmf(all_movies)
n = len(data_year)
x = []
y = []
for year, amount in pmf.items():
    x.append(year)
    y.append(float(amount) / float(n))
utils.histogram(x, y, 'Decade', 'PMF', 'PMF of P(Y)')
print('PMF of P(Y) done')

#==============================================
# 2b. PMF of P(Y|X"radio">0)
#==============================================
pmf, data_year = cal_pmf(all_movies, 'radio')
n = len(data_year)
x = []
y = []
for year, amount in pmf.items():
    x.append(year)
    y.append(float(amount) / float(n))
utils.histogram(x, y, 'Decade', 'PMF', 'PMF of P(Y|X"radio">0)')
print('PMF of P(Y|X"radio">0) done')
Example #22
'''
Created on 7 Mar 2017

Generates the histograms needed in the task
@author: jorge
'''
from utils import histogram
import networkx as nx
import matplotlib.pyplot as plt

path = 'A1-networks/'
files = ['model/ER1000k8.net', 'model/SF_1000_g2.7.net', 'model/ws1000.net', 'real/airports_UW.net']
names = ['ER1000k8', 'SF_1000_g2.7', 'ws1000', 'airports_UW']

for fname, name in zip(files, names):
    G = nx.read_pajek(path + fname)
    plt = histogram(G, log=True, norm=True, n=10)
    plt.title('Log histogram for ' + name)
    plt.savefig('log_' + name + '.png')
    plt.clf()
    plt = histogram(G, log=False, norm=True, n=10)
    plt.title('Normed histogram for ' + name)
    plt.savefig('norm_' + name + '.png')
    plt.clf()
    plt = histogram(G, log=True, norm=True, cumu=-1, n=10)
    plt.title('Cumulative log histogram for ' + name)
    plt.savefig('Cumu_log_' + name + '.png')
    plt.clf()
    plt = histogram(G, log=False, norm=True, cumu=-1, n=10)
    plt.title('Cumulative normed histogram for ' + name)
    plt.savefig('Cumu_norm_' + name + '.png')
    plt.clf()