def generate_bootstrap_histograms(data, title):
    """
    Generate histograms for the bootstrapped values.

    Parameters
    ----------
    data: dict, ex. {
                        'expert1': [ 1, 2, 1, 0, 0.5 ],
                        'expert2': [ 4, 5.5, 6, 4, 5 ]
                    }
    title: string, a title of what the distribution is. duh.
    """
    for expert, values in data.iteritems():
        ex_name = "".join(char for char in expert if char not in ".,")
        filename = title + "-" + ex_name
        filename = filename.strip().lower().replace(" ", "-")
        utils.histogram(
            data=values,
            filename="charts/fantasypros/{}.png".format(filename),
            title="{} - {}".format(title, expert),
            figsize=(10, 5),
            titlesize=26,
            xsize=26,
            xlim=(-3, 3),
            small=True,
        )
        confidence = np.percentile(values, q=[2.5, 50, 97.5])
        lower, mid, upper = [round(i, 2) for i in sorted(confidence)]
        msg = "95% {}: {} +/- {} (Lower: {} Mid: {} Upper: {})"
        print(msg.format(expert, mid, (mid - lower), lower, mid, upper))
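# A minimal, self-contained sketch of the 95% percentile interval printed above,
# using NumPy only; the bootstrap samples below are synthetic and stand in for
# one expert's values.
import numpy as np

values = np.random.default_rng(0).normal(loc=0.2, scale=1.0, size=1000)
lower, mid, upper = np.percentile(values, q=[2.5, 50, 97.5])
print("95%: {:.2f} (Lower: {:.2f} Mid: {:.2f} Upper: {:.2f})".format(mid, lower, mid, upper))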
def histogram_matching(img, ref, bins=256):
    """Match the grey-level histogram of `img` to that of `ref` via their CDFs."""
    assert img.shape == ref.shape, "img and ref must have the same shape"

    result = img.copy()
    h, w = img.shape
    pixels = h * w

    # histogram
    hist_img = histogram(img)
    hist_ref = histogram(ref)
    # cumulative histogram
    cum_img = cumulative_histogram(hist_img)
    cum_ref = cumulative_histogram(hist_ref)
    # normalization
    prob_img = cum_img / pixels
    prob_ref = cum_ref / pixels

    # Build the grey-level mapping: for each source level a, walk the reference
    # CDF from the top down until it drops to (or below) prob_img[a]; the level
    # just above that point becomes new_values[a].
    new_values = np.zeros(bins)
    for a in range(bins):
        j = bins - 1
        while True:
            new_values[a] = j
            j = j - 1

            if j < 0 or prob_img[a] >= prob_ref[j]:
                break

    for i in range(h):
        for j in range(w):
            a = img.item(i, j)
            b = new_values[a]
            result.itemset((i, j), b)

    return result
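# The same CDF-matching idea written compactly with NumPy. This is a stand-in
# sketch only: it assumes 8-bit-style integer images and replaces the module's
# histogram()/cumulative_histogram() helpers with np.histogram/np.cumsum.
import numpy as np

def histogram_matching_np(img, ref, bins=256):
    hist_img, _ = np.histogram(img, bins=bins, range=(0, bins))
    hist_ref, _ = np.histogram(ref, bins=bins, range=(0, bins))
    cdf_img = np.cumsum(hist_img) / img.size
    cdf_ref = np.cumsum(hist_ref) / ref.size
    # For each source level, take the reference level where the CDF first reaches it.
    mapping = np.searchsorted(cdf_ref, cdf_img)
    return mapping[img]

rng = np.random.default_rng(0)
img = rng.integers(0, 128, size=(64, 64))
ref = rng.integers(64, 256, size=(64, 64))
print(histogram_matching_np(img, ref).max())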
Example #4
File: codec.py Project: zbanach/koda
    def encode(self, source):
        """Koduje wejściowy ciąg danych przy pomocy wykładniczego kodu Golomba.

        Argumenty:
            source (List[int]): ciąg liczb naturalnych do zakodowania

        Zwraca:
            BitStream: strumień bitowy zawierający ciąg słów kodowych oraz opcjonalnie
                nagłówek (przy pośrednim trybie pracy kodera).
        """
        stream = BitStream()
        self._source = source
        self._hist = histogram(source)
        # Build the codebook and store it in the header (if indirect mode was selected)
        if not self._direct:
            self._codebook = self._make_codebook(stream)
        header_len = len(stream)
        # Encode the source data
        for word in source:
            self._encode_word(word, stream)
        # Compute statistics
        self._stream_len = len(stream)
        self._stream_data_len = len(stream) - header_len
        self._stats = Statistics(self)
        return stream
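# For context: order-0 exponential-Golomb coding writes n + 1 in binary and
# prefixes it with one zero per bit beyond the first. The project's BitStream,
# codebook and mode handling are not reproduced here; this is only a minimal,
# self-contained sketch of the code words themselves.
def exp_golomb_order0(n):
    """Return the order-0 exp-Golomb code word for n >= 0 as a bit string."""
    binary = bin(n + 1)[2:]
    return '0' * (len(binary) - 1) + binary

# 0 -> '1', 1 -> '010', 2 -> '011', 3 -> '00100', 4 -> '00101'
print([exp_golomb_order0(n) for n in range(5)])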
def generate_error_histograms(df, column, title):
    """
    Generate actual error distributions for each expert.
    Plots the distribution of the given column.
    """
    for expert in df.EXPERT.unique().tolist():
        ex_name = "".join(char for char in expert if char not in ".,")
        filename = title + "-" + ex_name
        filename = filename.strip().lower().replace(" ", "-")
        utils.histogram(
            data=df[df.EXPERT == expert][column],
            filename="charts/fantasypros/{}.png".format(filename),
            title="{} - {}".format(title, expert),
            figsize=(10, 5),
            titlesize=26,
            xsize=26,
            xlim=(-40, 40),
            small=True,
        )
Example #7
def frequency(data, column, n):
    counts = histogram(data[column].values)
    if len(counts) < n:
        n = len(counts)
    labels, x = unzip(counts)
    _, ax = plt.subplots(figsize=(10, 4))
    y = list(range(n))
    ax.barh(y, x[-n:])
    plt.yticks(y, tuple(labels[-n:]), fontsize=7)
    plt.tight_layout()
    plt.savefig("pngs/{}_frequency.png".format(column))
    plt.close()
    return counts, n
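# A self-contained stand-in for the same "top-n counts as a horizontal bar chart"
# idea, using collections.Counter instead of the project's histogram()/unzip()
# helpers; the values and output filename are made up for illustration.
from collections import Counter
import matplotlib
matplotlib.use("Agg")  # render off-screen
import matplotlib.pyplot as plt

values = ["a", "b", "a", "c", "a", "b"]
counts = Counter(values).most_common()      # [('a', 3), ('b', 2), ('c', 1)]
labels, x = zip(*reversed(counts))          # smallest count first, like x[-n:]
y = list(range(len(labels)))
_, ax = plt.subplots(figsize=(10, 4))
ax.barh(y, x)
plt.yticks(y, labels, fontsize=7)
plt.tight_layout()
plt.savefig("value_frequency.png")
plt.close()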
Example #8
def main():
    logging.basicConfig(level=logging.DEBUG)
    # Parser and args
    parser = create_parser()
    args = parser.parse_args()
    # Setup resources and dirs
    dest = open(args.out, 'w')
    res_dir = os.path.split(os.path.abspath(__file__))[0]
    template = open(os.path.join(res_dir, 'template.html'), 'r').read()
    output = HTMLOutput(dest, template)
    cache_dir = os.path.split(args.out)[0]
    # Use cache
    dbs = [FilmwebDatabase()]
    if not args.force:
        cache = load_cache(cache_dir, args.out)
        if cache:
            logging.info("using cache file")
            dbs = cache

    # Get movies
    movies = find_movies_info(args.dirs, dbs, output, '-rating')

    # Histogram?
    if args.histogram:
        path = os.path.join(cache_dir, '.movierank-histogram.png')
        histogram(movies, path)
        output.add_extra('histogram', path)

    # Finish
    store_cache(cache_dir, dbs, suffix=args.out)
    output.flush()

    # Run browser?
    if args.run:
        subprocess.Popen(["xdg-open", args.out],
                         stderr=subprocess.STDOUT,
                         stdout=subprocess.PIPE)
Example #10
def main():
    args = parser.parse_args()

    with open(args.data, 'r') as f:
        data_serial = f.read()

    data_json = json.loads(data_serial)

    # tag => concatenated articles
    tagged_corpus_by_articles = defaultdict(list)

    for example in data_json:
        tag = re.sub(r'\s', '_', example['tag']).lower()
        tagged_corpus_by_articles[tag].append(example['content'])

    tagged_corpus = {
        tag: histogram(' '.join(articles).split())
        for tag, articles in tagged_corpus_by_articles.items()
    }

    if not args.output:
        args.output = os.path.join(os.path.dirname(args.data), 'classifier')

    if not os.path.isdir(args.output):
        os.makedirs(args.output)

    vocab = set()

    for tag, card in tagged_corpus.items():
        filepath = os.path.join(args.output, tag)
        vocab |= set(card.keys())
        with open(filepath, 'w+') as f:
            for w, c in sorted(card.items(),
                               key=lambda wc: wc[1],
                               reverse=True):
                print("{} {}".format(c, w), file=f)
            print(file=f)

    with open(os.path.join(args.output, parameters.PRIORS_FILE), 'w+') as f:
        for tag, articles in tagged_corpus_by_articles.items():
            print("{} {}".format(len(articles), tag), file=f)

    with open(os.path.join(args.output, parameters.VOCAB_FILE), 'w+') as f:
        for w in sorted(vocab):
            print(w, file=f)
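# The histogram() call above is essentially a word count per tag; a minimal
# stand-in with collections.Counter (the tag name and articles are invented).
from collections import Counter, defaultdict

articles_by_tag = defaultdict(list)
articles_by_tag['python'].append('histogram of words in an article')
articles_by_tag['python'].append('another article about words')

tagged_corpus = {
    tag: Counter(' '.join(articles).split())
    for tag, articles in articles_by_tag.items()
}
print(tagged_corpus['python'].most_common(3))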
Example #11
File: codec.py Project: zbanach/koda
    def __init__(self, codec=None):
        if codec:
            self._source_len = len(codec._source)
            self._entropy = entropy(codec._source)
            self._hist = codec._hist if codec._hist else histogram(codec._source)
            self._symbol_size = int(math.ceil(math.log(max(self._hist.keys()) or 1, 2)))
            self._cr = float(self._source_len) * self._symbol_size / codec._stream_len
            self._mean_code_len = float(codec._stream_data_len) / self._source_len
            self._source_size = self._symbol_size * self._source_len
            self._stream_size = codec._stream_len
        else:
            self._source_len = 0
            self._entropy = 0
            self._hist = {}
            self._cr = 0
            self._mean_code_len = 0
            self._symbol_size = 0
            self._source_size = 0
            self._stream_size = 0
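# A quick worked example of the ratios computed in __init__ above, with invented
# numbers: 1000 symbols at 8 bits each, coded into a 5200-bit stream that
# includes a 200-bit header.
source_len, symbol_size = 1000, 8
stream_len, header_len = 5200, 200

source_size = source_len * symbol_size                  # 8000 bits
cr = source_size / stream_len                           # ~1.54 compression ratio
mean_code_len = (stream_len - header_len) / source_len  # 5.0 bits per symbol
print(cr, mean_code_len)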
    def get_shrunk_channels(self, src):
        shrink = self.options["shrink"]
        n_orient = self.options["n_orient"]
        grd_smooth_rad = self.options["grd_smooth_rad"]
        grd_norm_rad = self.options["grd_norm_rad"]

        luv = rgb2luv(src)
        size = (luv.shape[0] // shrink, luv.shape[1] // shrink)
        channels = [resize(luv, size)]

        for scale in [1.0, 0.5]:
            img = resize(luv, (int(luv.shape[0] * scale), int(luv.shape[1] * scale)))
            img = conv_tri(img, grd_smooth_rad)

            magnitude, orientation = gradient(img, grd_norm_rad)

            downscale = max(1, int(shrink * scale))
            hist = histogram(magnitude, orientation, downscale, n_orient)

            channels.append(resize(magnitude, size)[:, :, None])
            channels.append(resize(hist, size))

        channels = N.concatenate(channels, axis=2)

        reg_smooth_rad = self.options["reg_smooth_rad"] / float(shrink)
        ss_smooth_rad = self.options["ss_smooth_rad"] / float(shrink)

        if reg_smooth_rad > 1.0:
            reg_ch = conv_tri(channels, int(round(reg_smooth_rad)))
        else:
            reg_ch = conv_tri(channels, reg_smooth_rad)

        if ss_smooth_rad > 1.0:
            ss_ch = conv_tri(channels, int(round(ss_smooth_rad)))
        else:
            ss_ch = conv_tri(channels, ss_smooth_rad)

        return reg_ch, ss_ch
Example #14
    def train(self, data):
        self._processor.process_examples(data)

        articles_per_tag = defaultdict(lambda: [])
        for example in data:
            tag = self.normalize_tag_label(example['tag'])
            if tag in self.IGNORE_TAGS:
                continue
            articles_per_tag[tag].append(example['tokens'])

        self._ntokens_per_tag = {
            tag: histogram(token for article in articles for token in article)
            for tag, articles in articles_per_tag.items()
        }
        self._ndocs_per_tag = {
            tag: len(articles)
            for tag, articles in articles_per_tag.items()
        }
        self._ndocs = sum(self._ndocs_per_tag.values())
        self._vocab = set(t
                          for tag, tokens in self._ntokens_per_tag.items()
                          for t in tokens.keys())
        self._tags = list(self._ntokens_per_tag.keys())
        self._weights = self._compute_weights()

        for tag, tokens in self._ntokens_per_tag.items():
            total = sum(tokens.values())
            with open(
                    '/Users/bernardorufino/pastebin/classifier/{}.dat'.format(
                        tag), 'w') as f:
                for token, n in sorted(tokens.items(),
                                       key=lambda tn: tn[1],
                                       reverse=True):
                    f.write("{:<14} {:<5} {:<5.2f} {:<5.2f}\n".format(
                        token, n,
                        float(n) / total, self._weights[token]))
                f.write('\n')
Example #16
def dram_multiple_contours(img, contours, max_contours=10, approximate=False):
    # draw in blue all the contours that were found
    image_entropy = img.copy()
    cv2.drawContours(img, contours, -1, 255, 3)

    # sort the contours by area, largest first
    c = sorted(contours, key=cv2.contourArea, reverse=True)

    # draw bounding boxes (in green) for the largest high-entropy, non-overlapping contours
    overlap_area = np.zeros((max_contours, 4))
    for i in range(min(max_contours, len(c))):
        x, y, w, h = cv2.boundingRect(c[i])

        entropy_computed = (entropy(
            histogram(crop_image(image_entropy, (x, y, w, h)))))
        print(overlap_area)

        if entropy_computed > 7:
            if not overlap(overlap_area, (x, y, w, h), i):
                print(overlap(overlap_area, (x, y, w, h), i))
                cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

                print(x, y, w, h)
                overlap_area[i, :] = x, y, w, h
Example #17
    for C in C_list_log:
        clf = SVM.gaussian_kernel(label, data, 0.0, 80, 10**C)
        free_SV, free_SV_coef = SVM.free_SV(clf, 10**C)
        SV = SVM.get_SV(clf)
        SV_coef = SVM.get_dual_coef(clf)
        dis = SVM.cal_dis(SV, SV_coef[0], free_SV[0])
        dis_list.append(dis)
    utils.curve(C_list_log, dis_list, '14.png', 'log(C)', 'dis')
    
    # question 15
    gamma_list = [0, 1, 2, 3, 4]
    C = 0.1
    E_out_list = []
    for gamma in gamma_list:
        clf = SVM.gaussian_kernel(label, data, 0.0, 10**gamma, C)
        E_out_list.append(SVM.error_0_1(utils.which_binary(test_label, 0), test_data, clf))
    utils.curve(gamma_list, E_out_list, '15.png', 'log(gamma)', 'E_out')

    # question 16
    C = 0.1
    gamma_list = [-1, 0, 1, 2, 3]
    gamma_pick = [0, 0, 0, 0, 0]
    for i in range(100):
        val_label, val_data, train_label, train_data = utils.split_data(label, data, 1000)
        E_val_list = []
        for gamma in gamma_list:
            clf = SVM.gaussian_kernel(train_label, train_data, 0.0, 10**gamma, C)
            E_val_list.append(SVM.error_0_1(utils.which_binary(val_label, 0), val_data, clf))
        gamma_pick[E_val_list.index(max(E_val_list))] += 1
    utils.histogram(gamma_list, gamma_pick, '16.png', 'log(gamma)', '#selected')
Example #18
    def processInputData(self, *args):
        image, label = super(GramHistoResizeModel,
                             self).processInputData(*args)
        return image, histogram(image), label
train_lh, train_prior = naive_bayes.naive_bayes(train_data)

#=====================================
# 2j. plot and predict the movies
#
movies = ['Finding Nemo', 'The Matrix', 'Gone with the Wind', 'Harry Potter and the Goblet of Fire', 'Avatar']
test_movies = findMovie(all_movies, movies)
for tm in test_movies:
    predicted_y = naive_bayes.predict(train_lh, train_prior, utils.bags(tm['summary']))
    minY, maxY = naive_bayes.findMinMaxY(predicted_y)
    x = []
    y = []
    for year in predicted_y:
        x.append(year)
        y.append(predicted_y[year]+abs(predicted_y[minY]))
    utils.histogram(x, y, 'Decade', 'Posterior Probability', tm['title']+' ('+str(tm['year'])+') Histogram of Posterior Probability for each decade')
    print(tm['title'] + ' is done.', 'Predicted decade ' + str(maxY), 'Real decade ' + str(tm['year']))

#======================================
# 2k. Accuracy measurement
#
accuracy = 0
for d in test_data:
    predicted_y = naive_bayes.predict(train_lh, train_prior, utils.bags(d['summary']))
    minY, maxY = naive_bayes.findMinMaxY(predicted_y)
    if maxY == d['year']:
        accuracy += 1

accuracy /= float(len(test_data))
print('The accuracy of the model on test data is', accuracy)
Example #20
File: main.py Project: julianx/histograms
        3.03, 1.79, 0.78, 0.82, 0.00, 0.92, 0.69, 1.07, 2.26, 0.61, 0.62, 0.00, 1.10, 0.86, 1.17, 0.48, 1.09, 0.53,
        0.94, 0.63, 0.63, 0.86, 0.68, 0.63, 0.49, 0.44, 0.33, 0.28, 0.36, 0.99, 0.49, 0.53, 0.65, 0.49, 0.73, 0.48,
        0.40, 0.90, 0.80, 0.52, 0.67, 0.94, 0.89, 0.69, 0.62, 0.84, 0.29, 0.51, 0.75, 0.52, 0.99, 0.30, 0.36, 0.48,
        0.48, 0.31, 0.38, 0.33, 0.35, 0.50, 1.31, 0.34, 0.43, 0.52, 0.32, 0.56, 0.62, 0.56, 0.79, 0.30, 0.53, 0.36,
        0.47, 0.33, 0.50, 0.63, 0.65, 0.49, 0.42, 0.34, 0.45, 0.53, 5.17, 0.63, 0.61, 0.65, 0.39, 0.53, 0.73, 0.39,
        0.39, 0.29, 0.29, 0.28, 0.47, 0.36, 0.86, 0.53, 0.50, 0.29, 0.45, 0.49, 0.44, 0.25, 0.31, 0.40, 0.63, 0.26,
        0.71, 0.58, 0.57, 0.41, 0.53, 1.16, 0.32, 0.14, 0.15, 0.23, 0.10, 0.15, 1.20, 0.52
)

# %%
# Campaign 0 (Stage 1) Stamps
# Campaign 1 (Stage 2) Pages and stamps <<<
# Campaign 2 (Stage 3) Only pages
# Campaign 4 (Stage 5) Pages and stamps <<<

utils.histogram(data=stage2b, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Stage2 - Time per image - Reproduced results', bins='auto', counter=0)

utils.histogram(data=stage3b, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Stage3 - Time per image - Reproduced results', bins='auto', counter=0)

utils.histogram(data=stage4b, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Campaign4 - Time per image', bins='auto', counter=0)

# %%

utils.histogram(data=stage2_labels, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Stage2 - Time per label', bins='auto', counter=0)

utils.histogram(data=stage3_labels, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Stage3 - Time per label', bins='auto', counter=0)
Example #21
all_movies = list(
    parse_movies.load_all_movies(os.path.join(config.baseDir,
                                              config.data_file)))

#==============================================
# 2a. PMF of P(Y)
#==============================================
pmf, data_year = cal_pmf(all_movies)
n = len(data_year)
x = []
y = []
for year, amount in pmf.items():
    x.append(year)
    y.append(float(amount) / float(n))
utils.histogram(x, y, 'Decade', 'PMF', 'PMF of P(Y)')
print('PMF of P(Y) done')

#==============================================
# 2b. PMF of P(Y|X"radio">0)
#==============================================
pmf, data_year = cal_pmf(all_movies, 'radio')
n = len(data_year)
x = []
y = []
for year, amount in pmf.items():
    x.append(year)
    y.append(float(amount) / float(n))
utils.histogram(x, y, 'Decade', 'PMF', 'PMF of P(Y|X"radio">0)')
print('PMF of P(Y|X"radio">0) done')
Example #22
Created on 7 mar. 2017

Generates the histograms needed in the task
@author: jorge
'''
from utils import histogram
import networkx as nx
import matplotlib.pyplot as plt

path='A1-networks/'
files=['model/ER1000k8.net', 'model/SF_1000_g2.7.net', 'model/ws1000.net', 'real/airports_UW.net']
names=['ER1000k8', 'SF_1000_g2.7', 'ws1000', 'airports_UW']

for i in range(len(files)):
    G=nx.read_pajek(path+files[i])
    plt=histogram(G, log=True, norm=True, n=10)
    plt.title('Log histogram for '+names[i])
    plt.savefig('log_'+names[i]+'.png')
    plt.clf()
    plt=histogram(G, log=False, norm=True, n=10)
    plt.title('Normed histogram for '+ names[i])
    plt.savefig('norm_'+names[i]+'.png')
    plt.clf()
    plt=histogram(G, log=True, norm=True,cumu=-1, n=10)
    plt.title('Cumulative log histogram for '+names[i])
    plt.savefig('Cumu_log_'+names[i]+'.png')
    plt.clf()
    plt=histogram(G, log=False, norm=True,cumu=-1, n=10)
    plt.title('Cumulative normed histogram for '+ names[i])
    plt.savefig('Cumu_norm_'+names[i]+'.png')
    plt.clf()
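# The histogram() above comes from the task's own utils module (its log/norm/
# cumu/n options are not shown here). For comparison, a self-contained sketch
# of a plain normalized degree histogram with networkx and matplotlib on a
# synthetic graph roughly matching ER1000k8 (n = 1000, <k> ~ 8):
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import networkx as nx

G = nx.erdos_renyi_graph(1000, 8 / 999)
degrees = [d for _, d in G.degree()]
plt.hist(degrees, bins=10, density=True)
plt.title('Normalized degree histogram for ER1000k8 (synthetic)')
plt.savefig('norm_ER1000k8_synthetic.png')
plt.clf()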