def generate_bootstrap_histograms(data, title):
    """
    Generate histograms for the bootstrapped values.

    Parameters
    ----------
    data: dict, e.g. {'expert1': [1, 2, 1, 0, 0.5],
                      'expert2': [4, 5.5, 6, 4, 5]}
    title: string, a title describing the distribution.
    """
    for expert, values in data.iteritems():
        # Build a filesystem-friendly file name from the title and expert name.
        ex_name = "".join(char for char in expert if char not in ".,")
        filename = title + "-" + ex_name
        filename = filename.strip().lower().replace(" ", "-")
        utils.histogram(
            data=values,
            filename="charts/fantasypros/{}.png".format(filename),
            title="{} - {}".format(title, expert),
            figsize=(10, 5),
            titlesize=26,
            xsize=26,
            xlim=(-3, 3),
            small=True,
        )
        # 95% bootstrap confidence interval from the 2.5th/50th/97.5th percentiles.
        confidence = np.percentile(values, q=[2.5, 50, 97.5])
        lower, mid, upper = [round(i, 2) for i in sorted(confidence)]
        msg = "95% {}: {} +/- {} (Lower: {} Mid: {} Upper: {})"
        print(msg.format(expert, mid, (mid - lower), lower, mid, upper))
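# NOTE: `utils.histogram` is not defined in this snippet. The helper below is a
# hypothetical, minimal matplotlib-based sketch consistent with the keyword
# arguments used above (the meaning of `small` is a guess); it is not the
# project's actual implementation.
import matplotlib.pyplot as plt

def histogram(data, filename, title, figsize=(10, 5), titlesize=26,
              xsize=26, xlim=None, small=False):
    # Plot a basic histogram of `data` and save it to `filename`.
    fig, ax = plt.subplots(figsize=figsize)
    ax.hist(data, bins="auto")
    ax.set_title(title, fontsize=titlesize)
    ax.tick_params(axis="x", labelsize=xsize // 2 if small else xsize)
    if xlim is not None:
        ax.set_xlim(xlim)
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)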
def histogram_matching(img, ref, bins=256):
    assert img.shape == ref.shape
    result = img.copy()
    h, w = img.shape
    pixels = h * w

    # histogram
    hist_img = histogram(img)
    hist_ref = histogram(ref)

    # cumulative histogram
    cum_img = cumulative_histogram(hist_img)
    cum_ref = cumulative_histogram(hist_ref)

    # normalization
    prob_img = cum_img / pixels
    prob_ref = cum_ref / pixels

    new_values = np.zeros(bins)
    for a in range(bins):
        j = bins - 1
        while True:
            new_values[a] = j
            j = j - 1
            if j < 0 or prob_img[a] >= prob_ref[j]:
                break

    for i in range(h):
        for j in range(w):
            a = img.item(i, j)
            b = new_values[a]
            result.itemset((i, j), b)

    return result
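# NOTE: the `histogram` and `cumulative_histogram` helpers used above are not
# shown. A minimal sketch, assuming 8-bit grayscale NumPy images, might be:
import numpy as np

def histogram(img, bins=256):
    # Count how many pixels fall into each gray level.
    hist = np.zeros(bins)
    for value in img.ravel():
        hist[value] += 1
    return hist

def cumulative_histogram(hist):
    # Running sum of the per-level counts.
    return np.cumsum(hist)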
def encode(self, source):
    """Encode the input data sequence with an exponential Golomb code.

    Arguments:
        source (List[int]): sequence of natural numbers to encode

    Returns:
        BitStream: bit stream containing the sequence of codewords and,
        optionally, a header (when the codec runs in indirect mode).
    """
    stream = BitStream()
    self._source = source
    self._hist = histogram(source)
    # Create the codebook and store it in the header (indirect mode only)
    if not self._direct:
        self._codebook = self._make_codebook(stream)
    header_len = len(stream)
    # Encode the source data
    for word in source:
        self._encode_word(word, stream)
    # Compute statistics
    self._stream_len = len(stream)
    self._stream_data_len = len(stream) - header_len
    self._stats = Statistics(self)
    return stream
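# NOTE: the codeword construction is delegated to `_encode_word`, which is not
# shown here. For reference, a standalone sketch of an order-0 exponential
# Golomb codeword (the codec may use a different order or an adaptive codebook):
def exp_golomb_bits(x):
    # Write x + 1 in binary, preceded by (bit-length - 1) zero bits.
    binary = bin(x + 1)[2:]
    return "0" * (len(binary) - 1) + binary

# exp_golomb_bits(0) -> '1', exp_golomb_bits(3) -> '00100'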
def generate_error_histograms(df, column, title):
    """
    Generate actual error distributions for each expert.

    Plots the distribution of the given column.
    """
    for expert in df.EXPERT.unique().tolist():
        ex_name = "".join(char for char in expert if char not in ".,")
        filename = title + "-" + ex_name
        filename = filename.strip().lower().replace(" ", "-")
        utils.histogram(
            data=df[df.EXPERT == expert][column],
            filename="charts/fantasypros/{}.png".format(filename),
            title="{} - {}".format(title, expert),
            figsize=(10, 5),
            titlesize=26,
            xsize=26,
            xlim=(-40, 40),
            small=True,
        )
def frequency(data, column, n):
    counts = histogram(data[column].values)
    if len(counts) < n:
        n = len(counts)
    labels, x = unzip(counts)
    _, ax = plt.subplots(figsize=(10, 4))
    y = list(range(n))
    ax.barh(y, x[-n:])
    plt.yticks(y, tuple(labels[-n:]), fontsize=7)
    plt.tight_layout()
    plt.savefig("pngs/{}_frequency.png".format(column))
    plt.close()
    return counts, n
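# NOTE: `unzip` is not shown. Presumably it splits the (label, count) pairs
# returned by `histogram` into parallel sequences; a minimal sketch under that
# assumption:
def unzip(pairs):
    # [('a', 1), ('b', 2)] -> (('a', 'b'), (1, 2))
    labels, counts = zip(*pairs)
    return labels, counts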
def main():
    logging.basicConfig(level=logging.DEBUG)

    # Parser and args
    parser = create_parser()
    args = parser.parse_args()

    # Setup resources and dirs
    dest = open(args.out, 'w')
    res_dir = os.path.split(os.path.abspath(__file__))[0]
    template = open(os.path.join(res_dir, 'template.html'), 'r').read()
    output = HTMLOutput(dest, template)
    cache_dir = os.path.split(args.out)[0]

    # Use cache
    dbs = [FilmwebDatabase()]
    if not args.force:
        cache = load_cache(cache_dir, args.out)
        if cache:
            logging.info("using cache file")
            dbs = cache

    # Get movies
    movies = find_movies_info(args.dirs, dbs, output, '-rating')

    # Histogram?
    if args.histogram:
        path = os.path.join(cache_dir, '.movierank-histogram.png')
        histogram(movies, path)
        output.add_extra('histogram', path)

    # Finish
    store_cache(cache_dir, dbs, suffix=args.out)
    output.flush()

    # Run browser?
    if args.run:
        subprocess.Popen(["xdg-open", args.out],
                         stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
def main():
    args = parser.parse_args()
    with open(args.data, 'r') as f:
        data_serial = f.read()
    data_json = json.loads(data_serial)

    # tag => concatenated articles
    tagged_corpus_by_articles = defaultdict(lambda: [])
    for example in data_json:
        tag = re.sub('\s', '_', example['tag']).lower()
        tagged_corpus_by_articles[tag].append(example['content'])
    tagged_corpus = {
        tag: histogram(' '.join(articles).split())
        for tag, articles in tagged_corpus_by_articles.iteritems()
    }

    if not args.output:
        args.output = os.path.join(os.path.dirname(args.data), 'classifier')
    if not os.path.isdir(args.output):
        os.makedirs(args.output)

    vocab = set()
    for tag, card in tagged_corpus.iteritems():
        filepath = os.path.join(args.output, tag)
        vocab |= set(card.keys())
        with open(filepath, 'w+') as f:
            for w, c in sorted(card.iteritems(), key=lambda (w, c): c, reverse=True):
                print >> f, "{} {}".format(c, w)
            print >> f

    with open(os.path.join(args.output, parameters.PRIORS_FILE), 'w+') as f:
        for tag, articles in tagged_corpus_by_articles.iteritems():
            print >> f, "{} {}".format(len(articles), tag)

    with open(os.path.join(args.output, parameters.VOCAB_FILE), 'w+') as f:
        for w in sorted(vocab):
            print >> f, w
def __init__(self, codec=None):
    if codec:
        self._source_len = len(codec._source)
        self._entropy = entropy(codec._source)
        self._hist = codec._hist if codec._hist else histogram(codec._source)
        # Number of bits needed to represent the largest source symbol.
        self._symbol_size = int(math.ceil(math.log(max(self._hist.keys()) or 1, 2)))
        self._cr = float(self._source_len) * self._symbol_size / codec._stream_len
        self._mean_code_len = float(codec._stream_data_len) / self._source_len
        self._source_size = self._symbol_size * self._source_len
        self._stream_size = codec._stream_len
    else:
        self._source_len = 0
        self._entropy = 0
        self._hist = {}
        self._cr = 0
        self._mean_code_len = 0
        self._symbol_size = 0
        self._source_size = 0
        self._stream_size = 0
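# NOTE: the `entropy` helper referenced above is not included. A plain Shannon
# entropy in bits per symbol, which is an assumption about its behaviour, can
# be sketched as:
import math
from collections import Counter

def entropy(source):
    counts = Counter(source)
    total = float(len(source))
    return -sum((c / total) * math.log(c / total, 2) for c in counts.values())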
def get_shrunk_channels(self, src):
    shrink = self.options["shrink"]
    n_orient = self.options["n_orient"]
    grd_smooth_rad = self.options["grd_smooth_rad"]
    grd_norm_rad = self.options["grd_norm_rad"]

    luv = rgb2luv(src)
    size = (luv.shape[0] / shrink, luv.shape[1] / shrink)
    channels = [resize(luv, size)]

    for scale in [1.0, 0.5]:
        img = resize(luv, (luv.shape[0] * scale, luv.shape[1] * scale))
        img = conv_tri(img, grd_smooth_rad)
        magnitude, orientation = gradient(img, grd_norm_rad)
        downscale = max(1, int(shrink * scale))
        hist = histogram(magnitude, orientation, downscale, n_orient)
        channels.append(resize(magnitude, size)[:, :, None])
        channels.append(resize(hist, size))
    channels = N.concatenate(channels, axis=2)

    reg_smooth_rad = self.options["reg_smooth_rad"] / float(shrink)
    ss_smooth_rad = self.options["ss_smooth_rad"] / float(shrink)

    if reg_smooth_rad > 1.0:
        reg_ch = conv_tri(channels, int(round(reg_smooth_rad)))
    else:
        reg_ch = conv_tri(channels, reg_smooth_rad)

    if ss_smooth_rad > 1.0:
        ss_ch = conv_tri(channels, int(round(ss_smooth_rad)))
    else:
        ss_ch = conv_tri(channels, ss_smooth_rad)

    return reg_ch, ss_ch
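# NOTE: here `histogram` pools gradient magnitude into orientation bins. The
# function below is a rough, hypothetical sketch of that behaviour (block
# pooling over downscale x downscale cells), not the routine actually used above.
import numpy as np

def histogram(magnitude, orientation, downscale, n_orient):
    h, w = magnitude.shape
    hb, wb = h // downscale, w // downscale
    out = np.zeros((hb, wb, n_orient))
    # Map orientations in [0, pi) to integer bins.
    bins = np.minimum((orientation / np.pi * n_orient).astype(int), n_orient - 1)
    for i in range(hb * downscale):
        for j in range(wb * downscale):
            out[i // downscale, j // downscale, bins[i, j]] += magnitude[i, j]
    return out / (downscale * downscale)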
def train(self, data):
    self._processor.process_examples(data)
    articles_per_tag = defaultdict(lambda: [])
    for example in data:
        tag = self.normalize_tag_label(example['tag'])
        if tag in self.IGNORE_TAGS:
            continue
        articles_per_tag[tag].append(example['tokens'])
    self._ntokens_per_tag = {
        tag: histogram(token for article in articles for token in article)
        for tag, articles in articles_per_tag.iteritems()
    }
    self._ndocs_per_tag = {
        tag: len(articles)
        for tag, articles in articles_per_tag.iteritems()
    }
    self._ndocs = sum(self._ndocs_per_tag.values())
    self._vocab = set(t for tag, tokens in self._ntokens_per_tag.iteritems()
                      for t in tokens.keys())
    self._tags = list(self._ntokens_per_tag.keys())
    self._weights = self._compute_weights()
    # Dump per-tag token statistics for inspection.
    for tag, tokens in self._ntokens_per_tag.iteritems():
        total = sum(tokens.values())
        with open('/Users/bernardorufino/pastebin/classifier/{}.dat'.format(tag), 'w') as f:
            for token, n in sorted(tokens.iteritems(), key=lambda (t, n): n, reverse=True):
                f.write("{:<14} {:<5} {:<5.2f} {:<5.2f}\n".format(
                    token, n, float(n) / total, self._weights[token]))
            f.write('\n')
def dram_multiple_contours(img, contours, max_contours=10, approximate=False):
    # Draw in blue the contours that were found
    image_entropy = img.copy()
    cv2.drawContours(img, contours, -1, 255, 3)
    # Sort the contours by area, biggest first
    c = sorted(contours, key=cv2.contourArea, reverse=True)
    # Draw the biggest non-overlapping, high-entropy contours in green
    overlap_area = np.zeros((max_contours, 4))
    for i in range(max_contours):
        x, y, w, h = cv2.boundingRect(c[i])
        entropy_computed = entropy(
            histogram(crop_image(image_entropy, (x, y, w, h))))
        print(overlap_area)
        if entropy_computed > 7:
            if not overlap(overlap_area, (x, y, w, h), i):
                print(overlap(overlap_area, (x, y, w, h), i))
                cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
                print(x, y, w, h)
                overlap_area[i, :] = x, y, w, h
for C in C_list_log:
    clf = SVM.gaussian_kernel(label, data, 0.0, 80, 10**C)
    free_SV, free_SV_coef = SVM.free_SV(clf, 10**C)
    SV = SVM.get_SV(clf)
    SV_coef = SVM.get_dual_coef(clf)
    dis = SVM.cal_dis(SV, SV_coef[0], free_SV[0])
    dis_list.append(dis)
utils.curve(C_list_log, dis_list, '14.png', 'log(C)', 'dis')

# question 15
gamma_list = [0, 1, 2, 3, 4]
C = 0.1
E_out_list = []
for gamma in gamma_list:
    clf = SVM.gaussian_kernel(label, data, 0.0, 10**gamma, C)
    E_out_list.append(SVM.error_0_1(utils.which_binary(test_label, 0), test_data, clf))
utils.curve(gamma_list, E_out_list, '15.png', 'log(gamma)', 'E_out')

# question 16
C = 0.1
gamma_list = [-1, 0, 1, 2, 3]
gamma_pick = [0, 0, 0, 0, 0]
for i in xrange(100):
    val_label, val_data, train_label, train_data = utils.split_data(label, data, 1000)
    E_val_list = []
    for gamma in gamma_list:
        clf = SVM.gaussian_kernel(train_label, train_data, 0.0, 10**gamma, C)
        E_val_list.append(SVM.error_0_1(utils.which_binary(val_label, 0), val_data, clf))
    gamma_pick[E_val_list.index(max(E_val_list))] += 1
utils.histogram(gamma_list, gamma_pick, '16.png', 'log(gamma)', '#selected')
def processInputData(self, *args):
    image, label = super(GramHistoResizeModel, self).processInputData(*args)
    return image, histogram(image), label
train_lh, train_prior = naive_bayes.naive_bayes(train_data)

#=====================================
# 2j. plot and predict the movies
#
movies = ['Finding Nemo', 'The Matrix', 'Gone with the Wind',
          'Harry Potter and the Goblet of Fire', 'Avatar']
test_movies = findMovie(all_movies, movies)
for tm in test_movies:
    predicted_y = naive_bayes.predict(train_lh, train_prior, utils.bags(tm['summary']))
    minY, maxY = naive_bayes.findMinMaxY(predicted_y)
    x = []
    y = []
    for year in predicted_y:
        x.append(year)
        y.append(predicted_y[year] + abs(predicted_y[minY]))
    utils.histogram(x, y, 'Decade', 'Posterior Probability',
                    tm['title'] + ' (' + str(tm['year']) + ') Histogram of Posterior Probability for each decade')
    print tm['title'] + ' is done.', 'Predicted decade ' + str(maxY), 'Real decade ' + str(tm['year'])

#======================================
# 2k. Accuracy measurement
#
accuracy = 0
for d in test_data:
    predicted_y = naive_bayes.predict(train_lh, train_prior, utils.bags(d['summary']))
    minY, maxY = naive_bayes.findMinMaxY(predicted_y)
    if maxY == d['year']:
        accuracy += 1
accuracy /= float(len(test_data))
print 'The accuracy of the model on test data is ', accuracy
    3.03, 1.79, 0.78, 0.82, 0.00, 0.92, 0.69, 1.07, 2.26, 0.61, 0.62, 0.00,
    1.10, 0.86, 1.17, 0.48, 1.09, 0.53, 0.94, 0.63, 0.63, 0.86, 0.68, 0.63,
    0.49, 0.44, 0.33, 0.28, 0.36, 0.99, 0.49, 0.53, 0.65, 0.49, 0.73, 0.48,
    0.40, 0.90, 0.80, 0.52, 0.67, 0.94, 0.89, 0.69, 0.62, 0.84, 0.29, 0.51,
    0.75, 0.52, 0.99, 0.30, 0.36, 0.48, 0.48, 0.31, 0.38, 0.33, 0.35, 0.50,
    1.31, 0.34, 0.43, 0.52, 0.32, 0.56, 0.62, 0.56, 0.79, 0.30, 0.53, 0.36,
    0.47, 0.33, 0.50, 0.63, 0.65, 0.49, 0.42, 0.34, 0.45, 0.53, 5.17, 0.63,
    0.61, 0.65, 0.39, 0.53, 0.73, 0.39, 0.39, 0.29, 0.29, 0.28, 0.47, 0.36,
    0.86, 0.53, 0.50, 0.29, 0.45, 0.49, 0.44, 0.25, 0.31, 0.40, 0.63, 0.26,
    0.71, 0.58, 0.57, 0.41, 0.53, 1.16, 0.32, 0.14, 0.15, 0.23, 0.10, 0.15,
    1.20, 0.52
)

# %%
# Campaign 0 (Stage 1) Stamps
# Campaign 1 (Stage 2) Pages and stamps <<<
# Campaign 2 (Stage 3) Only pages
# Campaign 4 (Stage 5) Pages and stamps <<<
utils.histogram(data=stage2b, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Stage2 - Time per image - Reproduced results',
                bins='auto', counter=0)
utils.histogram(data=stage3b, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Stage3 - Time per image - Reproduced results',
                bins='auto', counter=0)
utils.histogram(data=stage4b, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Campaign4 - Time per image', bins='auto', counter=0)

# %%
utils.histogram(data=stage2_labels, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Stage2 - Time per label', bins='auto', counter=0)
utils.histogram(data=stage3_labels, x_label='Time (mins)', y_label='Frequency',
                title='Labeling for Stage3 - Time per label', bins='auto', counter=0)
all_movies = list(
    parse_movies.load_all_movies(os.path.join(config.baseDir, config.data_file)))

#==============================================
# 2a. PMF of P(Y)
#==============================================
pmf, data_year = cal_pmf(all_movies)
n = len(data_year)
x = []
y = []
for year, amount in pmf.iteritems():
    x.append(year)
    y.append(float(amount) / float(n))
utils.histogram(x, y, 'Decade', 'PMF', 'PMF of P(Y)')
print 'PMF of P(Y) done'

#==============================================
# 2b. PMF of P(Y|X"radio">0)
#==============================================
pmf, data_year = cal_pmf(all_movies, 'radio')
n = len(data_year)
x = []
y = []
for year, amount in pmf.iteritems():
    x.append(year)
    y.append(float(amount) / float(n))
utils.histogram(x, y, 'Decade', 'PMF', 'PMF of P(Y|X"radio">0)')
print 'PMF of P(Y|X"radio">0) done'
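# NOTE: `cal_pmf` is not defined in this excerpt. A hypothetical reconstruction
# consistent with how its return values are used above (a year -> count map
# plus the list of matching years, optionally filtered on a keyword) could be:
from collections import Counter

def cal_pmf(movies, word=None):
    years = [m['year'] for m in movies
             if word is None or word in m['summary'].lower().split()]
    return Counter(years), years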
Created on 7 Mar. 2017

Generates the histograms needed in the task

@author: jorge
'''
from utils import histogram
import networkx as nx
import matplotlib.pyplot as plt

path = 'A1-networks/'
files = ['model/ER1000k8.net', 'model/SF_1000_g2.7.net', 'model/ws1000.net',
         'real/airports_UW.net']
names = ['ER1000k8', 'SF_1000_g2.7', 'ws1000', 'airports_UW']

for i in range(len(files)):
    G = nx.read_pajek(path + files[i])
    plt = histogram(G, log=True, norm=True, n=10)
    plt.title('Log histogram for ' + names[i])
    plt.savefig('log_' + names[i] + '.png')
    plt.clf()
    plt = histogram(G, log=False, norm=True, n=10)
    plt.title('Normed histogram for ' + names[i])
    plt.savefig('norm_' + names[i] + '.png')
    plt.clf()
    plt = histogram(G, log=True, norm=True, cumu=-1, n=10)
    plt.title('Cumulative log histogram for ' + names[i])
    plt.savefig('Cumu_log_' + names[i] + '.png')
    plt.clf()
    plt = histogram(G, log=False, norm=True, cumu=-1, n=10)
    plt.title('Cumulative normed histogram for ' + names[i])
    plt.savefig('Cumu_norm_' + names[i] + '.png')
    plt.clf()
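# NOTE: `utils.histogram` here plots a degree distribution for a graph and
# returns the pyplot module. The version below is a hedged sketch of what such
# a helper might look like (the log/normed/cumulative variants are guesses),
# not the module actually used above.
import numpy as np
import matplotlib.pyplot as plt

def histogram(G, log=False, norm=False, cumu=0, n=10):
    degrees = np.array([d for _, d in G.degree()])
    if log:
        bins = np.logspace(np.log10(max(degrees.min(), 1)),
                           np.log10(degrees.max()), n + 1)
    else:
        bins = np.linspace(degrees.min(), degrees.max(), n + 1)
    counts, edges = np.histogram(degrees, bins=bins, density=norm)
    if cumu:
        # Reverse-cumulative distribution: fraction of nodes with degree >= x.
        counts = np.cumsum(counts[::-1])[::-1]
    plt.bar(edges[:-1], counts, width=np.diff(edges), align='edge')
    return plt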