def buildTree(data):
    if len(data) <= 0:
        return node()
    currentEnt = entropy(data)
    bestGain = 0.0
    bestCriteria = None
    bestSets = None
    dimension = len(data[0]) - 1  # last column holds the class label
    for feature in range(dimension):
        # collect the distinct values of this feature
        feature_values = {}
        for item in data:
            feature_values[item[feature]] = 1
        for value in feature_values.keys():
            set1, set2 = split(data, feature, value)
            p = float(len(set1)) / len(data)
            infoGain = currentEnt - p * entropy(set1) - (1 - p) * entropy(set2)
            if infoGain > bestGain and len(set1) > 0 and len(set2) > 0:
                bestGain = infoGain
                bestCriteria = (feature, value)
                bestSets = (set1, set2)
    if bestGain > 0:
        leftBranch = buildTree(bestSets[0])
        rightBranch = buildTree(bestSets[1])
        return node(feature=bestCriteria[0], threshold=bestCriteria[1],
                    left=leftBranch, right=rightBranch)
    else:
        return node(results=stats(data))
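# buildTree relies on helpers that are not shown above: node, split, stats, and
# entropy. The sketch below is a minimal, assumed implementation inferred from the
# call sites (last element of each row = class label); the original project's
# helpers may differ.
import math
from collections import Counter

class node(object):
    """Decision-tree node: either a leaf (results) or an internal split."""
    def __init__(self, feature=-1, threshold=None, left=None, right=None, results=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.results = results

def split(data, feature, value):
    """Partition rows on one feature: numeric values split by >=, others by ==."""
    if isinstance(value, (int, float)):
        matches = lambda row: row[feature] >= value
    else:
        matches = lambda row: row[feature] == value
    set1 = [row for row in data if matches(row)]
    set2 = [row for row in data if not matches(row)]
    return set1, set2

def stats(data):
    """Class counts of the last column, used as the leaf prediction."""
    return dict(Counter(row[-1] for row in data))

def entropy(data):
    """Shannon entropy (in bits) of the class labels in the last column."""
    if not data:
        return 0.0
    counts = Counter(row[-1] for row in data)
    total = float(len(data))
    return -sum((c / total) * math.log(c / total, 2) for c in counts.values())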
def calc_info_gain(tau, b, c, M, mu):
    num_items, num_dims = M.shape
    phats_a = np.empty(num_items)
    phats_b = np.empty(num_items)
    mb = M[b]
    mc = M[c]
    for x in xrange(num_items):
        # squared distances among item x and the two candidate items b and c
        ma = M[x]
        delta_ab = np.dot(ma - mb, ma - mb)
        delta_ac = np.dot(ma - mc, ma - mc)
        delta_bc = np.dot(mb - mc, mb - mc)
        tri_peri = delta_ab + delta_bc + delta_ac
        # smoothed probabilities derived from the distances (mu acts as a smoother)
        phats_a[x] = (mu + delta_ab + delta_ac) / (2.*mu + 2*tri_peri)
        phats_b[x] = (mu + delta_bc + delta_ab) / (2.*mu + 2*tri_peri)
    # marginal probability of each outcome under the current belief tau
    p_a = np.sum(tau * phats_a)
    p_b = np.sum(tau * phats_b)
    p_c = 1 - p_a - p_b
    # posterior beliefs for each possible outcome, renormalized
    tau_a = tau * phats_a
    tau_b = tau * phats_b
    tau_c = tau * (1 - phats_a - phats_b)
    tau_a /= tau_a.sum()
    tau_b /= tau_b.sum()
    tau_c /= tau_c.sum()
    # expected information gain = prior entropy minus expected posterior entropy
    return entropy(tau) - p_a*entropy(tau_a) - p_b*entropy(tau_b) - p_c*entropy(tau_c)
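# A small usage sketch for calc_info_gain. The toy embedding M, the uniform belief
# tau, and the entropy helper below are illustrative assumptions, not part of the
# original module.
import numpy as np

def entropy(p):
    """Shannon entropy (bits) of a probability vector, ignoring zero entries."""
    p = np.asarray(p, dtype=float)
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

M = np.random.randn(5, 2)       # 5 items embedded in 2 dimensions
tau = np.full(5, 1.0 / 5)       # uniform belief over the items
gain = calc_info_gain(tau, 1, 2, M, 0.05)   # candidate items b=1, c=2, smoothing mu=0.05
print('expected information gain: %.4f' % gain)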
def process_intermediate_output(self, itr, X, y, net):
    beg = itr * self.batch_size
    end = beg + self.batch_size
    weighted = utils.softmax(net.blobs["weighted_input"].data)
    ip2 = utils.softmax(net.blobs["ip2"].data)
    confidence = net.blobs["confidence"].data
    y_predicted = weighted.argmax(axis=1)
    self.uncertainty[beg:end, 0] = 1 - confidence[xrange(y_predicted.shape[0]), y_predicted]
    self.uncertainty[beg:end, 1] = utils.entropy(confidence)
    self.uncertainty[beg:end, 2] = utils.entropy(ip2)
    self.uncertainty[beg:end, 3] = utils.second_max(ip2)
    self.uncertainty[beg:end, 4] = utils.entropy(weighted)
    self.uncertainty[beg:end, 5] = utils.second_max(weighted)
    self.correct[beg:end] = np.equal(y, y_predicted)
def posterior_entropy(self, positive=[], negative=[], remaining_indices=[]):
    indices, y = self.observation_vector(positive, negative)
    entropies = np.zeros([len(remaining_indices)])
    remaining_indices_set = set(remaining_indices)
    for k, unseen_index in enumerate(remaining_indices):
        # suppose we observe this element. What is the new entropy?
        test_indices = indices + [unseen_index]
        prob_pos = norm_pdf(np.concatenate([y, [0.5]]),
                            self.covariance[test_indices, :][:, test_indices])
        prob_neg = norm_pdf(np.concatenate([y, [-1.0]]),
                            self.covariance[test_indices, :][:, test_indices])
        # marginalize:
        Z = (prob_pos + prob_neg)
        prob_pos /= Z
        prob_neg /= Z
        # get the entropy of remaining elements:
        post_indices = remaining_indices_set.copy()
        post_indices.remove(unseen_index)
        post_probs_pos = self._calculate_conditional_probabilities(
            np.concatenate([y, [0.5]]), test_indices, post_indices, self.covariance)
        post_probs_neg = self._calculate_conditional_probabilities(
            np.concatenate([y, [-.5]]), test_indices, post_indices, self.covariance)
        # expected entropy:
        entropies[k] = (prob_pos * entropy(post_probs_pos) +
                        prob_neg * entropy(post_probs_neg))
    return (remaining_indices, entropies)
def make_money(sender, full_text):
    text_entropy = entropy(full_text)
    if 3.6 <= text_entropy <= 4.2:
        epoch = int(time.time())
        power = repeating_digits(epoch)
        if power:
            amount = 1000 * 2 ** (power - 1)
            database.give_money(sender, amount)
            timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
            bux_log.write(f"{timestamp} {epoch} - P{power} E{text_entropy:.2f} - "
                          f"{amount/1000} bux - {sender} - {full_text}")
def findNode(self, address):
    """A DHT crawler client must implement at least find_node.
    The main job of this method is to keep introducing ourselves so that
    more and more nodes get to know us. The crawler only needs to know
    (160 - 2) * K nodes.
    """
    tid = entropy(TID_LENGTH)  # random transaction id
    msg = {
        "t": tid,
        "y": "q",
        "q": "find_node",
        "a": {"id": self.table.nid, "target": self.snid}
    }
    self.sendQuery(msg, address)
def problem2e(train_data):
    print '2e) Training decision tree using ID3 algorithm'
    review_samples = generate_unigrams(train_data)
    positive_counts, top_positive = samples_by_label(review_samples, 500, 1)
    negative_counts, top_negative = samples_by_label(review_samples, 500, 0)
    feature_set = derive_features(top_positive, top_negative)
    print ('\tDecision tree feature/attribute set includes %d total words'
           % len(feature_set))
    print ('\tStarting entropy of the review set with %d samples is %0.5f'
           % (len(review_samples),
              utils.entropy(np.array([sample.rating for sample in review_samples]))))
           #(len(review_samples), utils.entropy([sample.rating for sample in review_samples])))
    print
    print
    return decision_tree.train(review_samples, feature_set)
def __init__(self, codec=None):
    if codec:
        self._source_len = len(codec._source)
        self._entropy = entropy(codec._source)
        self._hist = codec._hist if codec._hist else histogram(codec._source)
        self._symbol_size = int(math.ceil(math.log(max(self._hist.keys()) or 1, 2)))
        self._cr = float(self._source_len) * self._symbol_size / codec._stream_len
        self._mean_code_len = float(codec._stream_data_len) / self._source_len
        self._source_size = self._symbol_size * self._source_len
        self._stream_size = codec._stream_len
    else:
        self._source_len = 0
        self._entropy = 0
        self._hist = {}
        self._cr = 0
        self._mean_code_len = 0
        self._symbol_size = 0
        self._source_size = 0
        self._stream_size = 0
def sweep(data, model, max_k=10, verbose=True):
    log_probs, bics, icls = [], [], []
    best_responsibilities, best_means, best_phis, best_k = None, None, None, 0
    best_bic = -sys.maxint
    for k in range(1, max_k + 1):
        means, phis, log_prob, responsibilities = model(data, k, verbose=False)
        # k classes, each with 1 additional associated param
        num_params = 2 * k
        # bic
        bic = utils.bic(data, log_prob, num_params)
        # icl = bic minus entropy of responsibilities
        icl = bic - utils.entropy(responsibilities)
        log_probs.append(log_prob)
        bics.append(bic)
        icls.append(icl)
        if bic > best_bic:
            best_k = k
            best_bic = bic
            best_responsibilities = responsibilities
            best_means = means
            best_phis = phis
        if verbose:
            print '\nk: {}\tlog_prob: {:.5f}\tbic: {:.5f}'.format(k, log_prob, bic)
            print 'phis: {}'.format(phis)
            print 'means: {}'.format(means)
            # utils.plot_1d_data_responsibilities(data, responsibilities, means)
            # utils.plot_data_responsibilities(data, responsibilities, means)
    return best_k, log_probs, bics, icls, best_responsibilities, best_means, best_phis
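# sweep assumes utils.bic returns a "larger is better" score (it is maximized above)
# and utils.entropy returns the total entropy of the responsibilities. A minimal
# sketch consistent with that convention; the original utils module may define
# these differently.
import numpy as np

def bic(data, log_prob, num_params):
    """'Larger is better' BIC: total log-likelihood minus (num_params / 2) * log(n)."""
    n = len(data)
    return log_prob - 0.5 * num_params * np.log(n)

def entropy(responsibilities):
    """Total entropy of the soft cluster assignments, summed over data points."""
    r = np.clip(np.asarray(responsibilities, dtype=float), 1e-12, 1.0)
    return float(-np.sum(r * np.log(r)))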
def choose_active(net, dataset, num_batches_to_choose):
    print 'Choosing active samples....'
    total_num_batches = dataset.num_batches()
    total_num_samples = len(dataset)
    num_batches_to_choose = min(num_batches_to_choose, total_num_batches)

    # reserve storage for outputs
    uncert = np.zeros(total_num_samples, dtype=np.float32)
    correct = np.zeros(total_num_samples, dtype=bool)
    keys = np.zeros(total_num_samples, dtype=np.object)
    labels = np.zeros(total_num_samples, dtype=np.uint8)

    dataset.open()
    for batch_num, batch in enumerate(dataset):
        utils.wait_bar('Batch number', '', batch_num + 1, total_num_batches)
        beg = batch_num * net.batch_size
        end = beg + net.batch_size
        X, y, batch_keys = batch
        output = net.forward(X)[0]
        uncert[beg:end] = utils.entropy(output)
        correct[beg:end] = np.equal(output.argmax(axis=1), y)
        keys[beg:end] = batch_keys
        labels[beg:end] = y
    dataset.close()

    print
    print 'Accuracy = {0}, mean uncertainty = {1}'.format(correct.mean(), uncert.mean())
    num_samples_to_choose = num_batches_to_choose * net.batch_size
    chosen_keys = criterium_balanced(uncert, correct, keys, labels, num_samples_to_choose)
    num_to_return = (len(chosen_keys) / net.batch_size) * net.batch_size
    chosen_keys = chosen_keys[:num_to_return]
    print 'Returning {0} new samples'.format(len(chosen_keys))
    return chosen_keys
            outputs_chunk.append(out)
        outputs_chunk = np.vstack(outputs_chunk)
        outputs_chunk = outputs_chunk[:chunk_length]  # truncate to the right length

        if augment and num_test_tfs > 1:
            print "  average over augmentation transforms"
            if remainder is not None:
                # tack on the remainder from the previous iteration
                outputs_chunk = np.vstack([remainder, outputs_chunk])
            l = (outputs_chunk.shape[0] // num_test_tfs) * num_test_tfs
            remainder = outputs_chunk[l:]  # new remainder
            if avg_method == "avg-probs-ent":
                # entropy-weighted averaging
                outputs_chunk = outputs_chunk[:l]
                h = utils.entropy(outputs_chunk)
                outputs_chunk *= np.exp(-h)[:, None]
                outputs_chunk = outputs_chunk.reshape(l // num_test_tfs, num_test_tfs,
                                                      outputs_chunk.shape[1]).sum(1)
                z = np.exp(-h).reshape(l // num_test_tfs, num_test_tfs).sum(1)
                outputs_chunk /= z[:, None]
            else:
                outputs_chunk = outputs_chunk[:l].reshape(l // num_test_tfs, num_test_tfs,
                                                          outputs_chunk.shape[1]).mean(1)

        outputs.append(outputs_chunk)

    assert (remainder is None) or remainder.size == 0  # make sure we haven't left any predictions behind
    outputs = np.vstack(outputs)

    if avg_method == "avg-logits":
        print "Passing averaged logits through the softmax"
import datareader
import utils
from numpy import histogram
from math import log
from scipy.stats import chi2_contingency

d = datareader.Data()
d.read_from_files()
pl = d.get_pid_list(lambda(p): ('nod2' in p and p.health in ['control', 'ileal CD']
                                and len(p.fractions) > 0))
data = d
sick = data.get_data(pl, 'health', bucketizer=lambda(v): v == 'ileal CD')

# rank species by the entropy of their (log-bucketed) abundance fractions
ent = {}
for species in data.bacteria:
    vals = data.get_data(pl, 'fractions', species, bucketizer=lambda(x): int(log(x, 10)))
    ent[species] = utils.entropy(vals)
spe = ent.keys()
spe.sort(key=lambda x: ent[x], reverse=True)
species_by_entropy = spe

pvs = {}
for species in data.bacteria:
    vals = data.get_data(pl, 'fractions', species)
    co = utils.findcutoff(vals, sick)
    if co.sick_when_more == None:
        continue
    boolvals = [(i > co.threshold) == (co.sick_when_more) for i in vals]
    cnts = utils.count(zip(sick, boolvals))
    try:
        pvs[species] = chi2_contingency(cnts)[1]
def add_summary_stats(self, batch, outputs):
    if self.dir is None:
        print("No output directory! Results not being recorded.")
        return
    supervised = 'annotations' in batch.instances[0].keys()
    b, nq, ns, nt = outputs['attention'].shape
    attention_entropy = entropy(outputs['attention'].view(b, nq, ns*nt))
    traceback_attention_entropy = entropy(outputs['traceback_attention'].view(b, nq, ns*nt))
    columns = ['code_name', 'code_idx', 'attention_entropy', 'traceback_attention_entropy',
               'label', 'score', 'depth', 'num_report_sentences', 'num_report_clusters',
               'patient_id', 'timepoint_id',
               # 'reference_sentence_indices', 'reference_sentence_rankings', 'reference_sentence_attention']
               'reference_sentence_indices', 'reference_sentence_rankings', 'sentence_attention']
    rows = []
    for b in range(len(batch)):
        patient_id = int(batch.instances[b]['original_reports'].patient_id.iloc[0])
        last_report_id = batch.instances[b]['original_reports'].index[-1]
        tokenized_sentences = batch.instances[b]['tokenized_sentences']
        if supervised:
            annotations = eval(batch.instances[b]['annotations'])
        for s in range(outputs['num_codes'][b]):
            code = outputs['codes'][b, s].item()
            codename = self.code_names[code]
            attn_ent = attention_entropy[b, s].item()
            traceback_attn_ent = traceback_attention_entropy[b, s].item()
            label = outputs['labels'][b, s].item()
            score = outputs['scores'][b, s].item() if 'scores' in outputs.keys() else None
            depth = self.hierarchy.depth(codename)
            num_report_sentences = (outputs['article_sentences_lengths'][b] > 0).sum().item()
            if supervised:
                num_report_clusters = len(outputs['clustering'][b][s])
                sentences = [' '.join(tokenized_sentences[cluster[0]])
                             for cluster in outputs['clustering'][b][s]]
                summary = '\n'.join(sentences[:self.k])
                id = len(os.listdir(self.system_dir))
                with open(os.path.join(self.system_dir, 'summary_%i_system.txt' % id), 'w') as f:
                    f.write(summary)
                reference_sentence_indices_set = set([])
                for annotator, v in annotations.items():
                    reference_sentence_indices = [int(i) for i in v['past-reports']['tag_sentences'][codename]]
                    reference_sentence_indices_set.update(reference_sentence_indices)
                    reference = '\n'.join([' '.join(tokenized_sentences[i])
                                           for i in reference_sentence_indices])
                    with open(os.path.join(self.reference_dir, 'summary_%i_%s.txt' % (id, annotator)), 'w') as f:
                        f.write(reference)
                sentence_to_ranking = {sentence_idx: i
                                       for i in range(len(sentences))
                                       for sentence_idx in outputs['clustering'][b][s][i]}
                reference_sentence_indices = sorted(list(reference_sentence_indices_set))
                reference_sentence_rankings = [sentence_to_ranking[i]
                                               for i in sorted(list(reference_sentence_indices_set))]
                # reference_sentence_attention = [outputs['attention'][b, s, i].sum().item()
                #                                 for i in sorted(list(reference_sentence_indices_set))]
                sentence_attention = [outputs['attention'][b, s, outputs['clustering'][b][s][i][0]].sum().item()
                                      for i in range(len(sentences))]
            else:
                num_report_clusters = None
                reference_sentence_indices = None
                reference_sentence_rankings = None
                # reference_sentence_attention = None
                sentence_attention = None
            # NOTE: cannot include summaries here because this file might be emailed and summaries contain phi!
            rows.append([
                codename,
                code,
                attn_ent,
                traceback_attn_ent,
                label,
                score,
                depth,
                num_report_sentences,
                num_report_clusters,
                patient_id,
                last_report_id,
                reference_sentence_indices,
                reference_sentence_rankings,
                # reference_sentence_attention,
                sentence_attention,
            ])
    df = pd.DataFrame(rows, columns=columns)
    file = os.path.join(self.dir, 'summary_stats.csv')
    header = not os.path.exists(file)
    df.to_csv(file, mode='a', header=header)
def test_entropy(self):
    tutils.raises(ValueError, utils.entropy, "foo", 64, 0)
    assert utils.entropy("a"*64, 64, 1) == 0
    d = "".join([chr(i) for i in range(256)])
    assert utils.entropy(d, 64, 1) == 1
import numpy as np
from collections import defaultdict
from scipy.special import logsumexp

from utils import k_nearest_interpolation, entropy


if __name__ == "__main__":
    labels = ['a', 'b', 'c']
    dists = [1, 2, 3]
    probs = k_nearest_interpolation(dists, labels)
    print('Probabilities: %r' % probs)
    print('Entropy: %.2f' % entropy(probs))

    # Note that you could have many nearest neighbours that have the same label
    labels = ['a', 'a', 'b', 'c']
    dists = [1, 2, 2, 3]
    probs = k_nearest_interpolation(dists, labels)
    print('Probabilities: %r' % probs)
    print('Entropy: %.2f' % entropy(probs))
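# The demo above imports k_nearest_interpolation and entropy from a local utils
# module. A plausible sketch is given below, assuming inverse-distance weighting
# of the neighbour labels; the real helpers may use a different weighting scheme
# (e.g. a softmax over negative distances via logsumexp).
import math
from collections import defaultdict

def k_nearest_interpolation(dists, labels):
    """Turn neighbour distances into a label distribution using 1/d weights."""
    weights = defaultdict(float)
    for dist, label in zip(dists, labels):
        weights[label] += 1.0 / dist          # assumes dist > 0
    total = sum(weights.values())
    return {label: w / total for label, w in weights.items()}

def entropy(probs):
    """Shannon entropy (bits) of a {label: probability} mapping."""
    return -sum(p * math.log(p, 2) for p in probs.values() if p > 0)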
def main(classification=True, tournament=False, x_validation=False, devset=False,
         randomize=False, verbose=False, plot_roc=False, multiprocessing=False):
    """main function"""
    if devset:
        print("Using development dataset")
        set_file = '../data/devset.json'
        wocc_file = '../data/devset_words_occurrence.txt'
    else:
        print("Using full dataset")
        set_file = '../data/dataset.json'
        wocc_file = '../data/dataset_words_occurrence.txt'

    print("Collecting data...")
    dataset = u.json_to_tweets(set_file, False)

    if verbose:
        f = "%Y-%m-%d"
        oldest_tweet, newest_tweet = u.tweets_date_range(dataset)
        print("Oldest tweet was posted on %s" % oldest_tweet.strftime(f))
        print("Newest tweet was posted on %s" % newest_tweet.strftime(f))
        print("Date range is %d day(s)" % (newest_tweet - oldest_tweet).days)

    print("Loading words occurrencies...")
    words_occ = u.words_occ_to_dict(wocc_file)
    print("Computing term frequency...")
    words_tf = u.words_occ_to_tf(words_occ)
    print("Computing term frequency - inverse document frequency...")
    words_tf_idf = u.words_occ_to_tfidf(words_occ)

    if randomize:
        print("Randomizing dataset...")
        random.shuffle(dataset)

    # list of objects containing the feature classes
    feat_objs = [
        #fake_feature.FakeFeature(),
        is_a_retweet_feature.IsARetweetFeature(),
        is_a_reply_feature.IsAReplyFeature(),
        followers_count_feature.FollowersCountFeature(),
        #tweet_age_feature.TweetAgeFeature(),
        tweet_length_feature.TweetLengthFeature(),
        statuses_count_feature.StatusesCountFeature(),
        hashtag_count_feature.HashtagCountFeature(),
        user_mentions_count_feature.UserMentionsCountFeature(),
        favorite_count_feature.FavoriteCountFeature(),
        has_url_feature.HasUrlFeature(),
        friends_count_feature.FriendsCountFeature(),
        #verified_account_feature.VerifiedAccountFeature(),
        #tf_feature.Tf(data=words_tf),
        #tf_idf_feature.TfIdf(data=words_tf_idf)
    ]

    # list of objects containing the classifier classes
    classif_objs = [
        #nb.NaiveBayes(plot_roc),
        nbs.NaiveBayesScikit(plot_roc),
        svm_rbf.SVMRBF(plot_roc),
        svm_sigmoid.SVMSigmoid(plot_roc),
        #svm_poly.SVMPoly(plot_roc),
        #svm_linear.SVMLinear(plot_roc),
        #me.MaxEnt(plot_roc),
        mes.MaxEntScikit(plot_roc),
        dts.DecisionTreeScikit(plot_roc),
        #dt.DecisionTree(plot_roc),
        mv.MajorityVote(plot_roc),
        lda.LDA(plot_roc)
    ]

    if verbose:
        print("\nFeatures activated:")
        for feat in feat_objs:
            print("- %s" % str(feat))
        print("\nClassifiers used:")
        for cl in classif_objs:
            print("- %s" % str(cl))
        print("")

    # extract features and build a list of instances
    instances, labels = extract_instances(dataset, feat_objs)

    # TODO : make the feature selection optional
    fs.FeaturesSelection.chi2(instances, labels)

    print("\nEntropy of the labels:")
    print(u.entropy(labels))
    print("")

    if classification:
        size = int(math.floor(len(instances) * 0.25))
        train_data = instances[0:-size+1]
        test_data = instances[-size+1:]
        train_labels = labels[0:-size+1]
        test_labels = labels[-size+1:]
        if multiprocessing:
            c_thread = Process(target=classification_routine,
                               args=(train_data, test_data, train_labels, test_labels, classif_objs))
            print("Starting classification thread...")
            c_thread.start()
        else:
            classification_routine(train_data, test_data, train_labels, test_labels, classif_objs)

    if x_validation:
        if multiprocessing:
            xv_thread = Process(target=cross_validation, args=(instances, labels, classif_objs))
            print("Starting cross-validation thread...")
            xv_thread.start()
        else:
            ave, _, _ = cross_validation(instances, labels, classif_objs)
            print('average accuracy :')
            print(ave)

    if tournament:
        if multiprocessing:
            t_thread = Process(target=algorithm_tournament,
                               args=(instances, labels, classif_objs))
            print("Starting tournament thread...")
            t_thread.start()
        else:
            algorithm_tournament(instances, labels, classif_objs)

    if multiprocessing:
        if classification:
            c_thread.join()
        if x_validation:
            xv_thread.join()
        if tournament:
            t_thread.join()
def unigram_entropy(self):
    """return entropy of this unigram distribution in bits"""
    return entropy(self.relcounts[1])
    optimizer.zero_grad()  # clear gradient buffers

    # compute loss
    target_values = fitness_list.mean(1).view(-1, 1)
    critic_loss = value_loss(values, target_values)
    logp = models.log_probs(samples, policy)
    loss = (logp * critic_loss).mean()

    loss.backward()    # update gradient buffers
    optimizer.step()   # update model's parameters

    # --------------------- logger --------------------- #
    with torch.no_grad():
        h = utils.entropy(policy)

    dl.push(fitness_list=fitness_list.cpu().numpy())
    dl.push(other={'iteration': it,
                   'entropy': h.item(),
                   'loss': loss.item(),
                   })
    print(it+1, '/', ITERS, end=' ')
    dl.print()

if LOG_FILE != None:
    dl.to_csv(LOG_FILE, ITERS)
# dl.plot()
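# The logger above calls utils.entropy(policy). A minimal sketch, assuming policy
# holds a batch of action-probability rows; the real helper may compute this
# differently (e.g. from logits).
import torch

def entropy(policy):
    """Mean entropy of a batch of categorical distributions (each row sums to 1)."""
    p = policy.clamp_min(1e-12)
    return -(p * p.log()).sum(dim=-1).mean()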