Example #1
def buildTree(data):
    if len(data) <= 0: return node()

    currentEnt = entropy(data)
    bestGain = 0.0
    bestCriteria = None
    bestSets = None

    dimension = len(data[0]) - 1

    for feature in range(dimension):
        feature_values = {}
        for item in data:
            feature_values[item[feature]] = 1
        for value in feature_values.keys():
            set1, set2 = split(data, feature, value)
            p = float(len(set1)) / len(data)
            infoGain = currentEnt - p * entropy(set1) - (1 - p) * entropy(set2)
            if infoGain > bestGain and len(set1) > 0 and len(set2) > 0:
                bestGain = infoGain
                bestCriteria = (feature, value)
                bestSets = (set1, set2)

    if bestGain > 0:
        leftBranch = buildTree(bestSets[0])
        rightBranch = buildTree(bestSets[1])
        return node(feature=bestCriteria[0], threshold=bestCriteria[1], left=leftBranch, right=rightBranch)
    else:
        return node(results=stats(data))
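
The tree builder above leans on two helpers that are not shown: entropy, the Shannon entropy of the class labels stored in the last column of each row, and split, which partitions the rows on a feature value. A minimal sketch of both, assuming numeric features split on a threshold and everything else on equality:

import math

def entropy(rows):
    # Shannon entropy (in bits) of the class labels in the last column.
    counts = {}
    for row in rows:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    ent = 0.0
    for count in counts.values():
        p = float(count) / len(rows)
        ent -= p * math.log(p, 2)
    return ent

def split(rows, feature, value):
    # Partition rows on one feature: numeric values by threshold, others by equality.
    if isinstance(value, (int, float)):
        test = lambda row: row[feature] >= value
    else:
        test = lambda row: row[feature] == value
    set1 = [row for row in rows if test(row)]
    set2 = [row for row in rows if not test(row)]
    return set1, set2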
Example #2
def calc_info_gain(tau, b, c, M, mu):
    num_items, num_dims = M.shape
    phats_a = np.empty(num_items)
    phats_b = np.empty(num_items)
    mb = M[b]
    mc = M[c]
    delta_bc = np.dot(mb - mc, mb - mc)
    for x in xrange(num_items):
        ma = M[x]
        delta_ab = np.dot(ma - mb, ma - mb)
        delta_ac = np.dot(ma - mc, ma - mc)
        tri_peri = delta_ab + delta_bc + delta_ac

        phats_a[x] = (mu + delta_ab + delta_ac) / (2. * mu + 2 * tri_peri)
        phats_b[x] = (mu + delta_bc + delta_ab) / (2. * mu + 2 * tri_peri)

    p_a = np.sum(tau * phats_a)
    p_b = np.sum(tau * phats_b)
    p_c = 1 - p_a - p_b

    tau_a = tau * phats_a
    tau_b = tau * phats_b
    tau_c = tau * (1 - phats_a - phats_b)
    tau_a /= tau_a.sum()
    tau_b /= tau_b.sum()
    tau_c /= tau_c.sum()

    return entropy(tau) - p_a*entropy(tau_a) - p_b*entropy(tau_b) - (p_c)*entropy(tau_c)
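
A minimal usage sketch for the routine above, assuming numpy is imported as np, that entropy is the Shannon entropy of a probability vector, and Python 2 to match the xrange in the loop; M, tau, b, c and mu below are made-up inputs:

import numpy as np

def entropy(p):
    # Shannon entropy of a (possibly unnormalized) probability vector.
    p = np.asarray(p, dtype=float)
    p = p / p.sum()
    nz = p > 0
    return -np.sum(p[nz] * np.log(p[nz]))

num_items, num_dims = 20, 2
M = np.random.randn(num_items, num_dims)   # hypothetical item embedding
tau = np.ones(num_items) / num_items       # uniform belief over the items
gain = calc_info_gain(tau, b=3, c=7, M=M, mu=0.05)
print(gain)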
Example #3
    def process_intermediate_output(self, itr, X, y, net):
        beg = itr * self.batch_size
        end = beg + self.batch_size

        weighted = utils.softmax(net.blobs["weighted_input"].data)
        ip2 = utils.softmax(net.blobs["ip2"].data)
        confidence = net.blobs["confidence"].data

        y_predicted = weighted.argmax(axis=1)

        self.uncertainty[beg:end, 0] = 1 - confidence[np.arange(y_predicted.shape[0]), y_predicted]
        self.uncertainty[beg:end, 1] = utils.entropy(confidence)
        self.uncertainty[beg:end, 2] = utils.entropy(ip2)
        self.uncertainty[beg:end, 3] = utils.second_max(ip2)
        self.uncertainty[beg:end, 4] = utils.entropy(weighted)
        self.uncertainty[beg:end, 5] = utils.second_max(weighted)

        self.correct[beg:end] = np.equal(y, y_predicted)
Example #4
    def posterior_entropy(self,
                          positive=[],
                          negative=[],
                          remaining_indices=[]):

        indices, y = self.observation_vector(positive, negative)
        entropies = np.zeros([len(remaining_indices)])
        remaining_indices_set = set(remaining_indices)

        for k, unseen_index in enumerate(remaining_indices):
            # suppose we observe this element. What is the new entropy ?
            
            test_indices = indices + [unseen_index]
            
            prob_pos = norm_pdf(np.concatenate([y, [0.5]]),
                     self.covariance[test_indices, :][:, test_indices])
            prob_neg = norm_pdf(np.concatenate([y, [-1.0]]),
                     self.covariance[test_indices, :][:, test_indices])
            
            # marginalize:
            Z = (prob_pos + prob_neg)
            prob_pos /= Z
            prob_neg /= Z
            
            # get the entropy of remaining elements:
            post_indices = remaining_indices_set.copy()
            post_indices.remove(unseen_index)
            post_probs_pos = self._calculate_conditional_probabilities(np.concatenate([y , [0.5]]),
                                                                       test_indices,
                                                                       post_indices,
                                                                       self.covariance)
            post_probs_neg = self._calculate_conditional_probabilities(np.concatenate([y , [-.5]]),
                                                                       test_indices,
                                                                       post_indices,
                                                                       self.covariance)
            # expected entropy:
            entropies[k] = (
                prob_pos * entropy(post_probs_pos) +
                prob_neg * entropy(post_probs_neg)
            )
        return (remaining_indices, entropies)
Example #5
File: bank.py  Project: superseal/raisin
def make_money(sender, full_text):
    text_entropy = entropy(full_text)

    if 3.6 <= text_entropy <= 4.2:
        epoch = int(time.time())
        power = repeating_digits(epoch)
        if power:
            amount = 1000 * 2 ** (power - 1)
            database.give_money(sender, amount)

            timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
            bux_log.write(f"{timestamp} {epoch} - P{power} E{text_entropy:.2f} - {amount/1000} bux - {sender} - {full_text}")
Example #6
File: kdht.py  Project: 89sos98/simDHT
 def findNode(self, address):
     """
     DHT爬虫的客户端至少要实现find_node.
     此方法最主要的功能就是不停地让更多人认识自己.
     爬虫只需认识(160-2) * K 个节点即可
     """
     tid = entropy(TID_LENGTH)
     msg = {
         "t": tid,
         "y": "q",
         "q": "find_node",
         "a": {"id": self.table.nid, "target": self.snid}
     }
     self.sendQuery(msg, address)
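
Note that entropy in this project is not an information measure at all: entropy(TID_LENGTH) draws a random transaction id for the KRPC query. A plausible stand-in (the project's own helper and TID_LENGTH value may differ):

import random

TID_LENGTH = 2  # assumed transaction-id length in bytes

def entropy(length):
    # Return `length` random byte values as a string, used as a KRPC transaction id.
    return "".join(chr(random.randint(0, 255)) for _ in range(length))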
Example #7
File: problem2.py  Project: od0/HW2
def problem2e(train_data):
    print '2e) Training decision tree using ID3 algorithm'
    review_samples = generate_unigrams(train_data)
    positive_counts, top_positive = samples_by_label(review_samples, 500, 1)
    negative_counts, top_negative = samples_by_label(review_samples, 500, 0)

    feature_set = derive_features(top_positive, top_negative)
    print ('\tDecision tree feature/attribute set includes %d total words' % len(feature_set))
    print ('\tStarting entropy of the review set with %d samples is %0.5f' %
        (len(review_samples), utils.entropy(np.array([sample.rating for sample in review_samples]))))
           #(len(review_samples), utils.entropy([sample.rating for sample in review_samples])))
    print
    print

    return decision_tree.train(review_samples, feature_set)
Example #8
File: codec.py  Project: zbanach/koda
 def __init__(self, codec=None):
     if codec:
         self._source_len = len(codec._source)
         self._entropy = entropy(codec._source)
         self._hist = codec._hist if codec._hist else histogram(codec._source)
         self._symbol_size = int(math.ceil(math.log(max(self._hist.keys()) or 1, 2)))
         self._cr = float(self._source_len) * self._symbol_size / codec._stream_len
         self._mean_code_len = float(codec._stream_data_len) / self._source_len
         self._source_size = self._symbol_size * self._source_len
         self._stream_size = codec._stream_len
     else:
         self._source_len = 0
         self._entropy = 0
         self._hist = {}
         self._cr = 0
         self._mean_code_len = 0
         self._symbol_size = 0
         self._source_size = 0
         self._stream_size = 0
Example #9
def sweep(data, model, max_k=10, verbose=True):

    log_probs, bics, icls = [], [], []
    best_responsibilities, best_means, best_phis, best_k = None, None, None, 0
    best_bic = -sys.maxint
    for k in range(1, max_k + 1):
        means, phis, log_prob, responsibilities = model(data, k, verbose=False)

        # k classes, each with 1 additional associated param
        num_params = 2 * k

        # bic 
        bic = utils.bic(data, log_prob, num_params)

        # icl = bic minus entropy of responsibilities
        icl = bic + utils.entropy(responsibilities)

        log_probs.append(log_prob)
        bics.append(bic)
        icls.append(icl)

        if bic > best_bic:
            best_k = k
            best_bic = bic
            best_responsibilities = responsibilities
            best_means = means
            best_phis = phis

        if verbose:
            print '\nk: {}\tlog_prob: {:.5f}\tbic: {:.5f}'.format(k, log_prob, bic)
            print 'phis: {}'.format(phis)
            print 'means: {}'.format(means)
            # utils.plot_1d_data_responsibilities(data, responsibilities, means)
            # utils.plot_data_responsibilities(data, responsibilities, means)

    return best_k, log_probs, bics, icls, best_responsibilities, best_means, best_phis
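
utils.bic is not shown here; because the sweep keeps the candidate with the largest value, it has to follow a higher-is-better convention (a penalized log-likelihood). A minimal sketch under that assumption; the project's actual scaling may differ:

import numpy as np

def bic(data, log_prob, num_params):
    # Higher-is-better BIC: total log-likelihood minus a complexity penalty.
    n = len(data)
    return log_prob - 0.5 * num_params * np.log(n)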
Example #10
def choose_active(net, dataset, num_batches_to_choose):
    print 'Choosing active samples....'

    total_num_batches = dataset.num_batches()
    total_num_samples = len(dataset)
    num_batches_to_choose = min(num_batches_to_choose, total_num_batches)

    # reserve storage for outputs
    uncert = np.zeros(total_num_samples, dtype=np.float32)
    correct = np.zeros(total_num_samples, dtype=bool)
    keys = np.zeros(total_num_samples, dtype=np.object)
    labels = np.zeros(total_num_samples, dtype=np.uint8)

    dataset.open()
    for batch_num, batch in enumerate(dataset):
        utils.wait_bar('Batch number', '', batch_num + 1, total_num_batches)
        beg = batch_num * net.batch_size
        end = beg + net.batch_size

        X, y, batch_keys = batch
        output = net.forward(X)[0]
        uncert[beg:end] = utils.entropy(output)
        correct[beg:end] = np.equal(output.argmax(axis=1), y)
        keys[beg:end] = batch_keys
        labels[beg:end] = y

    dataset.close()
    print
    print 'Accuracy = {0}, mean uncertainty = {1}'.format(correct.mean(), uncert.mean())

    num_samples_to_choose = num_batches_to_choose * net.batch_size
    chosen_keys = criterium_balanced(uncert, correct, keys, labels, num_samples_to_choose)

    num_to_return = (len(chosen_keys) / net.batch_size) * net.batch_size
    chosen_keys = chosen_keys[:num_to_return]
    print 'Returning {0} new samples'.format(len(chosen_keys))
    return chosen_keys
Example #11
        outputs_chunk.append(out)

    outputs_chunk = np.vstack(outputs_chunk)
    outputs_chunk = outputs_chunk[:chunk_length] # truncate to the right length

    if augment and num_test_tfs > 1:
        print "  average over augmentation transforms"
        if remainder is not None: # tack on the remainder from the previous iteration
            outputs_chunk = np.vstack([remainder, outputs_chunk])

        l = (outputs_chunk.shape[0] // num_test_tfs) * num_test_tfs
        remainder = outputs_chunk[l:] # new remainder

        if avg_method == "avg-probs-ent": # entropy-weighted averaging
            outputs_chunk = outputs_chunk[:l]
            h = utils.entropy(outputs_chunk)
            outputs_chunk *= np.exp(-h)[:, None]
            outputs_chunk = outputs_chunk.reshape(l // num_test_tfs, num_test_tfs, outputs_chunk.shape[1]).sum(1)
            z = np.exp(-h).reshape(l // num_test_tfs, num_test_tfs).sum(1)
            outputs_chunk /= z[:, None]
        else:
            outputs_chunk = outputs_chunk[:l].reshape(l // num_test_tfs, num_test_tfs, outputs_chunk.shape[1]).mean(1)

    outputs.append(outputs_chunk)

assert (remainder is None) or remainder.size == 0 # make sure we haven't left any predictions behind
outputs = np.vstack(outputs)


if avg_method == "avg-logits":
    print "Passing averaged logits through the softmax"
Example #12
from numpy import histogram
from math import log

d = datareader.Data()
d.read_from_files()
pl = d.get_pid_list(lambda(p): ('nod2' in p and 
                                   p.health in ['control', 'ileal CD'] and
                                   len(p.fractions)>0))
data=d

sick = data.get_data(pl, 'health', bucketizer=lambda(v):v=='ileal CD')

ent={}
for species in data.bacteria:
    vals = data.get_data(pl, 'fractions', species, bucketizer=lambda(x):int(log(x,10)))
    ent[species] = utils.entropy(vals)

spe=ent.keys()
spe.sort(key=lambda x: ent[x], reverse=True)
species_by_entropy = spe

pvs={}
for species in data.bacteria:
     vals = data.get_data(pl, 'fractions', species)
     co = utils.findcutoff(vals, sick)
     if co.sick_when_more is None:
         continue
     boolvals = [(i>co.threshold)==(co.sick_when_more) for i in vals]
     cnts = utils.count(zip(sick, boolvals))
     try:
         pvs[species]=chi2_contingency(cnts)[1]
Example #13
    def add_summary_stats(self, batch, outputs):
        if self.dir is None:
            print("No output directory! Results not being recorded.")
            return
        supervised = 'annotations' in batch.instances[0].keys()
        b, nq, ns, nt = outputs['attention'].shape
        attention_entropy = entropy(outputs['attention'].view(b, nq, ns*nt))
        traceback_attention_entropy = entropy(outputs['traceback_attention'].view(b, nq, ns*nt))
        columns = ['code_name', 'code_idx', 'attention_entropy', 'traceback_attention_entropy', 'label', 'score', 'depth',
                   'num_report_sentences', 'num_report_clusters', 'patient_id', 'timepoint_id',
#                   'reference_sentence_indices', 'reference_sentence_rankings', 'reference_sentence_attention']
                   'reference_sentence_indices', 'reference_sentence_rankings', 'sentence_attention']
        rows = []
        for b in range(len(batch)):
            patient_id = int(batch.instances[b]['original_reports'].patient_id.iloc[0])
            last_report_id = batch.instances[b]['original_reports'].index[-1]
            tokenized_sentences = batch.instances[b]['tokenized_sentences']
            if supervised:
                annotations = eval(batch.instances[b]['annotations'])
            for s in range(outputs['num_codes'][b]):
                code = outputs['codes'][b, s].item()
                codename = self.code_names[code]
                attn_ent = attention_entropy[b, s].item()
                traceback_attn_ent = traceback_attention_entropy[b, s].item()
                label = outputs['labels'][b, s].item()
                score = outputs['scores'][b, s].item() if 'scores' in outputs.keys() else None
                depth = self.hierarchy.depth(codename)
                num_report_sentences = (outputs['article_sentences_lengths'][b] > 0).sum().item()
                if supervised:
                    num_report_clusters = len(outputs['clustering'][b][s])
                    sentences = [' '.join(tokenized_sentences[cluster[0]]) for cluster in outputs['clustering'][b][s]]
                    summary = '\n'.join(sentences[:self.k])
                    id = len(os.listdir(self.system_dir))
                    with open(os.path.join(self.system_dir, 'summary_%i_system.txt' % id), 'w') as f:
                        f.write(summary)
                    reference_sentence_indices_set = set([])
                    for annotator,v in annotations.items():
                        reference_sentence_indices = [int(i) for i in v['past-reports']['tag_sentences'][codename]]
                        reference_sentence_indices_set.update(reference_sentence_indices)
                        reference = '\n'.join([' '.join(tokenized_sentences[i]) for i in reference_sentence_indices])
                        with open(os.path.join(self.reference_dir, 'summary_%i_%s.txt' % (id, annotator)), 'w') as f:
                            f.write(reference)
                    sentence_to_ranking = {sentence_idx:i for i in range(len(sentences)) for sentence_idx in outputs['clustering'][b][s][i]}
                    reference_sentence_indices = sorted(list(reference_sentence_indices_set))
                    reference_sentence_rankings = [sentence_to_ranking[i] for i in sorted(list(reference_sentence_indices_set))]
#                    reference_sentence_attention = [outputs['attention'][b, s, i].sum().item() for i in sorted(list(reference_sentence_indices_set))]
                    sentence_attention = [outputs['attention'][b, s, outputs['clustering'][b][s][i][0]].sum().item() for i in range(len(sentences))]
                else:
                    num_report_clusters = None
                    reference_sentence_indices = None
                    reference_sentence_rankings = None
#                    reference_sentence_attention = None
                    sentence_attention = None
                # NOTE: cannot include summaries here because this file might be emailed and summaries contain phi!
                rows.append([
                    codename,
                    code,
                    attn_ent,
                    traceback_attn_ent,
                    label,
                    score,
                    depth,
                    num_report_sentences,
                    num_report_clusters,
                    patient_id,
                    last_report_id,
                    reference_sentence_indices,
                    reference_sentence_rankings,
#                    reference_sentence_attention,
                    sentence_attention,
                ])
        df = pd.DataFrame(rows, columns=columns)
        file = os.path.join(self.dir, 'summary_stats.csv')
        header = not os.path.exists(file)
        df.to_csv(file, mode='a', header=header)
Example #14
 def test_entropy(self):
     tutils.raises(ValueError, utils.entropy, "foo", 64, 0)
     assert utils.entropy("a"*64, 64, 1) == 0
     d = "".join([chr(i) for i in range(256)])
     assert utils.entropy(d, 64, 1) == 1
Example #15
import numpy as np

from collections import defaultdict
from scipy.special import logsumexp
from utils import k_nearest_interpolation, entropy

if __name__ == "__main__":

    labels = ['a', 'b', 'c']
    dists = [1, 2, 3]
    probs = k_nearest_interpolation(dists, labels)
    print('Probabilities: %r' % probs)
    print('Entropy: %.2f' % entropy(probs))

    # Note that you could have many nearest neighbours that have the same label
    labels = ['a', 'a', 'b', 'c']
    dists = [1, 2, 2, 3]
    probs = k_nearest_interpolation(dists, labels)
    print('Probabilities: %r' % probs)
    print('Entropy: %.2f' % entropy(probs))
Example #16
File: main.py  Project: Rolinh/tweetmining
def main(classification=True,
         tournament=False,
         x_validation=False,
         devset=False,
         randomize=False,
         verbose=False,
         plot_roc=False,
         multiprocessing=False):
    """main function"""

    if devset:
        print("Using development dataset")
        set_file = '../data/devset.json'
        wocc_file = '../data/devset_words_occurrence.txt'
    else:
        print("Using full dataset")
        set_file = '../data/dataset.json'
        wocc_file = '../data/dataset_words_occurrence.txt'

    print("Collecting data...")
    dataset = u.json_to_tweets(set_file, False)

    if verbose:
        f = "%Y-%m-%d"
        oldest_tweet, newest_tweet = u.tweets_date_range(dataset)
        print("Oldest tweet was posted on %s") % (oldest_tweet).strftime(f)
        print("Newest tweet was posted on %s") % (newest_tweet).strftime(f)
        print("Date range is %d day(s)") % (newest_tweet - oldest_tweet).days

    print("Loading words occurrencies...")
    words_occ = u.words_occ_to_dict(wocc_file)
    print("Computing term frequency...")
    words_tf = u.words_occ_to_tf(words_occ)
    print("Computing term frequency - inverse document frequency...")
    words_tf_idf = u.words_occ_to_tfidf(words_occ)

    if randomize:
        print("Randomizing dataset...")
        random.shuffle(dataset)

    # list of objects containing the feature classes
    feat_objs    = [
                    #fake_feature.FakeFeature(),
                    is_a_retweet_feature.IsARetweetFeature(),
                    is_a_reply_feature.IsAReplyFeature(),
                    followers_count_feature.FollowersCountFeature(),
                    #tweet_age_feature.TweetAgeFeature(),
                    tweet_length_feature.TweetLengthFeature(),
                    statuses_count_feature.StatusesCountFeature(),
                    hashtag_count_feature.HashtagCountFeature(),
                    user_mentions_count_feature.UserMentionsCountFeature(),
                    favorite_count_feature.FavoriteCountFeature(),
                    has_url_feature.HasUrlFeature(),
                    friends_count_feature.FriendsCountFeature(),
                    #verified_account_feature.VerifiedAccountFeature(),
                    #tf_feature.Tf(data=words_tf),
                    #tf_idf_feature.TfIdf(data=words_tf_idf)
                    ]

    # list of objects containing the classifier classes
    classif_objs = [
                    #nb.NaiveBayes(plot_roc),
                    nbs.NaiveBayesScikit(plot_roc),
                    svm_rbf.SVMRBF(plot_roc),
                    svm_sigmoid.SVMSigmoid(plot_roc),
                    #svm_poly.SVMPoly(plot_roc),
                    #svm_linear.SVMLinear(plot_roc),
                    #me.MaxEnt(plot_roc),
                    mes.MaxEntScikit(plot_roc),
                    dts.DecisionTreeScikit(plot_roc),
                    #dt.DecisionTree(plot_roc),
                    mv.MajorityVote(plot_roc),
                    lda.LDA(plot_roc)
                    ]
    if verbose:
        print("\nFeatures activated:")
        for feat in feat_objs:
            print("- %s") % (str(feat))
        print("\nClassifiers used:")
        for cl in classif_objs:
            print("- %s") % (str(cl))
        print("")

    # extract features and build a list of instances
    instances, labels = extract_instances(dataset, feat_objs)

    # TODO : make the feature selection optional
    fs.FeaturesSelection.chi2(instances, labels)

    print("\nEntropy of the labels:")
    print(u.entropy(labels))
    print("")

    if classification:
        size         = int(math.floor(len(instances)*0.25))
        train_data   = instances[0:-size+1]
        test_data    = instances[-size+1:]
        train_labels = labels[0:-size+1]
        test_labels  = labels[-size+1:]

        if multiprocessing:
            c_thread = Process(target=classification_routine,
                               args=(train_data, test_data, train_labels,
                                     test_labels, classif_objs))
            print("Starting classification thread...")
            c_thread.start()
        else:
            classification_routine(train_data, test_data, train_labels,
                                   test_labels, classif_objs)

    if x_validation:
        if multiprocessing:
            xv_thread = Process(target=cross_validation,
                                args=(instances, labels, classif_objs))
            print("Starting cross-validation thread...")
            xv_thread.start()
        else:
            ave,_,_ = cross_validation(instances, labels, classif_objs)
            print('average accuracy :')
            print(ave)

    if tournament:
        if multiprocessing:
            t_thread = Process(target=algorithm_tournament,
                               args=(instances, labels, classif_objs))
            print("Starting tournament thread...")
            t_thread.start()
        else:
            algorithm_tournament(instances, labels, classif_objs)

    if multiprocessing:
        if classification:
            c_thread.join()
        if x_validation:
            xv_thread.join()
        if tournament:
            t_thread.join()
Example #17
 def unigram_entropy(self):
     """return entropy of this unigram distribution in bits"""
     return entropy(self.relcounts[1])
Example #18
    optimizer.zero_grad()  # clear gradient buffers

    # compute loss
    target_values = fitness_list.mean(1).view(-1, 1)
    critic_loss = value_loss(values, target_values)
    logp = models.log_probs(samples, policy)

    loss = (logp * critic_loss).mean()

    loss.backward()  # update gradient buffers
    optimizer.step()  # update model's parameters

    # --------------------- logger --------------------- #
    with torch.no_grad():
        h = utils.entropy(policy)
    dl.push(fitness_list=fitness_list.cpu().numpy())
    dl.push(other={'iteration': it,
                   'entropy': h.item(),
                   'loss': loss.item(),
                   })
    print(it+1, '/', ITERS, end=' ')
    dl.print()

if LOG_FILE is not None:
    dl.to_csv(LOG_FILE, ITERS)
# dl.plot()