def mutual_information(x, y, nbins=20):
    """Return the mutual information of ``x`` and ``y`` and its normalized form.

    The marginal entropies are obtained from ``entropy(v, nbins)`` and the
    joint entropy from ``joint_entropy(x, y, nbins)``.

    Returns:
        A ``(MI, NMI)`` tuple with ``MI = H(x) + H(y) - H(x, y)`` and
        ``NMI = 2 * MI / (H(x) + H(y))``.
    """
    h_x = entropy(x, nbins)
    h_y = entropy(y, nbins)
    h_joint = joint_entropy(x, y, nbins)
    marginal_sum = h_x + h_y
    mi = marginal_sum - h_joint
    # NOTE(review): marginal_sum == 0 is not guarded against here.
    return mi, 2 * mi / marginal_sum
def information_gain(array_source, array_children_list, criterion='gini'):
    """Compute the information gain of splitting a parent into children.

    The gain is the impurity of ``array_source`` minus the sum of the
    children's impurities, each weighted by its share of the parent's size.

    Args:
        array_source: parent samples; must be a ``np.ndarray``.
        array_children_list: the children produced by the split, as an
            ``np.ndarray`` (one child per row), a list or a tuple.
        criterion: impurity measure, ``'gini'`` or ``'entropy'``.

    Returns:
        The information gain, or ``None`` (after printing a message) when
        the argument types or the criterion are not accepted.
    """
    if not (isinstance(array_source, np.ndarray)
            and isinstance(array_children_list, (np.ndarray, list, tuple))):
        print("info_gain: error in type of array")
        return None

    if criterion == "gini":
        impurity = gini
    elif criterion == "entropy":
        impurity = entropy
    else:
        print("info_gain: error in children list or criterion type")
        return None

    total = float(len(array_source))
    weighted = 0.0
    # Weight every child by its own size; the previous code measured the
    # whole children container on every iteration and never used the index.
    for child in array_children_list:
        weighted += (len(child) / total) * impurity(child)
    return impurity(array_source) - weighted
def information_gain(array_source, array_children_list, criterion='gini'):
    """Return the entropy-based information gain of a split.

    Computes ``entropy(parent) - sum(|child| / |parent| * entropy(child))``
    over every child in ``array_children_list``.

    Args:
        array_source: parent samples; its ``.size`` is used as the total.
        array_children_list: sequence of child arrays (each with ``.size``).
        criterion: accepted for signature compatibility; this implementation
            always uses ``entropy``.

    Returns:
        The gain as a Python ``float``.
    """
    parent_entropy = entropy(array_source)
    total = float(array_source.size)
    # Sum over all children; the earlier version handled exactly two
    # (IndexError for one child, silently ignoring a third and beyond).
    weighted = sum((child.size / total) * entropy(child)
                   for child in array_children_list)
    return float(parent_entropy - weighted)
def plot_waverec(curve, entonema, wav, wavelet, directory):
    """Plot a curve and its per-level wavelet reconstructions, then save a PNG.

    The first subplot shows ``curve``; each following subplot shows the
    signal rebuilt from one ``(cA, cD)`` pair yielded by
    ``get_cD(curve, wavelet, level=4)``. Every x-label reports the signal
    length, its entropy and log10 of its convolution with ``curve``.

    Args:
        curve: the original signal.
        entonema: label used in the output file name.
        wav: source file name; a ``.wav`` suffix is stripped for the output.
        wavelet: wavelet used both to decompose and to reconstruct.
        directory: directory the PNG is written to.
    """
    levels = 4
    coeff = get_cD(curve, wavelet, level=levels)
    fig = plt.figure()

    def _draw(position, signal, ylabel, **plot_kwargs):
        # add_subplot makes the new axes current, so plt.xticks/yticks
        # below act on it.
        ax = fig.add_subplot(levels + 2, 1, position)
        ax.plot(signal, **plot_kwargs)
        ax.grid()
        low, high = min(signal), max(signal)
        plt.yticks([low, (low + high) / 2, high], size=6)
        plt.xticks(np.arange(0, len(signal), step=len(signal) / 10), size=6)
        cv = convolve(curve, signal, mode='valid')[0]
        ax.set_xlabel(
            'longitud = {} entropía = {} convolución = 10**{}'.format(
                len(signal), entropy(signal), round(np.log10(cv), 2)),
            size=8)
        ax.set_ylabel(ylabel, size=8)

    _draw(1, curve, "Curva original")
    for i, (cA, cD) in enumerate(coeff):
        # Reconstruct with the same wavelet that produced the coefficients
        # (this was hard-coded to 'db5' regardless of `wavelet`).
        rec = pywt.waverec((cA, cD), wavelet)
        _draw(i + 2, rec, "N{}".format(i), alpha=0.6)

    fig.subplots_adjust(hspace=1.2, wspace=0.2)
    # NOTE(review): the figure is left open after saving; callers plotting
    # many files may want to close it.
    plt.savefig('{}/{}_{}_{}.png'.format(directory, entonema,
                                         wav.replace('.wav', ''), wavelet))
def plot_spline_interpolation(dataset, entonema, wav):
    """Plot a pitch curve against its spline interpolation and save a PNG.

    Non-``.wav`` file names are ignored (the function returns ``None``
    without doing anything). The image is written to
    ``./temp/<entonema>/<entonema>_<wav without .wav>.png``.

    Args:
        dataset: root directory of the data set.
        entonema: sub-directory / label of the sample.
        wav: file name of the sample.
    """
    if not wav.endswith('.wav'):
        return
    curve_path = '{}/{}/{}'.format(dataset, entonema, wav)
    curve = get_pitch_decompy_values(curve_path, remove_silencess=True,
                                     interpolate=False)
    ynew = spline_interpolation(curve)
    x = np.arange(0, len(curve))

    plt.figure()
    plt.scatter(x, curve, s=np.pi,
                label='Original_pitch entropy = {}'.format(entropy(curve)))
    plt.plot(x, ynew,
             label='Spline_interpolation entropy = {}'.format(entropy(ynew)),
             alpha=0.4)
    # A second legend() call replaces the first, so only the effective
    # one is kept.
    plt.legend(loc='upper center', fancybox=True, shadow=True,
               bbox_to_anchor=(0.5, 1.13))
    plt.grid()

    out_dir = './temp/{}'.format(entonema)
    if not os.path.exists(out_dir):
        # makedirs also creates './temp' when it does not exist yet.
        os.makedirs(out_dir)
    plt.savefig('{}/{}_{}.png'.format(out_dir, entonema,
                                      wav.replace('.wav', '')))
def plot_before_after_transformation(dataset, entonema, wav):
    """Plot a pitch curve before and after preprocessing and save a PNG.

    The upper panel shows the pitch values extracted with
    ``remove_silencess=False, interpolate=False``; the lower panel shows
    them with both options enabled. Non-``.wav`` names are ignored. The
    image is written to ``./temp/<entonema>/<entonema>_<wav without .wav>.png``.

    Args:
        dataset: root directory of the data set.
        entonema: sub-directory / label of the sample.
        wav: file name of the sample.
    """
    if not wav.endswith('.wav'):
        return
    path = '{}/{}/{}'.format(dataset, entonema, wav)
    ybefore = get_pitch_decompy_values(path, remove_silencess=False,
                                       interpolate=False)
    ynew = get_pitch_decompy_values(path, remove_silencess=True,
                                    interpolate=True)

    fig, axarr = plt.subplots(2)
    panels = ((axarr[0], ybefore, 'No trans'), (axarr[1], ynew, 'With trans'))
    for axis, values, ylabel in panels:
        axis.grid()
        axis.plot(values)
        axis.set_ylabel(ylabel)
        axis.set_xlabel('len = {} entropy = {}'.format(
            len(values), entropy(values)))
    fig.subplots_adjust(hspace=0.5)

    out_dir = './temp/{}'.format(entonema)
    if not os.path.exists(out_dir):
        # makedirs also creates './temp' when it does not exist yet.
        os.makedirs(out_dir)
    plt.savefig('{}/{}_{}.png'.format(out_dir, entonema,
                                      wav.replace('.wav', '')))
def main():
    """Round-trip ``test.txt`` through encode/decode and report statistics.

    Encodes ``test.txt``, decodes ``encoded_text.txt`` into
    ``decoded_text.txt`` and compares the result with the input. On a
    mismatch it prints a message and stops; otherwise it prints both file
    lengths, the compression ratio and calls ``entropy.entropy`` on both
    files.
    """
    infilename = 'test.txt'
    outfilename = 'decoded_text.txt'
    encoded_filename = 'encoded_text.txt'

    encode(infilename)
    decode(encoded_filename, outfilename)
    if not filecmp.cmp(infilename, outfilename):
        print("Go work!")
        return
    print("Good job! Go have a rest.")

    # Context managers close the files even if reading fails.
    with open(encoded_filename, 'rb') as encoded_file:
        encoded_file_length = len(encoded_file.read())
    with open(infilename, 'rb') as source_file:
        file_length = len(source_file.read())

    print("Dlugosc kodowanego pliku: ", file_length)
    print("Dlugosc uzyskanego kodu: ", encoded_file_length)
    compression_deg = os.stat(infilename).st_size / os.stat(
        encoded_filename).st_size
    print("Stopien kompresji: ", compression_deg)
    print("\nEntropia pliku kodowanego:")
    entropy.entropy(infilename)
    print("\nEntropia uzyskanego kodu:")
    entropy.entropy(encoded_filename)
def MDLPC_criterion(self, data, feature, cut_point):
    '''
    Determines whether a partition is accepted according to the MDLPC criterion

    The cut is accepted when its information gain exceeds
    ``(log2(N - 1) + delta) / N`` with
    ``delta = log2(3**k - 2) - (k*Ent(S) - k1*Ent(S1) - k2*Ent(S2))``.

    :param data: dataframe holding the feature column and the class column
    :param feature: feature of interest
    :param cut_point: proposed cut_point
    :return: True/False, whether to accept the partition
    '''
    # Work on a copy and split it at the cut point.
    data_partition = data.copy(deep=True)
    data_left = data_partition[data_partition[feature] <= cut_point]
    data_right = data_partition[data_partition[feature] > cut_point]

    N = len(data_partition)  # number of examples in current partition
    if N < 2:
        # Nothing to split; also avoids log(0) and division by zero below.
        return False

    # Information gain obtained when splitting data at cut_point.
    cut_point_gain = cut_point_information_gain(dataset=data_partition,
                                                cut_point=cut_point,
                                                feature_label=feature,
                                                class_label=self._class_name)

    # Delta term of the MDLPC criterion.
    partition_entropy = entropy(data_partition[self._class_name])
    k = len(data_partition[self._class_name].unique())
    k_left = len(data_left[self._class_name].unique())
    k_right = len(data_right[self._class_name].unique())
    entropy_left = entropy(data_left[self._class_name])
    entropy_right = entropy(data_right[self._class_name])
    # The criterion uses log2(3**k - 2); the "- 2" was missing.
    delta = (log(3 ** k - 2, 2) - (k * partition_entropy)
             + (k_left * entropy_left) + (k_right * entropy_right))

    # To split or not to split.
    gain_threshold = (log(N - 1, 2) + delta) / N
    return bool(cut_point_gain > gain_threshold)
def test_args_out_of_range(self):
    """
    Edge test: entropy must raise ValueError when the input
    probabilities are < 0 or > 1.
    """
    invalid_probabilities = [-1, 2]
    self.assertRaises(ValueError, entropy, invalid_probabilities)
def test_args_dont_sum_to_1(self):
    """
    Edge test: entropy must raise ValueError when the input
    probabilities do not sum to one.
    """
    not_normalized = [.9, .9]
    self.assertRaises(ValueError, entropy, not_normalized)
def main():
    """Encode ``test.txt``, decode it again and report compression statistics.

    The coding scheme is taken from the first command-line argument and
    defaults to ``'omega'``. The last byte of ``encoded_text.txt`` holds the
    number of padding zero bits appended to the bit stream. On a round-trip
    mismatch a message is printed and the function stops before the
    statistics.
    """
    args = list(sys.argv)
    if len(args) == 1:
        args.append('omega')
    infilename = 'test.txt'
    outfilename = 'decoded_text.txt'

    code = encode(infilename)
    encoding_error = choose_encoding(args[1], code)
    if encoding_error:
        return

    # Build the initial dictionary for the decoder: every distinct byte of
    # the message gets the next index, in order of first appearance.
    with open(infilename, 'rb') as message_file:
        message = message_file.read()
    file_length = len(message)
    dictio = {}
    seen = set()  # O(1) membership instead of scanning dictio.values()
    for c in message:
        if c not in seen:
            seen.add(c)
            dictio[len(dictio) + 1] = [str(c)]

    # Read the encoded bytes and expand all but the last into a 0/1 string.
    with open('encoded_text.txt', 'rb') as encoding_file:
        content = encoding_file.read()
    encoded_file_length = len(content)
    bits = ''.join(format(byte, '08b') for byte in content[:-1])

    # Drop the padding zeros announced by the final byte.
    redundant_zeros_num = content[-1]
    if redundant_zeros_num > 0:
        bits = bits[:-redundant_zeros_num]

    choose_decoding(args[1], bits, dictio, outfilename)
    if not filecmp.cmp(infilename, outfilename):
        print("Go work!")
        return
    print("Good job! Go have a rest.")

    print("Dlugosc kodowanego pliku: ", file_length)
    print("Dlugosc uzyskanego kodu: ", encoded_file_length)
    compression_deg = os.stat(infilename).st_size / os.stat(
        'encoded_text.txt').st_size
    print("Stopien kompresji: ", compression_deg)
    print("\nEntropia pliku kodowanego:")
    entropy.entropy(infilename)
    print("\nEntropia uzyskanego kodu:")
    entropy.entropy('encoded_text.txt')
def mi(lines, vocab):
    """Return H(w1) + H(w2) - H(w1, w2) over the weighted pairs in ``lines``.

    Pairs come from ``mi_contributions(lines)`` as ``(w1, w2, weight)``.
    When ``vocab`` is non-empty, only pairs whose two words are both in
    ``vocab`` are counted.
    """
    left_counts = Counter()
    right_counts = Counter()
    pair_counts = Counter()
    for w1, w2, weight in mi_contributions(lines):
        if vocab and not (w1 in vocab and w2 in vocab):
            continue
        left_counts[w1] += weight
        right_counts[w2] += weight
        pair_counts[w1, w2] += weight

    h_left = entropy.entropy(left_counts.values())
    h_right = entropy.entropy(right_counts.values())
    h_joint = entropy.entropy(pair_counts.values())
    return h_left + h_right - h_joint
def test():
    """Brute-force search for (a, b, c) triples that beat DIFFICULTY.

    Walks a, b and c in random order and prints every triple for which
    ``entropy(mapper(a, b, c))`` exceeds ``DIFFICULTY``, together with the
    mapped value and its entropy.
    """
    # NOTE(review): only `randrange` is imported here but `random()` is what
    # gets called below - presumably `random` is imported at module level;
    # confirm.
    from random import randrange
    for a in sorted(range(123), key=lambda u: random()):
        for b in sorted(range(123), key=lambda u: random()):
            for c in sorted(range(1, 123), key=lambda u: random()):
                if entropy(mapper(a, b, c)) > DIFFICULTY:
                    print(a, b, c)
                    print(mapper(a, b, c))
                    print(entropy(mapper(a, b, c)))
                    # Leaves only the innermost (c) loop; the search goes on
                    # with the next b.
                    break
        # NOTE(review): assumed to be a per-`a` progress print; the original
        # layout was lost, so the nesting level should be confirmed.
        print(a)
def main():
    """Compute the requested digests/metrics for the input file and emit them.

    Reads the whole of ``args.file``, then collects ``(name, value)`` pairs
    for every metric whose flag (or ``--all``) is set, in the fixed order
    md5, sha1, sha256, sha512, entropy, magic, and hands them to ``output``.

    Returns:
        0 on completion.
    """
    args = process_args()
    data = args.file.read()
    args.file.close()

    results = []
    for algorithm in ('md5', 'sha1', 'sha256', 'sha512'):
        if getattr(args, algorithm) or args.all:
            results.append((algorithm, hashes.hash(algorithm, data)))
    if args.entropy or args.all:
        results.append(('entropy', entropy.entropy(data)))
    if args.magic or args.all:
        results.append(('magic', filemagic.filemagic(data)))

    output(args.output, results)
    return 0
def runcontingent(path):
    """Run the contingency test and entropy for every qualifying domain.

    Intervals read from ``path`` are grouped by domain and by transcript.
    A domain is evaluated only if it has at least two intervals, its
    intervals' ``mafs`` strings contain at least three commas in total, it
    is not ``"."``, and the union of its intervals with all intervals of
    the same transcripts has at least three members.

    Returns:
        Eight parallel lists: domains, p-values, entropies, tables, ratios,
        interval counts, domain-interval counts and comma-joined gene names.
    """
    from entropy import entropy
    import toolshed as ts

    by_transcript = defaultdict(list)
    by_domain = defaultdict(list)
    for row in ts.reader(path):
        iv = Interval(**row)
        by_domain[iv.domain].append(iv)
        by_transcript[iv.transcript].append(iv)

    domains, pvals, ents, tbls = [], [], [], []
    ratios, interval_counts, domain_counts, gene_names = [], [], [], []

    for domain, ivs in by_domain.items():
        if len(ivs) < 2:
            continue
        if sum(iv.mafs.count(',') for iv in ivs) < 3:
            continue
        if domain == ".":
            continue

        # Domain intervals plus everything on the same transcripts.
        pooled = ivs[:]
        for iv in ivs:
            pooled.extend(by_transcript[iv.transcript])
        intervals = set(pooled)
        if len(intervals) < 3:
            continue

        pval, ratio, tbl, gene = contingent(intervals, domain,
                                            nodoms_only=False)
        ent = entropy(intervals)

        domains.append(domain)
        pvals.append(pval)
        ents.append(ent)
        tbls.append(tbl)
        ratios.append(ratio)
        interval_counts.append(len(intervals))
        domain_counts.append(len(ivs))

        genes = set()
        for name in gene:
            genes.add(name)
        gene_names.append(",".join(genes))

    return (domains, pvals, ents, tbls, ratios, interval_counts,
            domain_counts, gene_names)
def main():
    """Report the selected hashes, entropy and file magic of the input file.

    The file handle in ``args.file`` is read completely and closed. Each
    metric is included when its own flag or ``args.all`` is set; results
    are passed to ``output`` as ``(name, value)`` pairs in a fixed order.

    Returns:
        0 on completion.
    """
    args = process_args()
    data = args.file.read()
    args.file.close()

    everything = args.all
    results = []

    def _add(selected, name, compute):
        # `selected or everything` mirrors the per-metric flag check.
        if selected or everything:
            results.append((name, compute()))

    _add(args.md5, 'md5', lambda: hashes.hash('md5', data))
    _add(args.sha1, 'sha1', lambda: hashes.hash('sha1', data))
    _add(args.sha256, 'sha256', lambda: hashes.hash('sha256', data))
    _add(args.sha512, 'sha512', lambda: hashes.hash('sha512', data))
    _add(args.entropy, 'entropy', lambda: entropy.entropy(data))
    _add(args.magic, 'magic', lambda: filemagic.filemagic(data))

    output(args.output, results)
    return 0
def test_four_equal_likelihood_states(self):
    """
    One-shot test: four equally likely states should give 2 bits.
    """
    uniform = [0.25] * 4
    bits = entropy(uniform)
    assert np.isclose(bits, 2.)
def sess_001(self):
    """Report session cookies whose sampled values show low entropy.

    Collects cookie sets via ``self.__get_cookies(30)``, groups the values
    by cookie name, prints each name's entropy through ``self.printer`` and
    creates a ``sess_001`` report for every cookie scoring below 20.

    Returns:
        The list of created reports.
    """
    cvalues = defaultdict(list)
    for cookie_jar in self.__get_cookies(30):
        for cname, cval in cookie_jar.items():
            assert (cname == cval['name'])
            cvalues[cname].append(cval['value'])

    reports = []
    for cname, cvals in cvalues.items():
        ent = entropy(cvals)
        self.printer.aprint(str(ent))
        if not ent < 20:
            continue
        description = "Low entropy session cookie: {}".format(cname)
        detail = "Entropy heuristic only detects {} bits of entropy".format(
            ent)
        reports.append(
            create_report("sess_001",
                          basic_description=description,
                          confidence=1.0,
                          severity="medium",
                          owasp_association="2",
                          cwe=565,
                          misc=[detail]))
    return reports
def plot_descomposition(curve, entonema, wav, wavelet, directory):
    """Plot the cA/cD coefficients of successive DWT levels and save a PNG.

    For each of ``max_level`` levels the approximation (left column) and
    detail (right column) coefficients are drawn, then the approximation is
    decomposed again for the next row.

    Args:
        curve: signal to decompose.
        entonema: label used in the output file name.
        wav: source file name; a ``.wav`` suffix is stripped for the output.
        wavelet: wavelet name, or an object exposing ``.name``.
        directory: directory the PNG is written to.
    """
    wavelet_name = wavelet if isinstance(wavelet, str) else wavelet.name
    cA, cD = pywt.dwt(curve, wavelet, mode='symmetric', axis=-1)
    fig = plt.figure()

    def _panel(position, coeffs, **plot_kwargs):
        # add_subplot makes the new axes current, so plt.xticks/yticks
        # below act on it.
        ax = fig.add_subplot(max_level, 2, position)
        ax.grid()
        ax.plot(coeffs, **plot_kwargs)
        low, high = min(coeffs), max(coeffs)
        plt.yticks([low, (low + high) / 2, high], size=6, rotation=20)
        plt.xticks(np.arange(0, len(coeffs), step=len(coeffs) / 4), size=6)
        ax.set_xlabel('len = {} entropy = {}'.format(len(coeffs),
                                                     entropy(coeffs)),
                      size=8)
        return ax

    for i in np.arange(0, max_level):
        ax0 = _panel(2 * i + 1, cA, alpha=0.5, label='cA')
        ax0.set_ylabel("L{}".format(i), size=8)
        ax1 = _panel(2 * i + 2, cD, label='cD')
        if i == 0:
            ax0.set_title('cA', size=12)
            ax1.set_title('cD', size=12)
        # Next level: decompose the current approximation.
        cA, cD = pywt.dwt(cA, wavelet, mode='symmetric', axis=-1)

    fig.subplots_adjust(hspace=1.2, wspace=0.2)
    plt.savefig('{}/{}_{}_{}.png'.format(directory, entonema,
                                         wav.replace('.wav', ''),
                                         wavelet_name))
def getmalwaresignature(input_malware):
    """Run the signature and entropy analysis on a file or a directory.

    Args:
        input_malware: path to a single sample, or to a directory whose
            entries are each analysed.

    Returns:
        For a directory: the path of the last entry processed, or ``None``
        if the directory is empty. For a single file: ``None``.
    """
    if os.path.isdir(input_malware):
        # Stays None for an empty directory (previously an
        # UnboundLocalError at the return).
        malware_file = None
        for malware in os.listdir(input_malware):
            malware_file = os.path.join(input_malware, malware)
            _malwaresignature(malware_file, malware)
            entropy(malware_file)
        return malware_file

    malware_file = input_malware
    malware = os.path.basename(malware_file)
    # The six signature fields are unpacked but not used further here.
    ISO8601, hashmethod, arch, importeddlls, imphash, fuzzyhash = \
        _malwaresignature(malware_file, malware)
    entropy(input_malware)
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature whose split yields the largest information gain.

    Each row of ``dataSet`` holds the feature values followed by one
    trailing column that is not treated as a feature. For every feature the
    gain is ``entropy(dataSet)`` minus the size-weighted entropies of the
    subsets returned by ``splitDataSet`` for each distinct value.

    Returns:
        ``(bestInfoGain, bestFeature)``; ``(0.0, -1)`` when no feature has a
        positive gain.
    """
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = entropy(dataSet)
    total = float(len(dataSet))
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        infoGain = baseEntropy
        column = [sample[i] for sample in dataSet]
        for k in set(column):
            subDataSet = splitDataSet(dataSet, i, k)
            # True division: float(len(a) / len(b)) floors first under
            # Python 2, turning every weight into 0 or 1.
            xProb = len(subDataSet) / total
            xEntropy = entropy(subDataSet)
            infoGain -= xProb * xEntropy
            print("%s %s %s %s" % (i, k, xProb, xEntropy))
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestInfoGain, bestFeature
def entropy2(p):
    """Return the entropy of the two-outcome distribution (p[i], 1 - p[i]) for each i.

    Returns:
        A float array with one entry per element of ``p``.
    """
    h = np.zeros(len(p))
    for i in range(h.size):
        two_state = np.hstack((p[i], 1 - p[i]))
        h[i] = entropy(two_state)
    return h
def info(self, dataset, entonema, wav, curve=[]):
    """Return entropy and convolution features for one pitch curve.

    When ``curve`` is empty it is loaded from ``dataset/entonema/wav`` with
    the instance's ``silences`` and ``interpolate`` settings. The curve is
    decomposed with ``get_cD(curve, 'db5', level=4)``.

    Returns:
        A flat list: the curve's entropy, then the entropy of every cA,
        then of every cD, then the first 'valid' convolution value of the
        curve with each reconstruction; all rounded to 2 digits.
    """
    wav_path = '{}/{}/{}'.format(dataset, entonema, wav)
    if len(curve) == 0:
        curve = get_pitch_decompy_values(wav_path,
                                         remove_silencess=self.silences,
                                         interpolate=self.interpolate)

    features = [round(entropy(curve), ndigits=2)]
    approx_entropies, detail_entropies, convolutions = [], [], []
    for cA, cD in get_cD(curve, 'db5', level=4):
        approx_entropies.append(round(entropy(cA), ndigits=2))
        detail_entropies.append(round(entropy(cD), ndigits=2))
        rec = pywt.waverec((cA, cD), 'db5')
        first_value = convolve(curve, rec, mode='valid')[0]
        convolutions.append(round(first_value, ndigits=2))
    return features + approx_entropies + detail_entropies + convolutions
def chooseBestFeatureToSplit(dataSet):
    """Return the best information gain and the feature index achieving it.

    All columns of ``dataSet`` except the last are candidate features. A
    feature's gain is the base entropy minus, for each distinct value of
    that feature, the subset's share of the rows times the subset's entropy.

    Returns:
        ``(bestInfoGain, bestFeature)``; ``(0.0, -1)`` when no feature has a
        positive gain.
    """
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = entropy(dataSet)
    sampleCount = float(len(dataSet))
    bestInfoGain, bestFeature = 0.0, -1
    for i in range(numFeatures):
        values = [sample[i] for sample in dataSet]
        InfoGain = baseEntropy
        for k in set(values):
            subDataSet = splitDataSet(dataSet, i, k)
            # Divide as floats: len(a) / len(b) is floor division on
            # Python 2 and made every probability 0 or 1.
            xProb = len(subDataSet) / sampleCount
            xEntropy = entropy(subDataSet)
            InfoGain -= xProb * xEntropy
            print("%s %s %s %s" % (i, k, xProb, xEntropy))
        if InfoGain > bestInfoGain:
            bestInfoGain, bestFeature = InfoGain, i
    return bestInfoGain, bestFeature
def getent(K, rho):
    """Run the simulation for the given ``K``/``rho`` and return the final entropy.

    Sets ``K_B`` and ``rho_B`` on the run data, writes the configuration,
    runs the solver into ``./_output`` and evaluates ``entropy`` for
    ``rundata.clawdata.nout``.

    Returns:
        The last element of the ``entropy(...)`` result.
    """
    rundata = setrun.setrun()
    rundata.probdata.K_B = K
    rundata.probdata.rho_B = rho
    # Same output as the old print statement, but valid on Python 2 and 3.
    print("%s %s" % (K, rho))
    rundata.write()
    runclaw(outdir='./_output')
    ent = entropy(rundata.clawdata.nout)
    return ent[-1]
def get_results(Path):
    """Collect sizes, entropies and file names for the samples under ``Path``.

    Returns:
        ``(list_of_sizes, list_of_entropies, list_of_files)`` where the
        sizes and files are elements 0 and 1 of ``file_size(Path)``.
    """
    print("calculating entropy of samples...")
    list_of_entropies = entropy(Path)

    print("calculating sizes of samples...")
    # Evaluate file_size once; it used to be called once per element.
    sizes_and_files = file_size(Path)
    list_of_sizes = sizes_and_files[0]
    list_of_files = sizes_and_files[1]
    return list_of_sizes, list_of_entropies, list_of_files
def getent(K, rho):
    """Simulate with bulk parameters ``K`` and ``rho``; return the last entropy value.

    The parameters are stored as ``probdata.K_B`` / ``probdata.rho_B``, the
    run configuration is written, the solver is run with output in
    ``./_output`` and ``entropy`` is evaluated for ``clawdata.nout``.

    Returns:
        The final element of the sequence returned by ``entropy``.
    """
    rundata = setrun.setrun()
    probdata = rundata.probdata
    probdata.K_B = K
    probdata.rho_B = rho
    # Portable replacement for the Python-2-only `print K, rho`.
    print("%s %s" % (K, rho))
    rundata.write()
    runclaw(outdir='./_output')
    return entropy(rundata.clawdata.nout)[-1]
def reporter(image_file=IMAGE_FILE, neighborhood=NEIGHBORHOOD, scale=SCALE, dpi=DPI, channel=CHANNEL, plane=PLANE, scope=SCOPE):
    """Run every analysis step on one image, in a fixed order.

    Calls, in sequence: ``bp``, ``hist``, ``atc``, ``ngbd`` and ``entropy``.
    Defaults come from the module-level constants of the same names.

    Test usage:
        reporter('image.tiff')
        reporter('image.tiff', 8, 5, 1000)
        reporter('image.tiff', 8, 5, 500, 'R', 0, 5)
    """
    # NOTE(review): `scope` is accepted but not passed to any step below.
    bp(image_file, dpi, channel, plane)
    hist(image_file, scale, dpi)
    atc(image_file, dpi)
    ngbd(image_file, neighborhood, dpi)
    entropy(image_file)
def main():
    """Write top-word, translation and entropy tables for a bitext.

    Always writes ``topwords.txt`` (rank & word & count rows). With
    ``--usetarget`` the bitext direction is flipped first and the function
    stops after that file. Otherwise it also writes, under ``results/``,
    the five most common translations of each top word and the entropy of
    each word's translation labels.
    """
    args = get_argparser().parse_args()
    use_target = args.usetarget

    if not use_target:
        trainingdata.STOPWORDS = trainingdata.load_stopwords(args.bitextfn)
    triple_sentences = trainingdata.load_bitext(args.bitextfn, args.alignfn)
    if use_target:
        # Flip directionality -- we want the top words out of the target text.
        triple_sentences = [(t, s, a) for (s, t, a) in triple_sentences]

    sl_sentences = [s for (s, _t, _a) in triple_sentences]
    top_words = trainingdata.get_top_words(sl_sentences)

    with open("topwords.txt", "w") as ranking_out:
        for rank, (word, count) in enumerate(top_words, start=1):
            print("{0} & {1} & {2} \\\\".format(rank, word, count),
                  file=ranking_out)
    if use_target:
        # Bail out -- just getting target text top words.
        return

    tl_sentences = trainingdata.get_target_language_sentences(
        triple_sentences)
    tagged_sentences = [list(zip(source, target))
                        for source, target in zip(sl_sentences, tl_sentences)]
    trainingdata.set_examples(sl_sentences, tagged_sentences)
    trainingdata.set_sl_annotated(
        annotated_corpus.load_corpus(args.annotatedfn))

    stamp = util.timestamp()
    langs = args.bitextfn.split(".")[1]
    translations_fn = "results/{0}-{1}-translations".format(stamp, langs)
    entropy_fn = "results/{0}-{1}-entropy".format(stamp, langs)

    with open(translations_fn, "w") as translations_out, \
            open(entropy_fn, "w") as entropy_out:
        for word, _count in top_words:
            training = trainingdata.trainingdata_for(word, nonnull=False)
            labels = [label for (_feat, label) in training]

            shown = []
            for label, _freq in Counter(labels).most_common(5):
                shown.append(
                    "{0}".format("NULL" if label == UNTRANSLATED else label))
            print("{0} & {1}".format(word, ", ".join(shown)),
                  file=translations_out)

            bits = entropy(labels)
            print("%30s%30.2f" % (word, bits), file=entropy_out)
def test_02():
    """A normalized two-state uniform distribution has entropy log(2)."""
    probabilities = np.array([0.5, 0.5])
    assert np.isclose(np.sum(probabilities), 1.0)

    computed = entropy(probabilities)
    expected = np.log(len(probabilities))
    assert np.isclose(computed, expected)
    return None
def dataReceived(self, data):
    """Handle one client message of the form ``b"a,b,c"`` (three integers).

    Replies with a bad-format notice when the payload is not valid UTF-8,
    does not contain exactly three comma-separated fields, or a field is
    not an integer. Otherwise replies with the flag if
    ``entropy(mapper(a, b, c))`` exceeds ``DIFFICULTY``, else with "Nope.".
    """
    try:
        a, b, c = (int(field) for field in data.decode('utf-8').split(","))
    except ValueError:
        # Covers bad UTF-8 (UnicodeDecodeError is a ValueError), a wrong
        # field count and non-integer fields, without swallowing
        # KeyboardInterrupt/SystemExit as the bare `except:` did.
        self.transport.write(b"Bad format, try again.\n")
        return

    if entropy(mapper(a, b, c)) > DIFFICULTY:
        self.transport.write(b"Congrats! " + FLAG + b"\n")
    else:
        self.transport.write(b"Nope.\n")
def cond_mutual_information(x, y, z, nbins=20):
    """Return the conditional mutual information I(x; y | z) and a normalized form.

    Uses ``I(x; y | z) = H(x, z) + H(y, z) - H(x, y, z) - H(z)``, with the
    three-variable joint entropy estimated from an ``nbins``-per-axis
    histogram (in bits).

    Args:
        x, y, z: equally long 1-D samples.
        nbins: number of histogram bins per variable.

    Returns:
        ``(MIxy_z, NMIxy_z)`` with ``NMIxy_z = 2 * MIxy_z / (H(x,z) + H(y,z))``.
    """
    Hxz = joint_entropy(x, z, nbins)
    Hyz = joint_entropy(y, z, nbins)
    Hz = entropy(z, nbins)

    # Joint entropy of the three variables; honor `nbins` (the bin count
    # here was hard-coded to 20 regardless of the argument).
    count_xyz, _edges = np.histogramdd(np.array([x, y, z]).T, bins=nbins)
    p_xyz = count_xyz / len(x)
    # Keep only occupied cells so log2(0) is never evaluated.
    occupied = p_xyz[p_xyz > 0]
    Hxyz = -np.sum(occupied * np.log2(occupied))

    # Conditional mutual information.
    MIxy_z = Hxz + Hyz - Hxyz - Hz
    NMIxy_z = 2 * MIxy_z / (Hxz + Hyz)
    return MIxy_z, NMIxy_z
def set_Y_entropy(data):
    """Store the target entropy of each split under ``<split>_entropyY``.

    For ``data['err'] == 'ce'`` the entropy of the column-wise mean of Y is
    used; for ``'mse'`` the Gaussian entropy of Y's variance (Y must have
    exactly one column). Splits handled: ``trn``, ``val``, ``tst``.

    Returns:
        The same ``data`` dict, updated in place.

    Raises:
        Exception: for an unknown ``err`` value, or for ``'mse'`` with a
            target that does not have exactly one column.
    """
    for split in ('trn', 'val', 'tst'):
        targets = data['{}_Y'.format(split)]
        if data['err'] == 'ce':
            value = entropy.entropy(targets.mean(axis=0))
        elif data['err'] == 'mse':
            # only 1-d continuous output supported right now
            if targets.shape[1] != 1:
                raise Exception()
            value = entropy.gaussian_entropy_np(1, np.var(targets))
        else:
            raise Exception('Unknown error func')
        data['{}_entropyY'.format(split)] = value
    return data
def run(filename):
    """Print line, word and character counts and the word entropy of a text file.

    The text is lower-cased and the characters ``! ? . : ; , -`` are removed
    before counting. Words are the non-empty pieces obtained by splitting on
    single spaces after joining the lines with spaces.
    """
    data = get_data(filename).lower()
    data = re.sub(r'[!?.:;,-]', '', data)
    lines = data.split("\n")
    words = [word for word in " ".join(lines).split(" ") if word != '']

    frequencies = FT.frequency_table(words)
    entropy_of_text = entropy(frequencies)

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print('lines: ' + str(len(lines)))
    print('words: ' + str(len(words)))
    print('chars: ' + str(len(data)))
    print("entropy: " + str(round_off(entropy_of_text, 2)))
def computeIGCI(F, debug):
    """Return the matrix of pairwise entropy differences of the signals in ``F``.

    The signal is discretized, its entropy ``H`` computed, every row of an
    ``n x n`` matrix is filled with ``H.T[0]`` and the transpose is
    subtracted, so entry ``(i, j)`` is ``H.T[0][j] - H.T[0][i]``.

    Args:
        F: fluorescence signal passed to ``discretizeFluorescenceSignal``.
        debug: unused.
    """
    # Discretize the fluorescence signal, then compute the entropy.
    H = entropy(discretizeFluorescenceSignal(F))

    # Scores as entropy differences.
    n = len(H)
    scores = numpy.zeros(shape=(n, n))
    for row in scores:
        row[:] = H.T[0]
    return scores - scores.T
def computeIGCI(F, debug):
    """Compute IGCI scores as differences between per-signal entropies.

    After discretizing ``F`` and computing its entropy ``H``, builds an
    ``n x n`` float matrix whose rows all equal ``H.T[0]`` and returns that
    matrix minus its transpose.

    Args:
        F: fluorescence signal passed to ``discretizeFluorescenceSignal``.
        debug: unused.
    """
    discretized = discretizeFluorescenceSignal(F)
    H = entropy(discretized)

    size = len(H)
    repeated = numpy.zeros(shape=(size, size))
    for row_view in repeated:
        # Each row view is overwritten in place with the entropy vector.
        row_view[...] = H.T[0]
    differences = repeated - repeated.T
    return differences
# Micro-benchmark: average per-call time (in milliseconds) of two ways of
# scoring a string match with the `entropy` module.
import entropy
import time
import numpy as np

# Method 1: a reused EntropyStringMatching instance.
method_1 = []
stat = entropy.EntropyStringMatching()
for i in range(0, 10000):
    start = time.time()
    stat.run("Data Analyst", 'Fancy Data Analyst')
    end = time.time()
    diff = (end - start) * 1000
    method_1.append(diff)
# print(...) with one pre-formatted string behaves the same on Python 2 and 3.
print("Method 1 returns an average of %s " % np.average(method_1))

# Method 2: joint entropy divided by the entropy of the first string.
# NOTE(review): the second argument is lower-cased here but not in method 1;
# confirm the two benchmarks are meant to use different inputs.
method_2 = []
for i in range(0, 10000):
    start = time.time()
    calc = entropy.joint_entropy(
        "Data Analyst", 'fancy data analyst') / entropy.entropy("Data Analyst")
    end = time.time()
    diff = (end - start) * 1000
    method_2.append(diff)
print("Method 2 returns an average of %s " % np.average(method_2))
def test_change_entropy( self ):
    """Rows whose values change are expected to yield an entropy of 1."""
    signal = numpy.array([[1, 2], [4, 8]])
    expected = numpy.array([[1], [1]])
    result = entropy.entropy(signal)
    assert (result == expected).all()
def test_negative_entropy( self ):
    """Rows of changing negative values are expected to yield an entropy of 1."""
    signal = numpy.array([[-10, -20], [-20, -40]])
    expected = numpy.array([[1], [1]])
    result = entropy.entropy(signal)
    assert (result == expected).all()
def test_no_entropy( self ):
    """Constant rows are expected to yield an entropy of 0."""
    signal = numpy.array([[2, 2], [2, 2]])
    expected = numpy.array([[0], [0]])
    result = entropy.entropy(signal)
    assert (result == expected).all()