def intersect(self, nums1, nums2):
    # Multiset intersection: Counter & Counter keeps the minimum count per key.
    c3 = ct(nums1) & ct(nums2)
    ret = []
    for ch in c3:
        ret += [ch] * c3[ch]
    return ret
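# Hedged usage sketch for intersect: this assumes the method lives on a
# LeetCode-style Solution class and that `from collections import Counter as ct`
# is already in scope.
# >>> Solution().intersect([1, 2, 2, 1], [2, 2])
# [2, 2]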
def shortestCompletingWord(self, licensePlate, words):
    ans = None
    license = ct(i.lower() for i in licensePlate if i.isalpha())
    for i in words:
        tmp = ct(i)
        if all(k in tmp and tmp[k] >= v for k, v in license.items()):
            ans = ans if ans and len(i) >= len(ans) else i
    return ans
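# Hedged usage sketch for shortestCompletingWord (same Solution-class and
# Counter-alias assumptions as above): the plate "1s3 PSt" requires s:2, p:1, t:1,
# and "steps" is the shortest word covering those counts.
# >>> Solution().shortestCompletingWord("1s3 PSt", ["step", "steps", "stripe", "stepple"])
# 'steps'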
def isAnagram(self, s, t):
    """
    :type s: str
    :type t: str
    :rtype: bool
    """
    from collections import Counter as ct
    sct = ct(s)
    tct = ct(t)
    return sct == tct
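# Hedged usage sketch for isAnagram: two strings are anagrams exactly when
# their letter Counters are equal (assumes a LeetCode-style Solution class).
# >>> Solution().isAnagram("anagram", "nagaram")
# True
# >>> Solution().isAnagram("rat", "car")
# False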
def create_3K_lic_rpt(input_file, output_file, smart_account):
    '''Creates a CSV-formatted report of 3K licensing content from a file
    input of a CCW-R file export'''
    with open(input_file, 'r') as f:
        rl = f.readlines()
    ### Find CCW-R header row to place into a list
    header = [i for i in rl if ldos(i)]
    ###Parse CCW-R lines with any 3x50 SKUs into a list of rows
    dev_3x50 = [i for i in rl if is_3x50(i)]
    ###Parse CCW-R lines for traditional top-level SKU rows
    non_C1_dev = [i for i in dev_3x50 if non_C1_3x50(i)]
    ###Parse CCW-R lines for individual on-box SW upgrade licensing rows
    upg_lics = [i for i in dev_3x50 if lic_C1_3x50(i)]
    ###Parse C1 SKUs for 3Ks less than 12 ports b/c SW licenses appear in top-level
    non_24_48_port = [i for i in dev_3x50 if non_24_48_port_C1(i)]
    ###Concatenate all parsed lists
    parsed_ccwr_rows_list = header + non_C1_dev + upg_lics + non_24_48_port
    ###Perform count of elements in concatenated list and place in dict
    devdict = dict(ct([(i.split(','))[0] for i in parsed_ccwr_rows_list][1:]))
    ###Extract top-level SKUs and convert to list of actual licensing SKUs that appear in CSSM.
    C3x50 = [(i.split(','))[0][3:11] + '-' + (i.split(','))[0][-1]
             for i in parsed_ccwr_rows_list if i.split(',')[0].startswith('WS-C3')]
    C3x50 = C3x50 + [(i.split(','))[0][:12].replace((i.split(','))[0][:5], 'C') + '-' + (i.split(','))[0][-1]
                     for i in parsed_ccwr_rows_list if i.split(',')[0].startswith('C1-WS')]
    C3x50_E = [i.replace(i[-2:], '-S-E') for i in C3x50 if i.endswith('E')]
    C3x50_S = [i.replace(i[-2:], '-L-S') for i in C3x50 if i.endswith('S')]
    ###Extract top-level upgrade license SKUs and convert to list
    upg_lics_indiv = [i.split(',')[0] for i in upg_lics]
    ###Concatenate license lists
    total_upg_lics = C3x50_E + C3x50_S + upg_lics_indiv
    ###Perform count of elements in concatenated list and place in dict
    licdict = dict(ct(total_upg_lics))
    ###Create output file
    with open(output_file, 'w') as f:
        f.write('Top-Level Device OR License,-----,Count\n')
        for i in devdict:
            f.write(i + ',-----,' + str(devdict[i]) + '\n')
        f.write(4 * '\n')
        f.write("LICENSES to be deposited in %s\n\n" % smart_account + 'License,-----,Count\n')
        for i in licdict:
            f.write(i + ',-----,' + str(licdict[i]) + '\n')
        f.write(4 * '\n')
        f.write("Full License/Device Breakout from CCW-R\n\n")
        for i in parsed_ccwr_rows_list:
            f.write(i)
def meu_knn(dados_train, rotulo_train, dados_teste, k, normalizar):
    rotulos = []
    if normalizar:
        # apply normalization to the data
        dados_teste = normalizacao(dados_teste)
        dados_train = normalizacao(dados_train)
    for i in range(len(dados_teste)):
        distancia_teste_train = []
        for j in range(len(dados_train)):
            # compute the distance between test element i and training element j
            distancia_teste_train.append(dist(dados_teste[i], dados_train[j], len(dados_teste[i])))
        # sort the distance list and the labels together so that corresponding
        # elements keep the same indices; then take the list of labels, now sorted
        rotulos_ordenados = [y for x, y in sorted(zip(distancia_teste_train, rotulo_train))]
        # keep only the label column, which is the value of interest for the problem
        rotulos_ordenados = [row[0] for row in rotulos_ordenados]
        # take the first k labels, count the occurrences of each value,
        # and pick the most common one
        rotulos.append(ct(rotulos_ordenados[:k]).most_common(1)[0][0])
    return rotulos
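# dist and normalizacao are not defined in this snippet; below is a minimal,
# hypothetical sketch of what meu_knn appears to assume: a Euclidean distance
# over the first n coordinates and a per-column min-max normalization.
import math

def dist(a, b, n):
    # Euclidean distance over the first n coordinates (assumed signature).
    return math.sqrt(sum((a[i] - b[i]) ** 2 for i in range(n)))

def normalizacao(dados):
    # Min-max normalization per column, mapping each feature to [0, 1].
    cols = list(zip(*dados))
    mins = [min(c) for c in cols]
    maxs = [max(c) for c in cols]
    return [[(row[i] - mins[i]) / (maxs[i] - mins[i]) if maxs[i] > mins[i] else 0.0
             for i in range(len(row))]
            for row in dados]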
def removeDuplicates(data: list) -> list:
    output = data[::-1]
    count = ct(data)
    for i in [k for k, v in count.items() if v > 2]:
        for _ in range(count[i] - 2):  # number of removals needed
            output.remove(i)
    return output[::-1]
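# Hedged usage sketch for removeDuplicates: at most two occurrences of each
# value survive; removing from the reversed copy drops the *last* surplus
# occurrences in the original order.
# >>> removeDuplicates([1, 1, 1, 2, 2, 3])
# [1, 1, 2, 2, 3]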
def topKFrequent1(self, nums, k):
    """
    :type nums: List[int]
    :type k: int
    :rtype: List[int]
    """
    from collections import Counter as ct
    # The comprehension variable is renamed so it does not shadow the
    # parameter k (most_common(k) is evaluated before the loop runs anyway).
    return [num for (num, count) in ct(nums).most_common(k)]
from typing import List

def findLucky(self, arr: List[int]) -> int:
    # A "lucky" number's value equals its frequency; return the largest one.
    C = ct(arr)
    ret = -1
    for c in C:
        if c == C[c]:
            ret = max(ret, c)
    return ret
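# Hedged usage sketch for findLucky (Solution-class assumption as above):
# in [2, 2, 3, 4] only 2 occurs exactly 2 times, so it is the largest lucky number.
# >>> Solution().findLucky([2, 2, 3, 4])
# 2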
def top_publishers(publishers, top):
    # Keep only the `top` most common publishers.
    top_ten = dict(ct(publishers).most_common(top))
    print(top_ten)
    print(len(top_ten))
    return top_ten
def top_Platforms(platforms, top):
    # Keep only the `top` most common platforms.
    top_platforms = dict(ct(platforms).most_common(top))
    print(top_platforms)
    print(len(top_platforms))
    return top_platforms
def top_publishers(publishers, top):
    # Deletes publishers below the cutoff: keep only the `top` most common.
    top_ten = dict(ct(publishers).most_common(top))
    print(top_ten)
    print(len(top_ten))
    return top_ten
def customSortString(self, S, T):
    """
    :type S: str
    :type T: str
    :rtype: str
    """
    t = set(T)
    t2 = set(S)
    from collections import Counter as ct
    c = ct(T)
    s = [char * c[char] for char in S if char in t]
    add = [char * c[char] for char in t - t2]
    return "".join(s + add)
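# Hedged usage sketch for customSortString: characters of T are emitted in
# S's order, with characters absent from S appended in arbitrary set order.
# >>> Solution().customSortString("cba", "abcd")
# 'cbad'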
def f(l):
    #print l
    if len(l) == 1:
        return l[0]
    m = min(x[0] for x in l)
    two = filter(lambda x: x[0] == m, l)
    #print two, m
    if len(two) == 2:
        rest = filter(lambda x: x[0] != m, l)
        c = ct(two[0][1:] + two[1][1:])
        c -= ct([x[0] for x in rest])
        assert len(c) == 1
        return list(c.elements()) + f(sorted([x[1:] for x in rest]))
    else:
        m = max(x[-1] for x in l)
        two = filter(lambda x: x[-1] == m, l)
        rest = filter(lambda x: x[-1] != m, l)
        assert len(two) == 2
        c = ct(two[-1][:-1] + two[-2][:-1])
        c -= ct([x[-1] for x in rest])
        assert len(c) == 1
        return f(sorted([x[:-1] for x in rest])) + list(c.elements())
def select_example(self):
    sub_pred = dd(list)  # Mn predicted labels for each cluster
    idx = 0
    for k, v in self.ex_id.items():
        sub_pred[k] = self.clf.predict(self.fn[v])  # predict labels for cluster learning set

    # entropy-based cluster selection
    rank = []
    for k, v in sub_pred.items():
        count = list(ct(v).values())
        count[:] = [i / float(max(count)) for i in count]
        H = np.sum(-p * math.log(p, 2) for p in count if p != 0)
        rank.append([k, len(v), H])
    rank = sorted(rank, key=lambda x: x[-1], reverse=True)

    if not rank:
        raise ValueError('no clusters found in this iteration!')
    c_idx = rank[0][0]           # pick the 1st cluster on the rank, ordered by label entropy
    c_ex_id = self.ex_id[c_idx]  # examples in the cluster picked
    sub_label = sub_pred[c_idx]  # used when choosing cluster by H
    sub_fn = self.fn[c_ex_id]

    # sub-cluster the cluster
    c_ = KMeans(init='k-means++', n_clusters=len(np.unique(sub_label)), n_init=10)
    c_.fit(sub_fn)
    dist = np.sort(c_.transform(sub_fn))

    ex_ = dd(list)
    for i, j, k, l in zip(c_.labels_, c_ex_id, dist, sub_label):
        ex_[i].append([j, l, k[0]])
    for i, j in ex_.items():  # sort by ex. dist to the centroid for each C
        ex_[i] = sorted(j, key=lambda x: x[-1])
    for k, v in ex_.items():
        if v[0][0] not in self.labeled_set:  # find the first unlabeled ex
            idx = v[0][0]
            break

    return idx, c_idx
def topKFrequent(nums, k):
    from collections import Counter as ct
    return [num for (num, count) in ct(nums).most_common(k)]
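# Hedged usage sketch for topKFrequent: most_common(k) already returns the
# (value, count) pairs sorted by descending frequency.
# >>> topKFrequent([1, 1, 1, 2, 2, 3], 2)
# [1, 2]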
z = 0
for i in teslaSummaryPOS:
    teslaSummaryTags.append(teslaSummaryPOS[x][1])
    x += 1
for i in NYTimesPOS:
    NYTimesTags.append(NYTimesPOS[y][1])
    y += 1
for i in ESPNPOS:
    ESPNTags.append(ESPNPOS[z][1])
    z += 1

# In[92]:

#count the tags
teslaSumTagCount = ct(teslaSummaryTags)
NYTimesTagCount = ct(NYTimesTags)
ESPNTagCount = ct(ESPNTags)

# In[93]:

#sublinear normalization
for key in teslaSumTagCount:
    x = teslaSumTagCount[key]
    if x > 0:
        teslaSubLinearNorm[key] = 1 + math.log(x)
for key in NYTimesTagCount:
    x = NYTimesTagCount[key]
    if x > 0:
        NYTimesSubLinearNorm[key] = 1 + math.log(x)
def count_publisher(json_data):
    # Count records per publisher, skipping entries with no publisher set.
    publishers = ct(k['publisher'] for k in json_data if k.get('publisher'))
    return publishers
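# Hedged usage sketch for count_publisher, with made-up sample records
# (assumes `from collections import Counter as ct`):
# >>> count_publisher([{'publisher': 'Nintendo'}, {'publisher': 'Sega'},
# ...                  {'publisher': 'Nintendo'}, {}])
# Counter({'Nintendo': 2, 'Sega': 1})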
else:
    print '# of p label', len(p_label)
    print 'p label acc', sum(label[p_idx] == p_label) / float(len(p_label))
#print 'ex before 30 itr', ct(ex_30)
#print 'ex after 50 itr', ct(ex_50)
#print 'ex all', ct(ex_all)

cm_cls = np.unique(np.hstack((test_label, preds)))

f = open('al_out', 'w')
f.writelines('%s;\n' % repr(i) for i in tp_type)
f.write('ex in each itr:' + repr(ex) + '\n')
f.write(repr(cm_cls))
f.close()

print 'f count on all ex', ct(label)

ave_acc = [np.mean(acc) for acc in acc_sum]
acc_std = [np.std(acc) for acc in acc_sum]
'''
ave_acc_type = [[] for i in range(6)]
ave_pre = [[] for i in range(6)]
ave_rec = [[] for i in range(6)]
for i in range(6):
    ave_acc_type[i] = [np.mean(a) for a in acc_type[i]]
    ave_pre[i] = [np.mean(p) for p in precision_type[i]]
    ave_rec[i] = [np.mean(r) for r in recall_type[i]]
'''
print 'overall acc:', repr(ave_acc)
print 'p1', p1
print np.mean(p1)
from collections import Counter as ct

l = ["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE"]
g = map(str, range(0, 10))

def f(a, b):
    tp = ct()
    for i in b.keys():
        tp[i] = a * b[i]
    return tp

for i in range(int(raw_input())):
    c = ct(raw_input())
    ans = [0] * 10
    # Letters unique to a single digit word are counted first ...
    ans[0] = c['Z']  # Z only in ZERO
    ans[2] = c['W']  # W only in TWO
    ans[8] = c['G']  # G only in EIGHT
    ans[6] = c['X']  # X only in SIX
    ans[4] = c['U']  # U only in FOUR
    # ... then the remaining digits are resolved by elimination.
    ans[3] = c['H'] - ans[8]                     # H in THREE and EIGHT
    ans[5] = c['F'] - ans[4]                     # F in FIVE and FOUR
    ans[7] = c['V'] - ans[5]                     # V in SEVEN and FIVE
    ans[1] = c['O'] - ans[2] - ans[4] - ans[0]   # O in ONE, TWO, FOUR, ZERO
    ans[9] = (c['N'] - ans[1] - ans[7]) / 2      # N once in ONE/SEVEN, twice in NINE
    assert len(filter(lambda a: a >= 0, ans)) == 10
    assert ct("".join(map(lambda a, b: a * b, ans, l))) == c
    print "Case #{}: {}".format(str(i + 1), "".join(map(lambda a, b: a * b, ans, g)))
label1 = input2[:, -1]
label = input4[:, -1]
label1 = input6[:, -1]
#input3 = input3  # quick run of the code using other building
#input3, label = shuffle(input3, label)

name = []
for i in input3:
    s = re.findall("(?i)[a-z]{2,}", i)
    name.append(" ".join(s))

cv = CV(analyzer="char_wb", ngram_range=(3, 4))
#tv = TV(analyzer='char_wb', ngram_range=(3,4))
fn = cv.fit_transform(name).toarray()
#fn = cv.fit_transform(input1).toarray()
#print cv.vocabulary_
print "class count of true labels of all ex:\n", ct(label)
#n_class = len(np.unique(label))
#print n_class
#print np.unique(label)

fold = 10
rounds = 100
#clf = LR()
#clf = LinearSVC()
#clf = SVC(kernel='linear', probability=True)
clf = RFC(n_estimators=100, criterion="entropy")
#kf = StratifiedKFold(label, n_folds=fold, shuffle=True)
kf = KFold(len(label), n_folds=fold, shuffle=True)
mapping = {
def create_3K_lic_rpt(ccwr_full_list, output_file, smart_account):
    '''Create a group of lambda functions to perform RegEx searches'''
    ###Find Header Row based on Serial keyword.
    hr = lambda s: search('.*[Ss][Ee][Rr][Ii][Aa].*', s)
    ###Find any SKU containing 3K nomenclature.
    is_3x50 = lambda s: search('3[68]50', s)
    ###Find any top-level traditional hardware SKU that also contains license level.
    non_C1_3x50 = lambda s: search('WS-C3[68]50.*-[SE]', s)
    ###Find individual 3K on-box license SKUs.
    lic_C1_3x50 = lambda s: search('C3[68]50-[24][48]-[SL]-[ES]', s)
    ###Find any C1 SKU that has fewer than 24 ports. These have the license level as part of the top-level part.
    non_24_48_port_C1 = lambda s: search('C1-WS.*-12.*', s)

    '''Creates a CSV-formatted report of 3K licensing content from a file input of a CCW-R file export'''
    #print(dumps(ccwr_full_list[0:5], indent=4))
    ### Find CCW-R header row to place into a list
    header = [i for i in ccwr_full_list[0:3] if hr(str(i))]
    ###Parse CCW-R lines with any 3x50 SKUs into a list of rows
    dev_3x50 = [i for i in ccwr_full_list if is_3x50(str(i))]
    ###Parse CCW-R lines for traditional top-level SKU rows
    non_C1_dev = [i for i in dev_3x50 if non_C1_3x50(str(i))]
    #print(dumps(non_C1_dev, indent=4))
    ###Parse CCW-R lines for individual on-box SW upgrade licensing rows
    upg_lics = [i for i in dev_3x50 if lic_C1_3x50(str(i))]
    #print(dumps(upg_lics, indent=4))
    ###Parse C1 SKUs for 3Ks less than 12 ports b/c SW licenses appear in top-level
    non_24_48_port = [i for i in dev_3x50 if non_24_48_port_C1(str(i))]
    #print(dumps(non_24_48_port, indent=4))
    ###Concatenate all parsed lists
    parsed_ccwr_rows_list = header + non_C1_dev + upg_lics + non_24_48_port
    #print(dumps(parsed_ccwr_rows_list, indent=4))
    ###Perform count of elements in concatenated list and place in dict
    devdict = dict(ct([i[0] for i in parsed_ccwr_rows_list][1:]))
    ###Extract top-level SKUs and convert to list of actual licensing SKUs that appear in CSSM.
    C3x50 = [i[0][3:11] + '-' + i[0][-1]
             for i in parsed_ccwr_rows_list if i[0].startswith('WS-C3')]
    C3x50 = C3x50 + [i[0][:12].replace(i[0][:5], 'C') + '-' + i[0][-1]
                     for i in parsed_ccwr_rows_list if i[0].startswith('C1-WS')]
    C3x50_E = [i.replace(i[-2:], '-S-E') for i in C3x50 if i.endswith('E')]
    C3x50_S = [i.replace(i[-2:], '-L-S') for i in C3x50 if i.endswith('S')]
    ###Extract top-level upgrade license SKUs and convert to list
    upg_lics_indiv = [i[0] for i in upg_lics]
    ###Concatenate license lists
    total_upg_lics = C3x50_E + C3x50_S + upg_lics_indiv
    ###Perform count of elements in concatenated list and place in dict
    licdict = dict(ct(total_upg_lics))
    ###Create output file
    with open(output_file, 'w') as f:
        f.write('Top-Level Device OR License,-----,Count\n')
        for i in devdict:
            f.write(i + ',-----,' + str(devdict[i]) + '\n')
        f.write(4 * '\n')
        f.write("LICENSES to be deposited in %s\n\n" % smart_account + 'License,-----,Count\n')
        for i in licdict:
            f.write(i + ',-----,' + str(licdict[i]) + '\n')
        f.write(4 * '\n')
        f.write("Full License/Device Breakout from CCW-R\n\n")
        for i in parsed_ccwr_rows_list:
            for j in i:
                f.write(j)
                f.write(',')
            f.write('\n')
featureLabelFile = "../../dataset/processed_acl/processedBooksKitchenElectronics/"+dataName featureMatrix, labelList = readFeatureLabel(featureLabelFile) # transferLabelFile = "../../dataset/processed_acl/processedBooksKitchenElectronics/transferLabel_books--electronics.txt" transferLabelFile = "../../dataset/processed_acl/processedBooksKitchenElectronics/transferLabel_kitchen--electronics.txt" auditorLabelList, transferLabelList, trueLabelList = readTransferLabel(transferLabelFile) featureMatrix = np.array(featureMatrix) labelArray = np.array(labelList) transferLabelArray = np.array(transferLabelList) print("number of types", len(set(labelArray))) print('class count of true labels of all ex:\n', ct(transferLabelArray)) initialExList = [[397, 1942, 200], [100, 1978, 657], [902, 788, 1370], [1688, 1676, 873], [1562, 1299, 617], [986, 1376, 562], [818, 501, 1922], [600, 1828, 1622], [1653, 920, 1606], [39, 1501, 166]] fold = 10 rounds = 150 multipleClassFlag = False al = _ProactiveLearning(fold, rounds, featureMatrix, labelArray, transferLabelArray, "sentiment_electronics", multipleClassFlag) al.setInitialExList(initialExList) al.run_CV() """
def run_CV(self, batchSize):
    cvIter = 0
    totalInstanceNum = len(self.label)
    print("totalInstanceNum\t", totalInstanceNum)
    indexList = [i for i in range(totalInstanceNum)]

    totalTransferNumList = []
    random.shuffle(indexList)

    foldNum = 10
    foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
    foldInstanceList = []

    for foldIndex in range(foldNum - 1):
        foldIndexInstanceList = indexList[foldIndex * foldInstanceNum:(foldIndex + 1) * foldInstanceNum]
        foldInstanceList.append(foldIndexInstanceList)
    foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
    foldInstanceList.append(foldIndexInstanceList)

    cvIter = 0
    totalAccList = [[] for i in range(10)]
    for foldIndex in range(foldNum):
        #self.clf = LinearSVC(random_state=3)
        #self.clf = LR(fit_intercept=False)
        self.clf = LR(multi_class="multinomial", solver='lbfgs')

        train = []
        for preFoldIndex in range(foldIndex):
            train.extend(foldInstanceList[preFoldIndex])
        test = foldInstanceList[foldIndex]
        for postFoldIndex in range(foldIndex + 1, foldNum):
            train.extend(foldInstanceList[postFoldIndex])

        trainNum = int(totalInstanceNum * 0.9)

        fn_test = self.fn[test]
        label_test = self.label[test]
        print("testing", ct(label_test))

        fn_train = self.fn[train]

        initExList = []
        initExList = self.pretrainSelectInit(train)
        #initExList = [316, 68, 495]
        #random.seed(110)
        #initExList = random.sample(train, 3)

        fn_init = self.fn[initExList]
        label_init = self.label[initExList]
        print("initExList\t", initExList, label_init)

        queryIter = 3
        labeledExList = []
        unlabeledExList = []
        ###labeled index
        labeledExList.extend(initExList)
        unlabeledExList = list(set(train) - set(labeledExList))

        while queryIter < rounds:
            fn_train_iter = []
            label_train_iter = []
            fn_train_iter = self.fn[labeledExList]
            label_train_iter = self.label[labeledExList]

            self.clf.fit(fn_train_iter, label_train_iter)

            idxList = self.select_example(unlabeledExList, batchSize)
            #print(queryIter, "idx", idx, self.label[idx])
            #self.update_select_confidence_bound(idx)

            if len(idxList) > 0:
                labeledExList += idxList
                for idx in idxList:
                    unlabeledExList.remove(idx)
            else:
                labeledExList.append(idxList)
                unlabeledExList.remove(idxList)

            acc = self.get_pred_acc(fn_test, label_test, labeledExList)
            totalAccList[cvIter].append(acc)
            queryIter += 1

        cvIter += 1

    totalACCFile = modelVersion + "_acc.txt"
    f = open(totalACCFile, "w")
    for i in range(10):
        totalAlNum = len(totalAccList[i])
        for j in range(totalAlNum):
            f.write(str(totalAccList[i][j]) + "\t")
        f.write("\n")
    f.close()
def parentheses(s):
    c = ct(s)
    return c['('] == c[')']
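# Design note: parentheses only compares counts, so an unbalanced ordering
# like ")(" still returns True; a true balance check would track a running depth.
# >>> parentheses("(())")
# True
# >>> parentheses(")(")
# True  # counts match even though the string is not balanced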
from sklearn import tree
from sklearn.preprocessing import normalize

input1 = [i.strip().split('+')[-1][:-5] for i in open('sdh_pt_rice').readlines()]
input21 = np.genfromtxt('keti_hour_sum', delimiter=',')
input22 = np.genfromtxt('sdh_hour_rice', delimiter=',')
input2 = np.vstack((input21, input22))
#input2 = np.genfromtxt('sdh_45min_forrice', delimiter=',')
input3 = [i.strip().split('\\')[-1][:-5] for i in open('rice_pt_sdh').readlines()]
input4 = np.genfromtxt('rice_hour_sdh', delimiter=',')
input5 = [i.strip().split('_')[-1][:-5] for i in open('soda_pt_new').readlines()]
input6 = np.genfromtxt('soda_45min_new', delimiter=',')

label1 = input2[:, -1]
label = input4[:, -1]
label1 = input6[:, -1]
print 'class count of true labels of all ex:\n', ct(label)
#input3 = input3  # quick run of the code using other building

name = []
for i in input3:
    s = re.findall('(?i)[a-z]{2,}', i)
    name.append(' '.join(s))

cv = CV(analyzer='char_wb', ngram_range=(3, 4))
#tv = TV(analyzer='char_wb', ngram_range=(3,4))
fn = cv.fit_transform(name).toarray()
#fn = cv.fit_transform(input1).toarray()
#print cv.vocabulary_
#fd = input4[:, [0,1,2,3,5,6,7]]
#kmer = cv.get_feature_names()
#idf = zip(kmer, cv._tfidf.idf_)
#idf = sorted(idf, key=lambda x: x[-1], reverse=True)
def f(a, b):
    # Scale every count in Counter b by the factor a.
    tp = ct()
    for i in b.keys():
        tp[i] = a * b[i]
    return tp
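# Hedged usage sketch for f: it returns a new Counter with every count in b
# scaled by a (assumes `from collections import Counter as ct`).
# >>> f(3, ct("ab"))
# Counter({'a': 3, 'b': 3})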
        featureMatrix.append(featureList)

        if line[lineLen - 1] == "FALSE":
            label.append(0.0)
        else:
            #print(line[lineLen-1])
            label.append(1.0)

    f.close()
    return featureMatrix, label


if __name__ == "__main__":
    #featureLabelFile = "../../dataset/processed_acl/processedKitchenElectronics/" + dataName
    featureLabelFile = "../../dataset/processed_acl/processedBooksElectronics/" + dataName
    featureMatrix, labelList = readFeatureLabel(featureLabelFile)

    featureMatrix = np.array(featureMatrix)
    labelArray = np.array(labelList)
    print('class count of true labels of all ex:\n', ct(labelArray))

    transferLabelFile = "../../dataset/processed_acl/processedBooksElectronics/transferLabel_books--electronics.txt"
    auditorLabelList, transferLabelList, targetLabelList = readTransferLabel(transferLabelFile)
    transferLabelArray = np.array(transferLabelList)

    fold = 10
    rounds = 100

    al = active_learning(fold, rounds, featureMatrix, transferLabelArray, labelArray)
    al.run_CV()
#print 'ct on training label', ct(train_label)
clf.fit(train_fn, train_label)

sub_pred = dd(list)  # Mn predicted labels for each cluster
for k, v in ex_id.items():
    sub_pred[k] = clf.predict(fn[v])  # predict labels for cluster learning set

#acc_ = accuracy_score(label[train_], preds_c)
#print 'acc on test set', acc
#print 'acc on cluster set', acc_
#acc_sum[rr].append(acc)
#print 'iteration', rr, '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>'

# the original H-based cluster selection
rank = []
for k, v in sub_pred.items():
    count = ct(v).values()
    count[:] = [i / float(max(count)) for i in count]
    H = np.sum(-p * math.log(p, 2) for p in count if p != 0)
    #H /= len(v) / float(len(train))
    rank.append([k, len(v), H])
rank = sorted(rank, key=lambda x: x[-1], reverse=True)

if not rank:
    break
idx = rank[0][0]           # pick the id of the 1st cluster on the rank
cl_id.append(idx)          # track cluster id on each iteration
cc = idx                   # id of the cluster picked by H
c_id = ex_id[cc]           # example ids of the cluster picked
sub_label = sub_pred[idx]  # used when choosing cluster by H
sub_fn = fn[c_id]

# sub-clustering the cluster
gnb.fit(train[newarr].values, train["Patient Condition"])
result = gnb.predict(test[newarr])

# Print Performance Indicator
print(
    "Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
    .format(
        test.shape[0], (test["Patient Condition"] != result).sum(),
        100 * (1 - (test["Patient Condition"] != result).sum() / test.shape[0])))

test_data = pd.concat([test[newarr], test["Patient Condition"]], axis=1)
test_data["Patient Condition"] = result
print(test_data)

counts = ct(result)
count_p = counts['Positive']
count_n = counts['Negative']

slices = [count_p, count_n]
cols = ['b', 'c']
plt.pie(slices,
        labels=['Positive', 'Negative'],
        colors=cols,
        shadow=True,
        startangle=90,
        autopct='%1.1f%%')
plt.title("Patient Condition")
plt.legend()
plt.show()
        ora_idx.append(idx)
        #print k, label_md[idx], label[idx], cf_md[idx], input1[idx]
    #elif cf_md[v[itr][0]] <= 0.2:
        #print '>>>>>', k, label_md[idx], label[idx], cf_md[idx], input1[idx]

ex_ora.append(len(ora_idx))
'''
train_data = data1[train]
train_label = label1[train]
validate_data = data1[validate]
validate_label = label1[validate]
'''
train_data = fn[np.hstack((auto_idx, ora_idx))]
train_label = np.hstack((label_md[auto_idx], label[ora_idx]))
train_label_ = label[np.hstack((auto_idx, ora_idx))]
print ct(train_label)

clf.fit(train_data, train_label)
acc = clf.score(test_data, test_label)
#acc_sum[itr].append(acc)
acc_H.append(acc)

clf.fit(train_data, train_label_)
acc = clf.score(test_data, test_label)
acc_T.append(acc)
'''
cm = CM(test_label, preds)
cm = normalize(cm.astype(np.float), axis=1, norm='l1')
k = 0
while k < clx:
    acc_type[k].append(cm[k, k])
    k += 1
y = 0
z = 0
for i in teslaSummaryPOS:
    teslaSummaryTags.append(teslaSummaryPOS[x][1])
    x += 1
for i in NYTimesPOS:
    NYTimesTags.append(NYTimesPOS[y][1])
    y += 1
for i in ESPNPOS:
    ESPNTags.append(ESPNPOS[z][1])
    z += 1

# In[92]:

#count the tags
teslaSumTagCount = ct(teslaSummaryTags)
NYTimesTagCount = ct(NYTimesTags)
ESPNTagCount = ct(ESPNTags)

# In[93]:

#sublinear normalization
for key in teslaSumTagCount:
    x = teslaSumTagCount[key]
    if x > 0:
        teslaSubLinearNorm[key] = 1 + math.log(x)
for key in NYTimesTagCount:
    x = NYTimesTagCount[key]
    if x > 0:
        NYTimesSubLinearNorm[key] = 1 + math.log(x)
for key in ESPNTagCount:
def func_p():
    x = 1
    file_name = sd.askstring("File Name", "Enter your file name ")
    data = pd.read_csv(file_name)
    while x == 1:
        if file_name is not None:
            x = 0
            si("", "File Loaded!")
            # mapping data
            data["Sex_cleaned"] = np.where(
                data["Sex"] == "M", "Male",
                (np.where(data["Sex"] == "F", "Female", "Infant")))
            data = data[[
                "Sex_cleaned", "Length", "Diameter", "Height", "Whole weight",
                "Shucked weight", "Viscera weight", "Shell weight", "Rings"
            ]].dropna(axis=0, how='any')

            # Split dataset
            train, test = train_test_split(data, test_size=0.6, random_state=4)
            gnb = GaussianNB()
            indicators = [
                "Length", "Diameter", "Height", "Whole weight",
                "Shucked weight", "Viscera weight", "Shell weight", "Rings"
            ]
            gnb.fit(train[indicators].values, train["Sex_cleaned"])
            y_pred = gnb.predict(test[indicators])

            # Print Performance Indicator
            data_accuracy = (
                "Total data {} points : {}, Accuracy {:05.2f}%".format(
                    test.shape[0], (test["Sex_cleaned"] != y_pred).sum(),
                    100 * (1 - (test["Sex_cleaned"] != y_pred).sum() / test.shape[0])))
            test_data = pd.concat([test[indicators], test["Sex_cleaned"]], axis=1)
            test_data["Sex Prediction"] = y_pred
            test_data["Data Accuracy"] = data_accuracy

            # Excel Writer
            writer = pd.ExcelWriter('Output.xlsx', engine='xlsxwriter')
            test_data.to_excel(writer, sheet_name='Sheet1')
            writer.save()
            si("", "Output Created! Check it out!")

            answer = mb.askyesno("Question", "Do you want to check the Data Chart?")
            if answer == True:
                # Counter
                counts = ct(y_pred)
                count_Male = counts['Male']
                count_Female = counts['Female']
                count_Infant = counts['Infant']
                slices = [count_Male, count_Female, count_Infant]
                cols = ['#00ffc3', '#ff00cb', '#ffd000']

                # Pie Chart
                fig = Figure(figsize=(100, 100))
                a = fig.add_subplot(111)
                a.pie(slices,
                      labels=['Male', 'Female', 'Infant'],
                      colors=cols,
                      shadow=True,
                      startangle=90,
                      autopct='%1.1f%%')
                a.legend()
                canvas = FigureCanvasTkAgg(fig)
                canvas.draw()  # render the figure onto the Tk canvas
                canvas.get_tk_widget().pack(fill=BOTH, expand=True)
                return fig
            else:
                root.destroy()
        else:
            sw("Warning", "Please re-enter your input")
def parentheses(s): c = ct(s) if c["("] == c[")"]: return True return False
        j += 1
        k += 1
        if k == 512:
            listaFrequenciaY.append(j)
            k = 0
            j = 0

    # compute Z frequencies
    for i in range(len(freqIdaZ)):
        if freqIdaZ[i] == freqVoltaZ[i]:
            j += 1
        k += 1
        if k == 512:
            listaFrequenciaZ.append(j)
            k = 0
            j = 0

    contagemFrequenciaX = ct(listaFrequenciaX)

    media = 0.0
    variancia = 0.0
    desvio = 0.0
    qtd = 0.0
    eventoFrequencia = []

    # print results
    print(ninho + '-' + tratamento)
    print(
        'Freq X - media: ' + str(format(calcMedia(listaFrequenciaX), '.2f')),
        'desvio: ' + str(format(calcDesvio(listaFrequenciaX), '.2f')),
        'min: ' + str(min(listaFrequenciaX)),
        'max: ' + str(max(listaFrequenciaX)))
    print(
        'Freq Y - media: ' +
featureLabelFile = "../../dataset/processed_acl/processedBooksElectronics/" + dataName featureMatrix, labelList = readFeatureLabel(featureLabelFile) featureMatrix = np.array(featureMatrix) labelArray = np.array(labelList) ###processedKitchenElectronics transferLabel_electronics--kitchen.txt transferLabelFile = "../../dataset/processed_acl/processedBooksElectronics/transferLabel_books--electronics.txt" auditorLabelList, transferLabelList, targetLabelList = readTransferLabel( transferLabelFile) transferLabelArray = np.array(transferLabelList) auditorLabelArray = np.array(auditorLabelList) # print(auditorLabel) # exit() # label = np.array([float(i.strip()) for i in open('targetAuditorLabel.txt').readlines()]) # tmp = np.genfromtxt('../../data/rice_hour_sdh', delimiter=',') # label = tmp[:,-1] print('class count of true labels of all ex:\n', ct(labelArray)) print("count of auditor", ct(auditorLabelArray)) # exit() # mapping = {1:'co2',2:'humidity',4:'rmt',5:'status',6:'stpt',7:'flow',8:'HW sup',9:'HW ret',10:'CW sup',11:'CW ret',12:'SAT',13:'RAT',17:'MAT',18:'C enter',19:'C leave',21:'occu'} # fn = get_name_features(raw_pt) fold = 10 rounds = 100 al = active_learning(fold, rounds, featureMatrix, auditorLabelArray) al.run_CV()
    f.close()
    return featureMatrix, label


if __name__ == "__main__":
    ###processedKitchenElectronics: electronics ---> kitchen
    ###processedBooksElectronics: books ---> electronics
    sourceFeatureLabelFile = "../../dataset/processed_acl/processedBooksElectronics/" + sourceDataName
    sourceFeatureMatrix, sourceLabelList = readFeatureLabel(sourceFeatureLabelFile)

    sourceLabel = np.array(sourceLabelList)
    sourceFeatureMatrix = np.array(sourceFeatureMatrix)
    print('class count of true source labels of all ex:\n', ct(sourceLabel))

    targetFeatureLabelFile = "../../dataset/processed_acl/processedBooksElectronics/" + targetDataName
    targetFeatureMatrix, targetLabelList = readFeatureLabel(targetFeatureLabelFile)

    targetLabel = np.array(targetLabelList)
    targetFeatureMatrix = np.array(targetFeatureMatrix)
    print('class count of true target labels of all ex:\n', ct(targetLabel))

    fold = 1
    rounds = 100

    al = active_learning(fold, rounds, sourceFeatureMatrix, sourceLabel,
                         targetFeatureMatrix, targetLabel)

    dataDir = "../../dataset/processed_acl/processedBooksElectronics/"
def count_Platform(json_data):
    # Count records per platform, skipping entries with no platform set
    # (dict access mirrors count_publisher above).
    platforms = ct(k['platform'] for k in json_data if k.get('platform'))
    return platforms
label2 = input2[:, -1]
label = input4[:, -1]
#input3, label = shuffle(input3, label)

name = []
for i in input3:
    s = re.findall('(?i)[a-z]{2,}', i)
    name.append(' '.join(s))

vc = CV(analyzer='char_wb', ngram_range=(3, 4), min_df=1, token_pattern='[a-z]{2,}')
#vc = TV(analyzer='char_wb', ngram_range=(3,4), min_df=1, token_pattern='[a-z]{2,}')
fn = vc.fit_transform(name).toarray()
fd = input4[:, [0, 1, 2, 3, 5, 6, 7]]
#n_class = len(np.unique(label))
#print n_class
#print np.unique(label)
print ct(label)
#kmer = vc.get_feature_names()
#idf = zip(kmer, vc._tfidf.idf_)
#idf = sorted(idf, key=lambda x: x[-1], reverse=True)
#print idf[:20]
#print idf[-20:]
#print vc.get_feature_names()

fold = 2
kf = StratifiedKFold(label, n_folds=fold, shuffle=True)
#kf = KFold(len(label), n_folds=fold, shuffle=True)
'''
folds = [[] for i in range(fold)]
i = 0
for train, test in kf:
    folds[i] = test