def intersect(self, nums1, nums2):
    """Return the multiset intersection of ``nums1`` and ``nums2``.

    Every value appears in the result as many times as it occurs in both
    inputs (the minimum of its two counts). Values are grouped together and
    ordered by their first appearance in ``nums1``.

    :param nums1: iterable of hashable values.
    :param nums2: iterable of hashable values.
    :return: list holding the common values with their shared multiplicity.
    """
    # ``Counter & Counter`` keeps, per key, the smaller of the two counts and
    # drops keys whose minimum is zero.
    common = ct(nums1) & ct(nums2)
    # elements() repeats each key ``count`` times, which is exactly the
    # expansion the result needs.
    return list(common.elements())
Пример #2
0
 def shortestCompletingWord(self, licensePlate, words):
     """Return the shortest word that contains every letter of the plate.

     Letters of ``licensePlate`` are compared case-insensitively against the
     words as given, and non-letters are ignored. A word qualifies when it
     holds each plate letter at least as many times as the plate does. Among
     equally short qualifying words the earliest one wins; ``None`` is
     returned when no word qualifies.
     """
     needed = ct(ch.lower() for ch in licensePlate if ch.isalpha())
     best = None
     for word in words:
         # Counter subtraction keeps only positive counts, so a non-empty
         # remainder means some required letter is still missing.
         if needed - ct(word):
             continue
         if not best or len(word) < len(best):
             best = word
     return best
Пример #3
0
    def isAnagram(self, s, t):
        """
        Tell whether ``t`` is an anagram of ``s``.

        :type s: str
        :type t: str
        :rtype: bool
        """
        from collections import Counter as ct

        # Two strings are anagrams exactly when their character multisets
        # are equal.
        return ct(s) == ct(t)
Пример #4
0
def create_3K_lic_rpt(input_file, output_file, smart_account):
    '''Write a CSV-style licensing report built from the lines of a text file.

    Reads every line of ``input_file``, keeps the lines selected by the helper
    predicates ``ldos``, ``is_3x50``, ``non_C1_3x50``, ``lic_C1_3x50`` and
    ``non_24_48_port_C1`` (all defined outside this function) and writes three
    sections to ``output_file``: a count per first CSV field, a count per
    derived license SKU, and the selected lines verbatim.

    NOTE(review): the input is presumably a CCW-R export covering Catalyst
    3650/3850 devices -- confirm against the helper definitions.

    :param input_file: path of the text file to read.
    :param output_file: path of the report to create (overwritten if present).
    :param smart_account: name interpolated into the license section heading.
    '''
    with open(input_file, 'r') as f:
        rl = f.readlines()
    # Lines accepted by ldos(); presumably the CCW-R header row -- TODO confirm.
    header = [i for i in rl if ldos(i)]
    # Lines accepted by is_3x50(); every later filter works on this subset.
    dev_3x50 = [i for i in rl if is_3x50(i)]
    # Subset accepted by non_C1_3x50() (traditional top-level SKU rows).
    non_C1_dev = [i for i in dev_3x50 if non_C1_3x50(i)]
    # Subset accepted by lic_C1_3x50() (individual on-box upgrade license rows).
    upg_lics = [i for i in dev_3x50 if lic_C1_3x50(i)]
    # Subset accepted by non_24_48_port_C1(); for these the license level is
    # part of the top-level SKU.
    non_24_48_port = [i for i in dev_3x50 if non_24_48_port_C1(i)]
    # Concatenate all selected lines, header first.
    parsed_ccwr_rows_list = header + non_C1_dev + upg_lics + non_24_48_port
    # Count occurrences of the first CSV field. The [1:] drops the first
    # entry, which is the header only if exactly one header line was found.
    # NOTE(review): with no header line the first device row is dropped.
    devdict = dict(ct([(i.split(','))[0] for i in parsed_ccwr_rows_list][1:]))
    # Derive license SKUs from top-level SKUs: for 'WS-C3...' take characters
    # 3..10 of the SKU and append '-' plus the SKU's last character.
    C3x50 = [((i.split(','))[0][3:11] + '-' + (i.split(','))[0][-1])
             for i in parsed_ccwr_rows_list
             if i.split(',')[0].startswith('WS-C3')]
    # For 'C1-WS...' SKUs swap the 5-character prefix for 'C' within the first
    # 12 characters, then append '-' plus the SKU's last character.
    C3x50 = C3x50 + [((i.split(','))[0][:12].replace(
        (i.split(','))[0][:5], 'C') + '-' + (i.split(','))[0][-1])
                     for i in parsed_ccwr_rows_list
                     if i.split(',')[0].startswith('C1-WS')]
    # Expand the trailing license letter: '...-E' -> '...-S-E' and
    # '...-S' -> '...-L-S'. str.replace rewrites every occurrence of the
    # 2-character suffix, not just the trailing one.
    C3x50_E = [i.replace(i[-2:], '-S-E') for i in C3x50 if i.endswith('E')]
    C3x50_S = [i.replace(i[-2:], '-L-S') for i in C3x50 if i.endswith('S')]
    # First CSV field of every individual upgrade-license row.
    upg_lics_indiv = [i.split(',')[0] for i in upg_lics]
    # All license SKUs, derived and individual.
    total_upg_lics = C3x50_E + C3x50_S + upg_lics_indiv
    # Count occurrences of each license SKU.
    licdict = dict(ct(total_upg_lics))
    # Write the three report sections, separated by four blank lines.
    with open(output_file, 'w') as f:
        f.write('Top-Level Device OR License,-----,Count\n')
        for i in devdict:
            f.write(i + ',-----,' + str(devdict[i]) + '\n')
        f.write(4 * '\n')
        f.write("LICENSES to be deposited in %s\n\n" % smart_account +
                'License,-----,Count\n')
        for i in licdict:
            f.write(i + ',-----,' + str(licdict[i]) + '\n')
        f.write(4 * '\n')
        f.write("Full License/Device Breakout from CCW-R\n\n")
        # Lines come from readlines() and already carry their own newline.
        for i in parsed_ccwr_rows_list:
            f.write(i)
Пример #5
0
def meu_knn(dados_train, rotulo_train, dados_teste, k, normalizar):
    """Classify every test sample by majority vote of its k nearest neighbours.

    :param dados_train: indexable collection of training samples.
    :param rotulo_train: labels aligned with ``dados_train``; each label is
        itself indexable and only its first element is used for voting.
    :param dados_teste: indexable collection of samples to classify.
    :param k: number of nearest neighbours that vote.
    :param normalizar: when truthy, both data sets are passed through
        ``normalizacao`` before distances are computed.
    :return: list with one predicted label per test sample.
    """
    if normalizar:
        # Normalise both data sets (each one independently).
        dados_teste = normalizacao(dados_teste)
        dados_train = normalizacao(dados_train)

    rotulos = []
    for i in range(len(dados_teste)):
        # Distance from test sample i to every training sample j.
        distancias = [
            dist(dados_teste[i], dados_train[j], len(dados_teste[i]))
            for j in range(len(dados_train))
        ]

        # Sort distances and labels together so each label follows its
        # distance, then keep only the first column of each label.
        pares_ordenados = sorted(zip(distancias, rotulo_train))
        vizinhos = [rotulo[0] for _, rotulo in pares_ordenados]

        # Vote among the k closest labels and keep the most common one.
        votos = ct(vizinhos[:k])
        rotulos.append(votos.most_common(1)[0][0])

    return rotulos
def removeDuplicates(data: list) -> list:
    """Return a copy of ``data`` in which every value occurs at most twice.

    The first two occurrences of each value are kept and the original order
    is preserved; ``data`` itself is not modified.

    :param data: list of hashable values.
    :return: new list with surplus occurrences dropped.
    """
    seen = {}
    kept = []
    # Single pass: keep an item only while fewer than two equal items have
    # been kept so far.
    for item in data:
        occurrences = seen.get(item, 0)
        if occurrences < 2:
            kept.append(item)
            seen[item] = occurrences + 1
    return kept
Пример #7
0
 def topKFrequent1(self, nums, k):
     """
     Return the ``k`` most frequent values of ``nums``, most frequent first.

     :type nums: List[int]
     :type k: int
     :rtype: List[int]
     """
     from collections import Counter as ct
     ranked = ct(nums).most_common(k)
     # most_common yields (value, count) pairs; only the values are wanted.
     return [value for value, _ in ranked]
Пример #8
0
    def findLucky(self, arr: List[int]) -> int:
        """Return the largest value whose number of occurrences equals itself.

        Returns -1 when ``arr`` contains no such value.
        """
        freq = ct(arr)
        lucky = [value for value, times in freq.items() if value == times]
        return max(lucky) if lucky else -1
Пример #9
0
def top_publishers(publishers, top):
    """Return the ``top`` most frequent publishers as a ``{name: count}`` dict.

    The dict and its size are also printed.
    """
    ranking = ct(publishers).most_common(top)
    most_frequent = dict(ranking)

    print(most_frequent)
    print(len(most_frequent))

    return most_frequent
Пример #10
0
def top_Platforms(platforms, top):
    """Return the ``top`` most frequent platforms as a ``{name: count}`` dict.

    The dict and its size are also printed.
    """
    ranking = ct(platforms).most_common(top)
    most_frequent = dict(ranking)

    print(most_frequent)
    print(len(most_frequent))

    return most_frequent
def top_publishers(publishers, top):
    """Return the ``top`` most frequent publishers as a ``{name: count}`` dict.

    Publishers outside the ``top`` most common are left out. The dict and its
    size are also printed.
    """
    ranking = ct(publishers).most_common(top)
    most_frequent = dict(ranking)

    print(most_frequent)
    print(len(most_frequent))

    return most_frequent
Пример #12
0
 def customSortString(self, S, T):
     """
     Reorder the characters of ``T`` to follow the order given by ``S``.

     Characters of ``T`` that occur in ``S`` come first, grouped in ``S``
     order; the remaining characters follow, grouped per character in set
     iteration order.

     :type S: str
     :type T: str
     :rtype: str
     """
     t = set(T)
     t2 = set(S)
     from collections import Counter as ct
     counts = ct(T)
     # Characters that S ranks, each repeated as often as it occurs in T.
     ranked = "".join(ch * counts[ch] for ch in S if ch in t)
     # Characters S does not mention keep their multiplicity.
     unranked = "".join(ch * counts[ch] for ch in t - t2)
     return ranked + unranked
Пример #13
0
def f(l):
    """Recursively rebuild one sequence from a sorted list of sequences.

    ``l`` is a list of equal-shaped sequences (lists). When exactly two of
    them share the smallest first element, the one value among their tails
    that is not the first element of any other sequence is emitted, and the
    function recurses on the other sequences with their first element
    stripped. Otherwise the same step is applied from the right-hand end,
    using the two sequences that share the largest last element.

    NOTE(review): this looks like the "find the missing row/column of a grid
    with increasing rows and columns" puzzle -- confirm against the caller.

    :param l: list of lists; a single-element list returns its only member.
    :return: the reconstructed list.
    :raises AssertionError: if the input does not have the expected structure.
    """
    if len(l) == 1:
        return l[0]
    m = min(x[0] for x in l)
    # List comprehensions instead of filter(): under Python 3 filter() returns
    # a lazy iterator that supports neither len() nor indexing.
    two = [x for x in l if x[0] == m]
    if len(two) == 2:
        rest = [x for x in l if x[0] != m]
        # Tail values of the two leading sequences minus the heads of all
        # other sequences leaves exactly the one unaccounted value.
        c = ct(two[0][1:] + two[1][1:])
        c -= ct([x[0] for x in rest])
        assert len(c) == 1
        return list(c.elements()) + f(sorted([x[1:] for x in rest]))

    # Mirror image of the step above, working from the last element.
    m = max(x[-1] for x in l)
    two = [x for x in l if x[-1] == m]
    rest = [x for x in l if x[-1] != m]
    assert len(two) == 2
    c = ct(two[-1][:-1] + two[-2][:-1])
    c -= ct([x[-1] for x in rest])
    assert len(c) == 1
    return f(sorted([x[:-1] for x in rest])) + list(c.elements())
Пример #14
0
    def select_example(self):
        """Pick the next example to query and the cluster it was taken from.

        Predicts labels for every cluster listed in ``self.ex_id`` with
        ``self.clf``, ranks the clusters by a score computed from the
        predicted-label counts, re-clusters the top-ranked cluster with KMeans
        and, for each sub-cluster in turn, inspects the example with the
        smallest centroid distance.

        :return: ``(idx, c_idx)`` -- ``idx`` is the first inspected example
            that is not in ``self.labeled_set`` (0 if all of them are already
            labeled) and ``c_idx`` is the key of the selected cluster.
        :raises ValueError: if there is no cluster to rank.
        """

        sub_pred = dd(list) # predicted labels for each cluster
        idx = 0 # fallback result when no unlabeled candidate is found

        for k,v in self.ex_id.items():
            sub_pred[k] = self.clf.predict(self.fn[v]) # predict labels for the cluster's examples

        # Score every cluster from the distribution of its predicted labels.
        # NOTE(review): the counts are divided by the largest count, not by
        # their sum, so the values fed to the entropy formula are not
        # probabilities -- confirm this is intended.
        rank = []
        for k,v in sub_pred.items():
            count = list(ct(v).values())
            count[:] = [i/float(max(count)) for i in count]
            # NOTE(review): np.sum is handed a generator here; NumPy
            # deprecates that form in favour of the builtin sum().
            H = np.sum(-p*math.log(p,2) for p in count if p!=0)
            rank.append([k,len(v),H])
        rank = sorted(rank, key=lambda x: x[-1], reverse=True)

        if not rank:
            raise ValueError('no clusters found in this iteration!')

        c_idx = rank[0][0] # key of the highest-scoring cluster
        c_ex_id = self.ex_id[c_idx] # example ids in the selected cluster
        sub_label = sub_pred[c_idx] # predicted labels of those examples
        sub_fn = self.fn[c_ex_id]

        # Sub-cluster the selected cluster, one sub-cluster per distinct
        # predicted label.
        c_ = KMeans(init='k-means++', n_clusters=len(np.unique(sub_label)), n_init=10)
        c_.fit(sub_fn)
        # Per example: centroid distances sorted ascending, so k[0] below is
        # the distance to the nearest centroid.
        dist = np.sort(c_.transform(sub_fn))

        ex_ = dd(list)
        for i,j,k,l in zip(c_.labels_, c_ex_id, dist, sub_label):
            ex_[i].append([j,l,k[0]])
        for i,j in ex_.items(): # sort each sub-cluster by distance to the centroid
            ex_[i] = sorted(j, key=lambda x: x[-1])
        for k,v in ex_.items():

            # Only the closest example of each sub-cluster is inspected.
            if v[0][0] not in self.labeled_set:

                idx = v[0][0]
                break

        return idx, c_idx
def topKFrequent(nums, k):
    """Return the ``k`` most frequent values of ``nums``, most frequent first."""
    from collections import Counter as ct
    ranked = ct(nums).most_common(k)
    # Each entry is a (value, count) pair; drop the counts.
    return [pair[0] for pair in ranked]
Пример #16
0
z = 0
# Copy the second field of every entry of each *POS sequence into the matching
# *Tags list, walking the sequence with a manual index.
# NOTE(review): x and y are used as indices but are not initialised in this
# excerpt -- they must already be 0 when these loops start.
for i in teslaSummaryPOS:
    teslaSummaryTags.append(teslaSummaryPOS[x][1])
    x += 1
for i in NYTimesPOS:
    NYTimesTags.append(NYTimesPOS[y][1])
    y += 1
for i in ESPNPOS:
    ESPNTags.append(ESPNPOS[z][1])
    z +=1


# In[92]:

# Count how often each tag occurs per source.
teslaSumTagCount = ct(teslaSummaryTags)
NYTimesTagCount = ct(NYTimesTags)
ESPNTagCount = ct(ESPNTags)


# In[93]:

# Sublinear normalization: store 1 + ln(count) for every tag with a positive
# count. Note that x is reused here as a scratch variable.
for key in teslaSumTagCount:
    x = teslaSumTagCount[key]
    if x > 0:
        teslaSubLinearNorm[key] =  1 + math.log(x)
for key in NYTimesTagCount:
    x = NYTimesTagCount[key]
    if x > 0:
        NYTimesSubLinearNorm[key] =  1 + math.log(x)
Пример #17
0
def count_publisher(json_data):
    """Count the records of ``json_data`` per non-empty ``'publisher'`` value.

    Records without a ``'publisher'`` key, or with a falsy one, are ignored.
    Returns a Counter mapping publisher to number of records.
    """
    publishers = ct()
    for record in json_data:
        if record.get('publisher'):
            publishers[record['publisher']] += 1
    return publishers
Пример #18
0
    else:
        print '# of p label', len(p_label)
        print 'p label acc', sum(label[p_idx]==p_label)/float(len(p_label))

    #print 'ex before 30 itr', ct(ex_30)
    #print 'ex after 50 itr', ct(ex_50)
    #print 'ex all', ct(ex_all)

# Distinct class values that occur in the true test labels or the predictions.
cm_cls = np.unique(np.hstack((test_label,preds)))
# Dump the run details to 'al_out': one repr per tp_type entry, then ex,
# then the class list.
# NOTE(review): the file is closed manually; an exception in between would
# leave it open.
f = open('al_out','w')
f.writelines('%s;\n'%repr(i) for i in tp_type)
f.write('ex in each itr:'+repr(ex)+'\n')
f.write(repr(cm_cls))
f.close()

print 'f count on all ex', ct(label)
# Mean and standard deviation of every entry of acc_sum.
ave_acc = [np.mean(acc) for acc in acc_sum]
acc_std = [np.std(acc) for acc in acc_sum]

# The triple-quoted string below is disabled code (per-type averages), kept
# as a no-op string expression.
'''
ave_acc_type = [[] for i in range(6)]
ave_pre = [[] for i in range(6)]
ave_rec = [[] for i in range(6)]
for i in range(6):
    ave_acc_type[i] = [np.mean(a) for a in acc_type[i]]
    ave_pre[i] = [np.mean(p) for p in precision_type[i]]
    ave_rec[i] = [np.mean(r) for r in recall_type[i] ]
'''
print 'overall acc:', repr(ave_acc)
print 'p1',p1
print np.mean(p1)
Пример #19
0
from collections import Counter as ct
# Python 2 script: for every input line, work out how many times each digit
# word was used to produce the given multiset of letters, and print the
# digits in ascending order.
# l holds the English digit names, g the matching digit characters.
l=["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE"]
g=map(str,range(0,10))
def f(a,b):
    # Return a Counter with every count of b multiplied by a.
    # NOTE(review): not called anywhere in this excerpt.
    tp=ct()
    for i in b.keys():
        tp[i]=a*b[i]
    return tp
#l=map(lambda x:ct(x),l)
for i in range(int(raw_input())):
    c=ct(raw_input())
    d=ct()
    ans=[0]*10
    # Letters that occur in exactly one digit name fix that digit's count:
    # Z -> ZERO, W -> TWO, G -> EIGHT, X -> SIX, U -> FOUR.
    ans[0]=c['Z']
    ans[2]=c['W']
    ans[8]=c['G']
    ans[6]=c['X']
    ans[4]=c['U']
    # The remaining digits follow by subtracting the already known ones:
    # H is in THREE and EIGHT, F in FIVE and FOUR, V in SEVEN and FIVE,
    # O in ONE, TWO, FOUR and ZERO.
    ans[3]=c['H']-ans[8]
    ans[5]=c['F']-ans[4]
    ans[7]=c['V']-ans[5]
    ans[1]=c['O']-ans[2]-ans[4]-ans[0]
    # N occurs once in ONE and SEVEN and twice in NINE (integer division
    # under Python 2).
    ans[9]=(c['N']-ans[1]-ans[7])/2
    #print ans
    # Sanity checks: no negative counts, and the digit names rebuilt from the
    # counts use exactly the input letters.
    assert(len(filter(lambda a:a>=0,ans))==10)
    assert(ct("".join(map(lambda a,b: a*b,ans,l)))==c)
    print "Case #{}: {}".format(str(i+1),"".join(map(lambda a,b:a*b,ans,g)))
Пример #20
0
# Labels are the last column of each input matrix.
# NOTE(review): label1 is assigned twice; the value taken from input2 is
# overwritten by the one from input6 before it is used in this excerpt.
label1 = input2[:, -1]
label = input4[:, -1]
label1 = input6[:, -1]
# input3 = input3 #quick run of the code using other building
# input3, label = shuffle(input3, label)
# Reduce every entry of input3 to its runs of two or more ASCII letters,
# joined by single spaces.
name = []
for i in input3:
    s = re.findall("(?i)[a-z]{2,}", i)
    name.append(" ".join(s))

# Character 3- and 4-gram features (within word boundaries) of the names.
cv = CV(analyzer="char_wb", ngram_range=(3, 4))
# tv = TV(analyzer='char_wb', ngram_range=(3,4))
fn = cv.fit_transform(name).toarray()
# fn = cv.fit_transform(input1).toarray()
# print cv.vocabulary_
print "class count of true labels of all ex:\n", ct(label)
# n_class = len(np.unique(label))
# print n_class
# print np.unique(label)
# print 'class count from groud truth labels:\n',ct(label)

fold = 10
rounds = 100
# clf = LR()
# clf = LinearSVC()
# clf = SVC(kernel='linear', probability=True)
clf = RFC(n_estimators=100, criterion="entropy")

# kf = StratifiedKFold(label, n_folds=fold, shuffle=True)
# NOTE(review): KFold(n, n_folds=...) is the signature of the old
# sklearn.cross_validation module -- confirm the installed version.
kf = KFold(len(label), n_folds=fold, shuffle=True)
mapping = {
Пример #21
0
def create_3K_lic_rpt(ccwr_full_list, output_file, smart_account):
    '''Create a CSV formatted report of 3K licensing content from CCW-R rows.

    Rows are selected with regular expressions applied to ``str(row)``; SKUs
    are read from the first field of each row. The report has three sections:
    a count per top-level SKU, a count per license SKU, and the selected rows
    verbatim.

    :param ccwr_full_list: list of rows, each a list of strings whose first
        field is the SKU.
    :param output_file: path of the report to create (overwritten if present).
    :param smart_account: name interpolated into the license section heading.
    '''
    # Regex predicates, each applied to the string form of a whole row.
    # Header row: contains the "Seria(l)" keyword in any letter case.
    hr = lambda s: search('.*[Ss][Ee][Rr][Ii][Aa].*', s)
    # Any SKU containing the 3650/3850 nomenclature.
    is_3x50 = lambda s: search('3[68]50', s)
    # Traditional top-level hardware SKU that also carries the license level.
    non_C1_3x50 = lambda s: search('WS-C3[68]50.*-[SE]', s)
    # Individual on-box license SKUs.
    lic_C1_3x50 = lambda s: search('C3[68]50-[24][48]-[SL]-[ES]', s)
    # C1 SKUs with fewer than 24 ports; their license level is part of the
    # top-level part number.
    non_24_48_port_C1 = lambda s: search('C1-WS.*-12.*', s)

    # The header is looked for in the first three rows only.
    header = [i for i in ccwr_full_list[0:3] if hr(str(i))]
    # Rows with any 3x50 SKU; the filters below work on this subset.
    dev_3x50 = [i for i in ccwr_full_list if is_3x50(str(i))]
    non_C1_dev = [i for i in dev_3x50 if non_C1_3x50(str(i))]
    upg_lics = [i for i in dev_3x50 if lic_C1_3x50(str(i))]
    non_24_48_port = [i for i in dev_3x50 if non_24_48_port_C1(str(i))]
    parsed_ccwr_rows_list = header + non_C1_dev + upg_lics + non_24_48_port

    # Count the SKUs of all selected rows; [1:] skips the header row.
    # NOTE(review): if no header row was found, the first device is skipped
    # instead.
    devdict = dict(ct([i[0] for i in parsed_ccwr_rows_list][1:]))

    # Convert top-level SKUs to the license SKUs as they appear in CSSM.
    # 'WS-C3850-24T-E' -> 'C3850-24' + '-' + 'E'
    C3x50 = [
        i[0][3:11] + '-' + i[0][-1] for i in parsed_ccwr_rows_list
        if i[0].startswith('WS-C3')
    ]
    # 'C1-WS3850-12...-E' -> 'C3850-12' + '-' + 'E'. The license letter has
    # to be appended after the prefix swap; passing it as part of the
    # replacement text produced 'C-E3850-12', which the E/S filters below
    # never matched, so these licenses were silently left out of the count.
    C3x50 = C3x50 + [
        i[0][:12].replace(i[0][:5], 'C') + '-' + i[0][-1]
        for i in parsed_ccwr_rows_list if i[0].startswith('C1-WS')
    ]
    # Expand the trailing license letter: '-E' -> '-S-E', '-S' -> '-L-S'.
    C3x50_E = [i.replace(i[-2:], '-S-E') for i in C3x50 if i.endswith('E')]
    C3x50_S = [i.replace(i[-2:], '-L-S') for i in C3x50 if i.endswith('S')]
    # SKUs of the individual upgrade-license rows.
    upg_lics_indiv = [i[0] for i in upg_lics]
    total_upg_lics = C3x50_E + C3x50_S + upg_lics_indiv
    licdict = dict(ct(total_upg_lics))

    # Write the three report sections, separated by four blank lines.
    with open(output_file, 'w') as f:
        f.write('Top-Level Device OR License,-----,Count\n')
        for i in devdict:
            f.write(i + ',-----,' + str(devdict[i]) + '\n')
        f.write(4 * '\n')
        f.write("LICENSES to be deposited in %s\n\n" % smart_account +
                'License,-----,Count\n')
        for i in licdict:
            f.write(i + ',-----,' + str(licdict[i]) + '\n')
        f.write(4 * '\n')
        f.write("Full License/Device Breakout from CCW-R\n\n")
        # Every field is followed by a comma, including the last one.
        for i in parsed_ccwr_rows_list:
            for j in i:
                f.write(j)
                f.write(',')
            f.write('\n')
Пример #22
0
		featureLabelFile = "../../dataset/processed_acl/processedBooksKitchenElectronics/"+dataName

		featureMatrix, labelList = readFeatureLabel(featureLabelFile)

		# transferLabelFile = "../../dataset/processed_acl/processedBooksKitchenElectronics/transferLabel_books--electronics.txt"

		transferLabelFile = "../../dataset/processed_acl/processedBooksKitchenElectronics/transferLabel_kitchen--electronics.txt"
		auditorLabelList, transferLabelList, trueLabelList = readTransferLabel(transferLabelFile)

		featureMatrix = np.array(featureMatrix)
		labelArray = np.array(labelList)

		transferLabelArray = np.array(transferLabelList)
		print("number of types", len(set(labelArray)))
		print('class count of true labels of all ex:\n', ct(transferLabelArray))


		initialExList = [[397, 1942, 200], [100, 1978, 657], [902, 788, 1370], [1688, 1676, 873], [1562, 1299, 617], [986, 1376, 562], [818, 501, 1922], [600, 1828, 1622], [1653, 920, 1606], [39, 1501, 166]]

		fold = 10
		rounds = 150

		multipleClassFlag = False
		al = _ProactiveLearning(fold, rounds, featureMatrix, labelArray, transferLabelArray, "sentiment_electronics", multipleClassFlag)

		al.setInitialExList(initialExList)

		al.run_CV()

	"""
Пример #23
0
    def run_CV(self, batchSize):
        """Run 10-fold cross-validated active learning and save the accuracies.

        The instance indices are shuffled and split into ten folds. For every
        fold a fresh classifier is trained iteratively: starting from the
        examples returned by ``self.pretrainSelectInit``, each round fits the
        classifier on the labeled examples, asks ``self.select_example`` for
        new ones, moves them from the unlabeled to the labeled list and
        records the accuracy returned by ``self.get_pred_acc`` on the fold's
        test split. All accuracies are written to ``<modelVersion>_acc.txt``,
        one tab-separated line per fold.

        NOTE(review): ``rounds`` and ``modelVersion`` are not defined in this
        method; they are presumably module-level globals -- confirm.

        :param batchSize: passed through to ``self.select_example``.
        """

        cvIter = 0

        totalInstanceNum = len(self.label)
        print("totalInstanceNum\t", totalInstanceNum)
        indexList = [i for i in range(totalInstanceNum)]

        totalTransferNumList = []  # never used below
        random.shuffle(indexList)

        foldNum = 10
        foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
        foldInstanceList = []

        # The first foldNum - 1 folds get exactly foldInstanceNum indices each.
        for foldIndex in range(foldNum - 1):
            foldIndexInstanceList = indexList[foldIndex *
                                              foldInstanceNum:(foldIndex + 1) *
                                              foldInstanceNum]
            foldInstanceList.append(foldIndexInstanceList)

        # The last fold takes whatever is left, including the remainder.
        foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
        foldInstanceList.append(foldIndexInstanceList)

        cvIter = 0
        # One accuracy list per fold (hard-coded to 10, matching foldNum).
        totalAccList = [[] for i in range(10)]
        for foldIndex in range(foldNum):
            # self.clf = LinearSVC(random_state=3)
            # self.clf = LR(fit_intercept=False)
            self.clf = LR(multi_class="multinomial", solver='lbfgs')

            # Training indices are all folds except the current one.
            train = []
            for preFoldIndex in range(foldIndex):
                train.extend(foldInstanceList[preFoldIndex])

            test = foldInstanceList[foldIndex]
            for postFoldIndex in range(foldIndex + 1, foldNum):
                train.extend(foldInstanceList[postFoldIndex])

            trainNum = int(totalInstanceNum * 0.9)  # never used below

            fn_test = self.fn[test]
            label_test = self.label[test]
            print("testing", ct(label_test))

            fn_train = self.fn[train]  # never used below

            initExList = []
            initExList = self.pretrainSelectInit(train)
            # initExList = [316, 68, 495]

            # random.seed(110)
            # initExList = random.sample(train, 3)
            fn_init = self.fn[initExList]  # never used below
            label_init = self.label[initExList]

            print("initExList\t", initExList, label_init)
            # The round counter starts at 3.
            # NOTE(review): presumably the number of initial examples -- confirm.
            queryIter = 3
            labeledExList = []
            unlabeledExList = []
            # Indices of the labeled examples.
            labeledExList.extend(initExList)
            unlabeledExList = list(set(train) - set(labeledExList))

            while queryIter < rounds:
                fn_train_iter = []
                label_train_iter = []

                fn_train_iter = self.fn[labeledExList]
                label_train_iter = self.label[labeledExList]

                self.clf.fit(fn_train_iter, label_train_iter)

                idxList = self.select_example(unlabeledExList, batchSize)
                # print(queryIter, "idx", idx, self.label[idx])
                # self.update_select_confidence_bound(idx)
                if len(idxList) > 0:
                    labeledExList += idxList
                    for idx in idxList:
                        unlabeledExList.remove(idx)
                else:
                    # NOTE(review): this branch is reached only with an empty
                    # selection; it appends that empty container as if it were
                    # an index and then tries to remove it from the unlabeled
                    # list, which raises ValueError for a plain list.
                    labeledExList.append(idxList)
                    unlabeledExList.remove(idxList)

                acc = self.get_pred_acc(fn_test, label_test, labeledExList)
                totalAccList[cvIter].append(acc)
                queryIter += 1

            cvIter += 1

        # One line per fold, accuracies separated (and followed) by tabs.
        # NOTE(review): the file is closed manually; a write error would
        # leave it open.
        totalACCFile = modelVersion + "_acc.txt"
        f = open(totalACCFile, "w")
        for i in range(10):
            totalAlNum = len(totalAccList[i])
            for j in range(totalAlNum):
                f.write(str(totalAccList[i][j]) + "\t")
            f.write("\n")
        f.close()
Пример #24
0
def parentheses(s):
    """Tell whether ``s`` has as many '(' as ')' characters.

    Only the two counts are compared; the order of the characters is not
    checked.
    """
    tally = ct(s)
    return tally['('] == tally[')']
Пример #25
0
from sklearn import tree
from sklearn.preprocessing import normalize

# Load the name lists and numeric matrices from files in the working
# directory. For the name files, each line is stripped, split on a separator,
# and the last piece is kept without its final five characters.
# NOTE(review): the open(...) handles used here are never closed explicitly.
input1 = [i.strip().split('+')[-1][:-5] for i in open('sdh_pt_rice').readlines()]
input21 = np.genfromtxt('keti_hour_sum', delimiter=',')
input22 = np.genfromtxt('sdh_hour_rice', delimiter=',')
input2 = np.vstack((input21,input22))
#input2 = np.genfromtxt('sdh_45min_forrice', delimiter=',')
input3 = [i.strip().split('\\')[-1][:-5] for i in open('rice_pt_sdh').readlines()]
input4 = np.genfromtxt('rice_hour_sdh', delimiter=',')
input5 = [i.strip().split('_')[-1][:-5] for i in open('soda_pt_new').readlines()]
input6 = np.genfromtxt('soda_45min_new', delimiter=',')
# Labels are the last column of each matrix.
# NOTE(review): label1 is assigned twice; the value taken from input2 is
# overwritten by the one from input6.
label1 = input2[:,-1]
label = input4[:,-1]
label1 = input6[:,-1]
print 'class count of true labels of all ex:\n', ct(label)
#input3 = input3 #quick run of the code using other building
# Reduce every entry of input3 to its runs of two or more ASCII letters,
# joined by single spaces.
name = []
for i in input3:
    s = re.findall('(?i)[a-z]{2,}',i)
    name.append(' '.join(s))

# Character 3- and 4-gram features (within word boundaries) of the names.
cv = CV(analyzer='char_wb', ngram_range=(3,4))
#tv = TV(analyzer='char_wb', ngram_range=(3,4))
fn = cv.fit_transform(name).toarray()
#fn = cv.fit_transform(input1).toarray()
#print cv.vocabulary_
#fd = input4[:,[0,1,2,3,5,6,7]]
#kmer = cv.get_feature_names()
#idf = zip(kmer, cv._tfidf.idf_)
#idf = sorted(idf, key=lambda x: x[-1], reverse=True)
Пример #26
0
def f(a,b):
    """Return a Counter holding every count of mapping ``b`` multiplied by ``a``."""
    scaled = ct()
    for key, count in b.items():
        scaled[key] = a * count
    return scaled
		featureMatrix.append(featureList)
		if line[lineLen-1] == "FALSE":
			label.append(0.0)
		else:
			# print(line[lineLen-1])
			label.append(1.0)

	f.close()

	return featureMatrix, label

if __name__ == "__main__":
	# Script entry point: load the features and labels, load the transfer
	# labels, then run cross-validated active learning.
	# NOTE(review): dataName is not defined in this excerpt -- presumably a
	# module-level constant.
	# featureLabelFile = "../../dataset/processed_acl/processedKitchenElectronics/"+dataName
	featureLabelFile = "../../dataset/processed_acl/processedBooksElectronics/"+dataName

	featureMatrix, labelList = readFeatureLabel(featureLabelFile)
	featureMatrix = np.array(featureMatrix)
	labelArray = np.array(labelList)
	print('class count of true labels of all ex:\n', ct(labelArray))

	# Only the transfer labels are used below; the auditor and target label
	# lists are read but not used.
	transferLabelFile = "../../dataset/processed_acl/processedBooksElectronics/transferLabel_books--electronics.txt"
	auditorLabelList, transferLabelList, targetLabelList = readTransferLabel(transferLabelFile)
	transferLabelArray = np.array(transferLabelList)
	

	fold = 10
	rounds = 100
	al = active_learning(fold, rounds, featureMatrix, transferLabelArray, labelArray)

	al.run_CV()
Пример #28
0
        #print 'ct on traing label', ct(train_label)
        clf.fit(train_fn, train_label)
        sub_pred = dd(list) #Mn predicted labels for each cluster
        for k,v in ex_id.items():
            sub_pred[k] = clf.predict(fn[v]) #predict labels for cluster learning set

        #acc_ = accuracy_score(label[train_], preds_c)
        #print 'acc on test set', acc
        #print 'acc on cluster set', acc_
        #acc_sum[rr].append(acc)
        #print 'iteration', rr, '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>'

        #the original H based cluster selection
        rank = []
        for k,v in sub_pred.items():
            count = ct(v).values()
            count[:] = [i/float(max(count)) for i in count]
            H = np.sum(-p*math.log(p,2) for p in count if p!=0)
            #H /= len(v)/float(len(train))
            rank.append([k,len(v),H])
        rank = sorted(rank, key=lambda x: x[-1], reverse=True)
        if not rank:
            break
        idx = rank[0][0] #pick the id of the 1st cluster on the rank
        cl_id.append(idx) #track cluster id on each iteration
        cc = idx #id of the cluster picked by H
        c_id = ex_id[cc] #example id of the cluster picked
        sub_label = sub_pred[idx]#used when choosing cluster by H
        sub_fn = fn[c_id]

        #sub-clustering the cluster
Пример #29
0
# Fit the classifier on the training split and predict the test split.
gnb.fit(train[newarr].values, train["Patient Condition"])
result = gnb.predict(test[newarr])

# Print Performance Indicator (the mislabeled count is computed once and
# reused for both the count and the percentage).
mislabeled = (test["Patient Condition"] != result).sum()
print(
    "Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
    .format(test.shape[0], mislabeled,
            100 * (1 - mislabeled / test.shape[0])))

# Show the test features next to the predicted condition, which replaces the
# true one in this copy.
test_data = pd.concat([test[newarr], test["Patient Condition"]], axis=1)
test_data["Patient Condition"] = result
print(test_data)

# Pie chart of predicted positives against predicted negatives.
counts = ct(result)
count_p = counts['Positive']
count_n = counts['Negative']

slices = [count_p, count_n]
cols = ['b', 'c']
# The first label used to be misspelled 'Positve'.
plt.pie(slices,
        labels=['Positive', 'Negative'],
        colors=cols,
        shadow=True,
        startangle=90,
        autopct='%1.1f%%')
plt.title("Patient Condition")
plt.legend()
plt.show()
Пример #30
0
                    ora_idx.append(idx)
                    #print k,label_md[idx],label[idx],cf_md[idx],input1[idx]
                #elif cf_md[v[itr][0]] <=0.2:
                    #print '>>>>>',k,label_md[idx],label[idx],cf_md[idx],input1[idx]

        ex_ora.append(len(ora_idx))
        '''
        train_data = data1[train]
        train_label = label1[train]
        validate_data = data1[validate]
        validate_label = label1[validate]
        '''
        train_data = fn[np.hstack((auto_idx,ora_idx))]
        train_label = np.hstack((label_md[auto_idx],label[ora_idx]))
        train_label_ = label[np.hstack((auto_idx,ora_idx))]
        print ct(train_label)

        clf.fit(train_data,train_label)
        acc = clf.score(test_data,test_label)
        #acc_sum[itr].append(acc)
        acc_H.append(acc)
        clf.fit(train_data,train_label_)
        acc = clf.score(test_data,test_label)
        acc_T.append(acc)
        '''
        cm = CM(test_label,preds)
        cm = normalize(cm.astype(np.float), axis=1, norm='l1')
        k=0
        while k<clx:
            acc_type[k].append(cm[k,k])
            k += 1
Пример #31
0
y = 0
z = 0
# Copy the second field of every entry of each *POS sequence into the matching
# *Tags list, walking the sequence with a manual index.
# NOTE(review): x is used as an index but is not initialised in this excerpt
# -- it must already be 0 when the first loop starts.
for i in teslaSummaryPOS:
    teslaSummaryTags.append(teslaSummaryPOS[x][1])
    x += 1
for i in NYTimesPOS:
    NYTimesTags.append(NYTimesPOS[y][1])
    y += 1
for i in ESPNPOS:
    ESPNTags.append(ESPNPOS[z][1])
    z += 1

# In[92]:

# Count how often each tag occurs per source.
teslaSumTagCount = ct(teslaSummaryTags)
NYTimesTagCount = ct(NYTimesTags)
ESPNTagCount = ct(ESPNTags)

# In[93]:

# Sublinear normalization: store 1 + ln(count) for every tag with a positive
# count. Note that x is reused here as a scratch variable.
for key in teslaSumTagCount:
    x = teslaSumTagCount[key]
    if x > 0:
        teslaSubLinearNorm[key] = 1 + math.log(x)
for key in NYTimesTagCount:
    x = NYTimesTagCount[key]
    if x > 0:
        NYTimesSubLinearNorm[key] = 1 + math.log(x)
for key in ESPNTagCount:
Пример #32
0
def func_p():
    """Train a Gaussian naive Bayes sex classifier on a user-chosen CSV file.

    Asks for a file name, loads it, maps the ``Sex`` column to
    Male/Female/Infant, trains on 40% of the rows and predicts the remaining
    60%. The predictions are written to ``Output.xlsx``; on request a pie
    chart of the predicted classes is embedded in a Tk canvas.

    :return: the chart's ``Figure`` when the user asks for the chart,
        otherwise ``None``.
    """
    file_name = sd.askstring("File Name", "Enter your file name ")
    if file_name is None:
        # No file name was provided. Previously read_csv() was called before
        # this check and failed on None, so the warning was never shown for
        # the case it was written for.
        sw("Warning", "Please re - enter your input")
        return None

    data = pd.read_csv(file_name)
    si("", "File Loaded!")

    # Mapping data: "M" -> Male, "F" -> Female, anything else -> Infant.
    data["Sex_cleaned"] = np.where(
        data["Sex"] == "M", "Male",
        (np.where(data["Sex"] == "F", "Female", "Infant")))
    data = data[[
        "Sex_cleaned", "Length", "Diameter", "Height", "Whole weight",
        "Shucked weight", "Viscera weight", "Shell weight", "Rings"
    ]].dropna(axis=0, how='any')

    # Split dataset (fixed seed so runs are repeatable).
    train, test = train_test_split(data, test_size=0.6, random_state=4)
    gnb = GaussianNB()
    indicators = [
        "Length", "Diameter", "Height", "Whole weight",
        "Shucked weight", "Viscera weight", "Shell weight", "Rings"
    ]
    gnb.fit(train[indicators].values, train["Sex_cleaned"])
    y_pred = gnb.predict(test[indicators])

    # Performance indicator: test size, mislabeled count, accuracy percentage.
    mislabeled = (test["Sex_cleaned"] != y_pred).sum()
    data_accuracy = "Total data {} points : {}, Accuracy {:05.2f}%".format(
        test.shape[0], mislabeled, 100 * (1 - mislabeled / test.shape[0]))
    test_data = pd.concat([test[indicators], test["Sex_cleaned"]], axis=1)
    test_data["Sex Prediction"] = y_pred
    test_data["Data Accuracy"] = data_accuracy

    # Excel writer: the context manager saves and closes the workbook.
    # ExcelWriter.save() no longer exists in pandas 2.
    with pd.ExcelWriter('Output.xlsx', engine='xlsxwriter') as writer:
        test_data.to_excel(writer, sheet_name='Sheet1')
    si("", "Output Created! Check it out!")

    answer = mb.askyesno("Question", "Do you want to check the Data Chart?")
    if not answer:
        # The former while/else construct also popped up the "re-enter your
        # input" warning on this path, after a fully successful run.
        root.destroy()
        return None

    # Count the predictions per class for the pie chart.
    counts = ct(y_pred)
    slices = [counts['Male'], counts['Female'], counts['Infant']]
    cols = ['#00ffc3', '#ff00cb', '#ffd000']
    fig = Figure(figsize=(100, 100))
    a = fig.add_subplot(111)
    a.pie(slices,
          labels=['Male', 'Female', 'Infant'],
          colors=cols,
          shadow=True,
          startangle=90,
          autopct='%1.1f%%')
    a.legend()
    canvas = FigureCanvasTkAgg(fig)
    canvas.get_tk_widget().pack(fill=BOTH, expand=True)
    # An unreachable FigureCanvasTk.draw() call used to follow this return.
    return fig
Пример #33
0
def parentheses(s):
    """Tell whether ``s`` has as many '(' as ')' characters.

    Only the two counts are compared; the order of the characters is not
    checked.
    """
    tally = ct(s)
    return tally["("] == tally[")"]
Пример #34
0
                    j += 1
            k += 1
            if k == 512:
                listaFrequenciaY.append(j)
                k = 0
                j = 0
# calcula frequencias Z
        for i in range(len(freqIdaZ)):
            if freqIdaZ[i] == freqVoltaZ[i]:
                j += 1
            k += 1
            if k == 512:
                listaFrequenciaZ.append(j)
                k = 0
                j = 0
        contagemFrequenciaX = ct(listaFrequenciaX)
        media = 0.0
        variancia = 0.0
        desvio = 0.0
        qtd = 0.0
        eventoFrequencia = []
        #print resultados
        print(ninho + '-' + tratamento)
        print(
            'Freq X - media: ' +
            str(format(calcMedia(listaFrequenciaX), '.2f')),
            'desvio: ' + str(format(calcDesvio(listaFrequenciaX), '.2f')),
            'min: ' + str(min(listaFrequenciaX)),
            'max: ' + str(max(listaFrequenciaX)))
        print(
            'Freq Y - media: ' +
Пример #35
0
    featureLabelFile = "../../dataset/processed_acl/processedBooksElectronics/" + dataName

    featureMatrix, labelList = readFeatureLabel(featureLabelFile)
    featureMatrix = np.array(featureMatrix)
    labelArray = np.array(labelList)

    ###processedKitchenElectronics transferLabel_electronics--kitchen.txt
    transferLabelFile = "../../dataset/processed_acl/processedBooksElectronics/transferLabel_books--electronics.txt"
    auditorLabelList, transferLabelList, targetLabelList = readTransferLabel(
        transferLabelFile)
    transferLabelArray = np.array(transferLabelList)

    auditorLabelArray = np.array(auditorLabelList)
    # print(auditorLabel)
    # exit()
    # label = np.array([float(i.strip()) for i in open('targetAuditorLabel.txt').readlines()])

    # tmp = np.genfromtxt('../../data/rice_hour_sdh', delimiter=',')
    # label = tmp[:,-1]
    print('class count of true labels of all ex:\n', ct(labelArray))
    print("count of auditor", ct(auditorLabelArray))
    # exit()
    # mapping = {1:'co2',2:'humidity',4:'rmt',5:'status',6:'stpt',7:'flow',8:'HW sup',9:'HW ret',10:'CW sup',11:'CW ret',12:'SAT',13:'RAT',17:'MAT',18:'C enter',19:'C leave',21:'occu'}

    # fn = get_name_features(raw_pt)
    fold = 10
    rounds = 100
    al = active_learning(fold, rounds, featureMatrix, auditorLabelArray)

    al.run_CV()
	f.close()

	return featureMatrix, label

if __name__ == "__main__":
	# Script entry point: load the source-domain and target-domain features
	# and labels, then build the active learner over both.
	# NOTE(review): sourceDataName and targetDataName are not defined in this
	# excerpt -- presumably module-level constants.

	# processedKitchenElectronics: electronics ---> kitchen

	# processedBooksElectronics: books ---> electronics
	sourceFeatureLabelFile = "../../dataset/processed_acl/processedBooksElectronics/"+sourceDataName
	sourceFeatureMatrix, sourceLabelList = readFeatureLabel(sourceFeatureLabelFile)

	sourceLabel = np.array(sourceLabelList)
	sourceFeatureMatrix = np.array(sourceFeatureMatrix)

	print('class count of true source labels of all ex:\n', ct(sourceLabel))

	targetFeatureLabelFile = "../../dataset/processed_acl/processedBooksElectronics/"+targetDataName
	targetFeatureMatrix, targetLabelList = readFeatureLabel(targetFeatureLabelFile)

	targetLabel = np.array(targetLabelList)
	targetFeatureMatrix = np.array(targetFeatureMatrix)

	print('class count of true target labels of all ex:\n', ct(targetLabel))

	fold = 1
	rounds = 100
	al = active_learning(fold, rounds, sourceFeatureMatrix, sourceLabel, targetFeatureMatrix, targetLabel)

	dataDir = "../../dataset/processed_acl/processedBooksElectronics/"
Пример #37
0
def count_Platform(json_data):
    """Count the records of ``json_data`` per non-empty ``platform`` attribute.

    Records whose ``platform`` is falsy are ignored. Returns a Counter mapping
    platform to number of records.
    """
    platforms = ct()
    for record in json_data:
        if record.platform:
            platforms[record.platform] += 1
    return platforms
Пример #38
0
# Labels are the last column of each input matrix.
label2 = input2[:,-1]
label = input4[:,-1]
#input3, label = shuffle(input3, label)
# Reduce every entry of input3 to its runs of two or more ASCII letters,
# joined by single spaces.
name = []
for i in input3:
    s = re.findall('(?i)[a-z]{2,}',i)
    name.append(' '.join(s))

# Character 3- and 4-gram features (within word boundaries) of the names.
# NOTE(review): token_pattern is presumably ignored for the 'char_wb'
# analyzer -- confirm against the vectorizer documentation.
vc = CV(analyzer='char_wb', ngram_range=(3,4), min_df=1, token_pattern='[a-z]{2,}')
#vc = TV(analyzer='char_wb', ngram_range=(3,4), min_df=1, token_pattern='[a-z]{2,}')
fn = vc.fit_transform(name).toarray()
# Columns 0-3 and 5-7 of input4.
fd = input4[:,[0,1,2,3,5,6,7]]
#n_class = len(np.unique(label))
#print n_class
#print np.unique(label)
print ct(label)
#kmer = vc.get_feature_names()
#idf = zip(kmer, vc._tfidf.idf_)
#idf = sorted(idf, key=lambda x: x[-1], reverse=True)
#print idf[:20]
#print idf[-20:]
#print vc.get_feature_names()

fold = 2
# NOTE(review): StratifiedKFold(labels, n_folds=...) is the signature of the
# old sklearn.cross_validation module -- confirm the installed version.
kf = StratifiedKFold(label, n_folds=fold, shuffle=True)
#kf = KFold(len(label), n_folds=fold, shuffle=True)
'''
folds = [[] for i in range(fold)]
i = 0
for train, test in kf:
    folds[i] = test