Пример #1
0
def GTest_CI(X, Y, Z):
    """Run a G-test of conditional independence of X and Y given Z.

    When ``Z`` is empty the unconditional test ``GTest_I(X, Y)`` is returned
    instead. Otherwise the statistic ``2 * N * cmi(X, Y, Z)`` (``N = X.size``)
    is compared against a chi-square distribution with
    ``(|X| - 1) * (|Y| - 1) * |Z|`` degrees of freedom, where ``|V|`` is the
    number of distinct values ``np.unique`` finds in ``V``.

    Args:
        X, Y: sample arrays of the two tested variables.
        Z: sample array of the conditioning variable (may be empty).

    Returns:
        1 if independence is not rejected at the 0.05 level, 0 if it is
        (or whatever ``GTest_I`` returns when ``Z`` is empty).
    """
    if Z.size == 0:
        return GTest_I(X, Y)

    sig_level_indep = 0.05

    # NOTE(review): np.unique flattens its input, so a multi-column Z is
    # counted by distinct values rather than distinct rows -- confirm callers
    # pass an already-joined conditioning vector.
    hm_x = len(np.unique(X))
    hm_y = len(np.unique(Y))
    hm_z = len(np.unique(Z))
    hm_samples = X.size

    # NOTE(review): the chi-square approximation presumes cmi() is measured
    # in nats -- verify against the cmi implementation.
    g = 2 * hm_samples * cmi(X, Y, Z)
    dof = (hm_x - 1) * (hm_y - 1) * hm_z

    # Survival function instead of 1 - cdf: avoids cancellation when the
    # p-value is tiny.
    p_val = chi2.sf(g, dof)

    # A small p-value rejects the null hypothesis of independence.
    return 0 if p_val < sig_level_indep else 1
Пример #2
0
def Large_Scale_IPCMB(data, targets, threshold, subsize=100):
    """Run tian_IPCMB on column chunks of ``data`` and merge the selections.

    The columns of ``data`` are split into consecutive chunks of at most
    ``subsize`` columns. ``tian_IPCMB`` is run on each chunk and the first
    element of its result is taken as chunk-local column indices, which are
    shifted back to global indices and pooled. Every column not selected is
    then added if its ``cmi`` with ``targets``, conditioned on the joint of
    the pooled columns, exceeds ``threshold``.

    Args:
        data: 2-D training array without the label column.
        targets: label vector passed through to ``tian_IPCMB`` and ``cmi``.
        threshold: dependence threshold used by both stages.
        subsize: number of columns per chunk (must be positive).

    Returns:
        numpy array of the selected column indices: the pooled ones in
        ascending order, followed by the ones added in the second stage.
    """
    numfeat = data.shape[1]

    selected = set()
    # Slicing clips at the array edge, so the last (short) chunk needs no
    # special case, and no empty trailing chunk is produced when numfeat is
    # a multiple of subsize.
    for start in range(0, numfeat, subsize):
        sub_D = data[:, start:start + subsize]
        results = tian_IPCMB(sub_D, targets, threshold)
        selected.update(results[0] + start)

    Feat = sorted(selected)
    # The conditioning vector is built once from the pooled columns; it is
    # not refreshed as further columns are appended below.
    cmbVector = joint(data[:, Feat])
    for i in np.setdiff1d(np.arange(numfeat), Feat):
        if cmi(data[:, i], targets, cmbVector) > threshold:
            Feat.append(i)

    return np.array(Feat)
Пример #3
0
def sort_by_cmi(feat_indices, targets, cond_indices, data):
    """Order column indices by decreasing (conditional) mutual information.

    Each index ``f`` in ``feat_indices`` is scored with
    ``mi(data[:, f], targets)`` when ``cond_indices`` is empty, otherwise
    with ``cmi(data[:, f], targets, joint(data[:, cond_indices]))``.

    Args:
        feat_indices: re-iterable collection of column indices of ``data``.
        targets: target sample vector handed to ``mi`` / ``cmi``.
        cond_indices: numpy array of conditioning column indices (may be
            empty).
        data: 2-D sample array indexed as ``data[:, column]``.

    Returns:
        numpy array holding ``feat_indices`` sorted from highest to lowest
        score; ties keep their input order.
    """
    if cond_indices.size == 0:
        feats_to_cmi = {f: mi(data[:, f], targets) for f in feat_indices}
    elif len(feat_indices) > 0:
        # The conditioning vector does not depend on the feature, so it is
        # built once instead of once per feature.
        cond_joint = joint(data[:, cond_indices])
        feats_to_cmi = {
            f: cmi(data[:, f], targets, cond_joint) for f in feat_indices
        }
    else:
        feats_to_cmi = {}

    return np.array(sorted(feat_indices, key=lambda f: -feats_to_cmi[f]))
def RecognizePC(targets, ADJt, data, THRESHOLD, NumTest):
    """Prune a candidate set of columns with (conditional) MI tests.

    The search runs in rounds of increasing conditioning-set size
    ``cutSetSize`` for as long as more candidates remain than
    ``cutSetSize``:

    * round 0 rejects every candidate ``X`` whose
      ``mi(data[:, X], targets, 0)`` is ``<= THRESHOLD``;
    * later rounds reject ``X`` as soon as some set ``S`` of ``cutSetSize``
      other remaining candidates gives
      ``cmi(data[:, X], targets, joint(data[:, S]), 0) <= THRESHOLD``;
      ``S`` is merged into ``Sepset[X]``.

    Rejected candidates are removed from ``ADJt`` only at the end of a
    round. A test is skipped when ``checkDataSize`` returns 1, and the
    search stops when a round rejects nothing and the last
    ``checkDataSize`` result was 1.

    Args:
        targets: target sample vector.
        ADJt: 1-D numpy array of candidate column indices of ``data``.
        data: 2-D sample array.
        THRESHOLD: dependence threshold.
        NumTest: running count of tests performed so far.

    Returns:
        list ``[ADJ, Sepset, NumTest, cutSetSize, MIs]``: the surviving
        candidates, one separator collection per column of ``data`` (a list
        until a separator is found, then a set), the updated test counter,
        the final conditioning-set size, and every score computed (each
        wrapped in a one-element list).
    """
    MIs = []
    cutSetSize = 0
    data_check = 1
    # One independent list per column; `[[]] * n` would hand the caller n
    # references to the same list.
    Sepset = [[] for _ in range(data.shape[1])]
    datasizeFlag = 0

    while ADJt.size > cutSetSize:
        NonPC = set()  # candidates rejected during this round
        for X in ADJt:
            if cutSetSize == 0:
                NumTest = NumTest + 1
                TEMP = mi(data[:, X], targets, 0)
                MIs.append([TEMP])
                if TEMP <= THRESHOLD:
                    NonPC.add(X)
            elif cutSetSize == 1:
                Diffx = np.setdiff1d(ADJt, X)
                for combo in combinations(Diffx, cutSetSize):
                    S = np.array(list(combo))
                    cmbVector = joint(data[:, S])
                    if data_check:
                        datasizeFlag = checkDataSize(data[:, X], targets,
                                                     cmbVector)
                    if datasizeFlag == 1:
                        # Stop trying conditioning sets for this X.
                        break
                    NumTest = NumTest + 1
                    TEMP = cmi(data[:, X], targets, cmbVector, 0)
                    MIs.append([TEMP])
                    if TEMP <= THRESHOLD:
                        NonPC.add(X)
                        Sepset[X] = set(Sepset[X]).union(set(S))
                        break
            else:
                # Conditioning sets are formed as a (cutSetSize - 1)-subset
                # plus one extra member, so the same set can be evaluated
                # more than once; this is kept because NumTest and MIs
                # record every evaluation.
                Diffx = np.setdiff1d(ADJt, X)
                separated = False
                for combo in combinations(Diffx, cutSetSize - 1):
                    S = np.array(list(combo))
                    RestSet = np.setdiff1d(Diffx, S)
                    for extra in RestSet:
                        col = set(S).union(set([extra]))
                        cmbVector = joint(data[:, np.array(list(col))])
                        if data_check:
                            datasizeFlag = checkDataSize(
                                data[:, X], targets, cmbVector)
                        if datasizeFlag == 1:
                            # Only abandons the extras of this subset.
                            break
                        NumTest = NumTest + 1
                        TEMP = cmi(data[:, X], targets, cmbVector, 0)
                        MIs.append([TEMP])
                        if TEMP <= THRESHOLD:
                            NonPC.add(X)
                            Sepset[X] = set(Sepset[X]).union(
                                set(S), set([extra]))
                            separated = True
                            break
                    if separated:
                        break

        if NonPC:
            ADJt = np.setdiff1d(ADJt, np.array(list(NonPC)))
            cutSetSize = cutSetSize + 1
        elif datasizeFlag == 1:
            break
        else:
            cutSetSize = cutSetSize + 1

    return [ADJt, Sepset, NumTest, cutSetSize, MIs]
Пример #5
0
def CMI_adaptive_pure_soft(X, Y, cond_set, hm_HypoTest):
    """Estimate I(X;Y|cond_set) as a weighted sum of per-stratum MI values.

    The samples are split into strata by the values of the conditioning
    columns. ``MI_adaptive_soft`` is evaluated on each stratum and the
    results are summed, each weighted by the stratum's share of the samples.
    A stratum whose MI comes back as ``np.inf`` contributes 0. If every
    stratum came back as ``np.inf`` the overall result is ``np.inf``.

    Args:
        X, Y: 1-D sample arrays.
        cond_set: conditioning samples, 1-D or ``(n_samples, n_vars)``.
        hm_HypoTest: running counter threaded through the helper calls.

    Returns:
        ``(cond_mi, hm_HypoTest)``. The two early exits (empty conditioning
        set, or ``cmi(X, Y, cond_set) == 0``) return the same pair as a
        list.
    """
    cond_mi = 0
    if len(cond_set.shape) == 1:
        cond_set = cond_set.reshape((cond_set.size, 1))

    if cond_set.size == 0:
        results = MI_adaptive_soft(X, Y, hm_HypoTest)
        # NOTE(review): the counter is read from results[1] here but from
        # results[2] at the other MI_adaptive_soft call sites below; one of
        # the two is presumably wrong -- confirm against MI_adaptive_soft.
        return [results[0], results[1]]

    if cmi(X, Y, cond_set) == 0:
        return [cond_mi, hm_HypoTest]

    # Work on integer codes of X and Y from here on.
    X = np.unique(X, return_inverse=True)[1]
    Y = np.unique(Y, return_inverse=True)[1]

    hm_sample, hm_condvar = cond_set.shape
    entire_uniform = 1

    if hm_condvar == 1:
        column = cond_set[:, 0]
        for pattern in np.unique(cond_set):
            sub_cond_idx = np.flatnonzero(column == pattern)
            p_cond = len(sub_cond_idx) / hm_sample

            results = MI_adaptive_soft(X[sub_cond_idx], Y[sub_cond_idx],
                                       hm_HypoTest)
            temp_mi = results[0]
            hm_HypoTest = results[2]

            if temp_mi == np.inf:
                temp_mi = 0
            else:
                entire_uniform = 0
            cond_mi = cond_mi + p_cond * temp_mi
    else:
        # First conditioning column versus the joint of the remaining ones.
        # The original used 1-based column indices and then replaced both
        # vectors with the codes of X and Y, so the strata were not built
        # from the conditioning set at all.
        C1, var_1 = np.unique(cond_set[:, 0], return_inverse=True)
        C2, var_2 = np.unique(joint(cond_set[:, 1:]), return_inverse=True)
        var_1 = np.ravel(var_1)
        var_2 = np.ravel(var_2)

        p = len(C1)
        q = len(C2)

        # Only the updated counter is used from this call.
        _, hm_HypoTest, _ = jointPDFAdapPartition(var_1, var_2, p, q,
                                                  hm_HypoTest)

        for j in range(p):
            in_j = var_1 == j
            for k in range(q):
                # Absolute row positions of the samples in cell (j, k).
                sub_cond_idx = np.flatnonzero(in_j & (var_2 == k))
                p_cond = len(sub_cond_idx) / hm_sample
                if len(sub_cond_idx) == 0:
                    temp_mi = 0
                else:
                    results = MI_adaptive_soft(X[sub_cond_idx],
                                               Y[sub_cond_idx], hm_HypoTest)
                    temp_mi = results[0]
                    hm_HypoTest = results[2]
                # As before, an empty cell (temp_mi == 0) also clears the
                # all-uniform flag.
                if temp_mi == np.inf:
                    temp_mi = 0
                else:
                    entire_uniform = 0

                cond_mi = cond_mi + p_cond * temp_mi

    if entire_uniform:
        cond_mi = np.inf
    return cond_mi, hm_HypoTest
Пример #6
0
def tian_STMB_new(train_data, targets, threshold=0.02):
    """Select a Markov-blanket-style feature set for ``targets``.

    Steps, as implemented here:

    1. ``RecognizePC`` prunes all columns down to a candidate set ``PCD``
       and returns separator sets for the rejected columns.
    2. For each ``y`` in ``PCD`` and each column ``x`` outside ``PCD``,
       ``x`` is tested against ``targets`` conditioned on its separator set
       plus ``y``. If the score exceeds ``threshold``, ``x`` is recorded as
       a spouse candidate of ``y`` unless a follow-up test marks ``y`` for
       removal from ``PCD``.
    3. Spouse lists are shrunk: a spouse is dropped when its score,
       conditioned on the remaining blanket candidates, is below
       ``threshold``.
    4. Members of ``PCD`` are dropped by the same kind of test.

    Tests for which ``checkDataSize`` returns 1 are skipped.

    Args:
        train_data: 2-D sample array (one column per feature).
        targets: target sample vector.
        threshold: dependence threshold for every test.

    Returns:
        list ``[MB, PCD, spouse, NumTest, Sepset_t, cutSetSize, PCsize,
        PCsize2]`` where ``MB`` is the union of the final ``PCD`` and all
        retained spouses, ``spouse`` has one entry per column, and
        ``PCsize`` is the size of ``PCD`` before step 4.
    """
    NumTest = 0
    numf = train_data.shape[1]  # feature number
    CanMB = np.arange(numf)     # candidates

    # Recognize the target's candidate set.
    Results = RecognizePC(targets, CanMB, train_data, threshold, NumTest)
    PCD = Results[0]
    Sepset_t = Results[1]
    NumTest = Results[2]
    cutSetSize = Results[3]

    # Independent containers per slot (`[[]] * n` would alias one list).
    spouse = [[] for _ in range(numf)]
    Des = [[] for _ in range(PCD.size)]

    # Find spouse candidates. PCD is not modified inside this loop, so the
    # set of columns outside it is computed once.
    searchset = np.setdiff1d(CanMB, PCD)
    for yind, y in enumerate(PCD):
        flag = 0
        for x in searchset:
            col = set(Sepset_t[x]).union(set([y]))
            cmbVector = joint(train_data[:, np.array(list(col))])
            datasizeFlag = checkDataSize(train_data[:, x], targets, cmbVector)
            if datasizeFlag != 1:
                NumTest = NumTest + 1
                T = cmi(train_data[:, x], targets, cmbVector, 0)
                if T > threshold:  # v structure
                    for s in np.setdiff1d(np.union1d(PCD, [x]),
                                          np.array([y])):
                        T = cmi(train_data[:, y], targets,
                                train_data[:, s], 0)
                        if T < threshold:
                            # y becomes independent of the target given s:
                            # mark y for removal and stop processing y.
                            temp = set(Des[yind]).union(set([y]))
                            Des[yind] = np.array(list(temp))
                            flag = 1
                            break
                        else:
                            temp = set(spouse[y]).union(set([x]))
                            spouse[y] = np.array(list(temp))
            if flag == 1:
                break

    des = [item for sublist in Des for item in sublist]
    PCD = np.setdiff1d(PCD, des)

    # Shrink spouses: only members of PCD may keep a spouse list.
    for i in np.setdiff1d(np.arange(numf), PCD):
        spouse[i] = []

    S = []
    for y in np.arange(len(spouse)):
        # spouse[y] may be a list or an ndarray; comparing an ndarray with
        # `!= []` yields an empty array (falsy) or raises on a shape
        # mismatch, so emptiness is tested by length instead.
        if len(spouse[y]) > 0:
            S.append(y)  # y has spouses
            spousecan = spouse[y]
            # The candidate pool is the same for every spouse of y.
            col = set([y]).union(set(spousecan), set(PCD))
            NonS = set()
            for s in spousecan:
                cmbVector = joint(
                    train_data[:, np.setdiff1d(np.array(list(col)), s)])
                datasizeFlag = checkDataSize(train_data[:, s], targets,
                                             cmbVector)
                if datasizeFlag != 1:
                    NumTest = NumTest + 1
                    T = cmi(train_data[:, s], targets, cmbVector, 0)
                    if T < threshold:
                        NonS.add(s)
            spouse[y] = np.setdiff1d(spousecan, np.array(list(NonS)))

    b = set()
    for entry in spouse:
        if len(entry) > 0:
            b = b.union(set(entry))

    # Remove false positives from PCD.
    M = PCD
    PCsize = M.size
    testSet = set(S).union(b)
    # C is never written to, so PCsize2 is the mean of an all-zero array;
    # kept so the returned list keeps its eight entries.
    C = np.zeros(shape=(PCsize, 1))
    for x in M:
        # PCD may shrink while iterating over the snapshot M.
        col = set(PCD).union(set(testSet))
        cmbVector = joint(train_data[:, np.setdiff1d(np.array(list(col)), x)])
        datasizeFlag = checkDataSize(train_data[:, x], targets, cmbVector)
        if datasizeFlag != 1:
            NumTest = NumTest + 1
            T = cmi(train_data[:, x], targets, cmbVector, 0)
            if T < threshold:
                PCD = np.setdiff1d(PCD, x)
    PCsize2 = np.mean(C)
    MB = set(PCD).union(set(b))

    return [
        np.array(list(MB)),
        PCD,
        spouse,
        NumTest,
        Sepset_t,
        cutSetSize,
        PCsize,
        PCsize2,
    ]
Пример #7
0
def tian_IPCMB(train_data, target, threshold):
    """Select a Markov-blanket-style feature set for ``target``.

    ``RecognizePC`` first yields the candidate set ``PC`` of the target.
    For every ``X`` in ``PC`` the same search is run with ``X`` as the
    target on a matrix whose column 0 is ``target`` and whose other columns
    are the remaining features. If the target (column 0) does not survive
    that search, ``X`` is removed from the blanket. Otherwise each surviving
    neighbour ``Y`` of ``X`` that is not yet in the blanket is added when
    ``cmi(train_data[:, Y], target, joint(Sepset[Y] + X), 0)`` exceeds
    ``threshold``, and ``X`` is recorded in ``children``.

    Args:
        train_data: 2-D sample array that does not include the target.
        target: label vector, 1-D or a single column.
        threshold: dependence threshold for every test.

    Returns:
        list ``[MB, PC, NumTest, children]``.
    """
    NumTest = 0
    numf = train_data.shape[1]  # does not include the target
    CanMB = np.arange(numf)
    Results = RecognizePC(target, CanMB, train_data, threshold, NumTest)

    PC = Results[0]
    Sepset_t = Results[1]
    NumTest = Results[2]

    MB = PC
    children = []
    targetindex = 0  # column of the target inside temp_trainD

    for X in PC:
        CanADJX = np.arange(numf)
        rest_idx = np.setdiff1d(np.arange(numf), X)  # numf - 1 columns
        # column_stack accepts a 1-D target as well as an (n, 1) column;
        # hstack raised on a 1-D target.
        temp_trainD = np.column_stack((target, train_data[:, rest_idx]))
        Results = RecognizePC(train_data[:, X], CanADJX, temp_trainD,
                              threshold, NumTest)
        temp_CanSP = Results[0]
        NumTest = Results[2]

        if targetindex not in temp_CanSP:
            # The target is not among X's candidates: drop X.
            MB = np.setdiff1d(MB, X)
            continue

        # Map temp_trainD columns back to train_data columns: drop the
        # target column, then undo the shift of one for columns left of X.
        CanSP = temp_CanSP[temp_CanSP != targetindex]
        CanSP[CanSP <= X] -= 1

        # Keep candidates that are in CanSP but neither in MB nor X itself.
        DiffY = np.setdiff1d(np.setdiff1d(CanSP, MB), X)

        for Y in DiffY:
            cond_cols = list(set(Sepset_t[Y]).union(set([X])))
            cmbVector = joint(train_data[:, cond_cols])
            NumTest = NumTest + 1
            if cmi(train_data[:, Y], target, cmbVector, 0) > threshold:
                children = list(set(children).union(set([X])))
                MB = list(set(MB).union(set([Y])))

    return [np.array(MB), PC, NumTest, children]