示例#1
0
def train(protein,clf,dim_red=None,encoding=False,single_frame=False):
    '''
    Cross-validate a classifier on per-frame features against saved labels.

    Features are either the flattened trajectory coordinates or the output
    of encode(); labels are unpickled from the label file of the protein.

    Relies on names defined at module level elsewhere in this file:
    dr, np, PCA, pickle, cross_val_score, encode, suffix, fold,
    seq_size, window_size and sliding.

    Args:
        - protein: "bpti" or "alanine", name of the protein
        - clf: classifier to use
        - dim_red: int or None.
                    if not None, it's the reduced dimension for PCA
        - encoding: if False, classify on coordinates;
                    if True, classify on encoded hidden states
        - single_frame: classify as single frames or sequences.

    Returns:
        The array of per-fold scores produced by cross_val_score
        (the mean and standard deviation are also printed).
    '''

    # get features
    if not encoding:
        # get flattened coordinates of each frame: one row per frame
        traj = dr.load_traj(protein)
        coords = traj.xyz
        X = np.reshape(coords,(len(coords),-1))
    else:
        # get encoded hidden states
        X = encode()

    # use PCA to reduce dimension
    if dim_red is not None:
        pca = PCA(dim_red)
        X = pca.fit(X).transform(X)

    # get labels
    # Pickle data is binary: the file must be opened with "rb", otherwise
    # pickle.load fails on Python 3 (text mode yields str, not bytes).
    # NOTE(review): pickle.load executes arbitrary code from the file;
    # only load label files produced by this project.
    with open("/output/"+protein+"/labels"+suffix,"rb") as lb:
        Y = np.asarray(pickle.load(lb))
    if encoding and single_frame:
        # need to match the number of sequences
        # NOTE(review): assumes seq_size, window_size and sliding are
        # positive ints, so `end` is negative (drops trailing frames)
        # unless seq_size*window_size == 1 -- confirm against encode().
        end = -seq_size*window_size+1
        if end == 0:
            # a stop index of 0 would give an empty slice; keep all labels
            end = Y.size
        Y = Y[0:end:sliding*window_size] # compare with the first frame

    # training and cross validation
    data_scores = cross_val_score(clf,X,Y,cv=int(fold))
    print("Accuracy with %s folds: %0.2f (+/- %0.2f)" % 
        (fold, data_scores.mean(), data_scores.std()))
    return data_scores
示例#2
0
    centers = np.array([[55,48],[-77,138],[-77, -39],[60, -72]])*np.pi/180.0
    clu = kmeans(n_clusters=4,init=centers)
#    labels = clu.fit_predict(dihedrals)
    labels = clu.fit_predict(np.transpose(dihedrals_shift))
    labels = 
    print("centers:")
    print(clu.cluster_centers_*180.0/np.pi)
    return labels

if __name__ == "__main__":

    task = int(sys.argv[1])

    if task==1:
        # do clustering and save the label file
        traj = dr.load_traj(protein)
#        labels = k_means(traj,num_states,dim_red=10)
        labels = hierarchy(traj,num_states)
        with open("/output/"+protein+"/labels"+suffix,"w") as lb:
            pickle.dump(labels,lb)

    elif task==2:
        # read the label file and get the indices of one cluster
#        with open("/output/"+protein+"/labels"+suffix,"r") as lb:
        with open("/protein/data/"+protein+"-labels"+suffix) as lb:
            labels = pickle.load(lb)
#        for l in labels:
#            print(l)    

    elif task==3:
        # cluster the sequences