Example #1
import numpy as np

def fold10_cv_partition(positive_file, negative_file):
    # generate the vectorized xs and ys
    x_dataset, y_dataset = vectorize_data(positive_file, negative_file)
    print("data vectorization in function fold10_cv_partition finished!")
    # split the data into 10 folds of (roughly) equal size
    max_len = len(x_dataset)
    m = int(max_len / 10)
    # define lists storing the different partitions
    x_train_segment = []
    y_train_segment = []
    x_validation_segment = []
    y_validation_segment = []

    # Partition the data set for 10-fold cross-validation
    # generate the first nine folds
    for i in range(9):
        x_validation_segment.append(x_dataset[m * i:m * (i + 1)])
        y_validation_segment.append(y_dataset[m * i:m * (i + 1)])
        x_train_segment.append(
            np.concatenate(
                [x_dataset[0:m * i], x_dataset[m * (i + 1):max_len]]))
        y_train_segment.append(
            np.concatenate(
                [y_dataset[0:m * i], y_dataset[m * (i + 1):max_len]]))
    # generate the tenth (final) fold, which absorbs any remainder rows
    x_validation_segment.append(x_dataset[m * 9:max_len])
    y_validation_segment.append(y_dataset[m * 9:max_len])
    x_train_segment.append(x_dataset[0:m * 9])
    y_train_segment.append(y_dataset[0:m * 9])
    print("Partition of fold 10 finished!")
    return x_train_segment, y_train_segment, x_validation_segment, y_validation_segment
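
For illustration, a minimal sketch of how the ten partitions returned above could be consumed in a cross-validation loop; build_and_evaluate is a hypothetical stand-in for whatever training/evaluation routine the surrounding project actually uses.

def run_10fold_cv(positive_file, negative_file):
    x_trains, y_trains, x_vals, y_vals = fold10_cv_partition(
        positive_file, negative_file)
    scores = []
    for i in range(10):
        # build_and_evaluate is a hypothetical helper: train a model on
        # fold i's training split and return its validation accuracy
        scores.append(build_and_evaluate(x_trains[i], y_trains[i],
                                         x_vals[i], y_vals[i]))
    # average validation accuracy over all ten folds
    return sum(scores) / len(scores)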
Example #2
def train_test_partition(positive_file, negative_file):
    # generate the vectorized xs and ys
    x_dataset, y_dataset = vectorize_data(positive_file, negative_file)
    print("data vectorization in function train_test_partition finished!")
    print(len(x_dataset))  # total number of samples
    # split into test and training sets: the first 752 samples form the
    # fixed test set, the remainder form the training set
    x_test_dataset = x_dataset[0:752]
    y_test_dataset = y_dataset[0:752]
    x_train_dataset = x_dataset[752:]
    y_train_dataset = y_dataset[752:]
    print("train_test_partition finished!")
    return x_train_dataset, y_train_dataset, x_test_dataset, y_test_dataset
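
A minimal usage sketch, assuming the positive/negative file names below (placeholders, not from the original code); the np.array conversion mirrors what the training script further down does with the returned lists.

import numpy as np

X_train, y_train, X_test, y_test = train_test_partition(
    "positive.csv", "negative.csv")  # hypothetical file names
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)
print(X_test.shape)  # the test set holds the first 752 samples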
# CONV_DEEP = 128       # number of filters in the first convolution (convolution depth)

STRIDES = [1, 1, 1, 1]  # stride in each of the four dimensions during convolution
KSIZE = [1, 164, 1, 1]  # pooling window size

FC_SIZE = 128           # number of nodes in the fully-connected layer
NUM_CLASSES = 2         # number of output classes

DROPOUT_KEEP_PROB = 0.5 # keep probability for dropout
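
To show how these constants could plug into a TensorFlow 1.x graph, here is a rough sketch of the pooling and fully-connected stage; the conv-output shape (164 positions, 128 channels, matching the commented-out CONV_DEEP) is an assumption, not taken from the original code.

import tensorflow as tf

# hypothetical conv output: batch x 164 positions x 1 x 128 channels
conv_out = tf.placeholder(tf.float32, [None, 164, 1, 128])

# KSIZE = [1, 164, 1, 1] pools over the entire sequence axis at once,
# i.e. a global max-pool; STRIDES = [1, 1, 1, 1] moves one step per dim
pooled = tf.nn.max_pool(conv_out, ksize=KSIZE, strides=STRIDES,
                        padding="VALID")  # shape: [None, 1, 1, 128]

flat = tf.reshape(pooled, [-1, 128])      # flatten for the FC layer
fc_w = tf.Variable(tf.truncated_normal([128, FC_SIZE], stddev=0.1))
fc_b = tf.Variable(tf.zeros([FC_SIZE]))
fc = tf.nn.relu(tf.matmul(flat, fc_w) + fc_b)
fc_drop = tf.nn.dropout(fc, keep_prob=DROPOUT_KEEP_PROB)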

import numpy as np
import tensorflow as tf

import dataRead
import dataVectorization
import dataPartition

FILE_PATH = "../../data/miRBase_set.csv"
FILE_PATH_PUTATIVE = "../../data/putative_mirtrons_set.csv"
all_data_array = dataRead.read_data(FILE_PATH, FILE_PATH_PUTATIVE)
vectorized_dataset = dataVectorization.vectorize_data(all_data_array)
X_train, y_train, X_test, y_test = dataPartition.data_partition(vectorized_dataset)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
print(y_train.shape)
print(X_train.shape)
print(y_test.shape)
print(X_test.shape)
print("dataset vectorization finished!")
print("iteration",TRAINING_ITER)
dataset_size = len(X_train)  # number of training examples

input_X = tf.placeholder(tf.float32, [None, SEQUENCE_LENGTH, EMBEDDING_SIZE, 1])
input_y = tf.placeholder(tf.float32, [None, NUM_CLASSES])
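
A minimal sketch of how these placeholders could be fed during training; train_step and BATCH_SIZE are hypothetical stand-ins for the optimizer op and batch size defined in the rest of the script (not shown), and TRAINING_ITER is the iteration count printed above.

BATCH_SIZE = 64  # assumed value, not from the original code
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(TRAINING_ITER):
        # cycle through the training set one mini-batch per step
        start = (step * BATCH_SIZE) % dataset_size
        end = min(start + BATCH_SIZE, dataset_size)
        sess.run(train_step,  # train_step: hypothetical optimizer op
                 feed_dict={input_X: X_train[start:end],
                            input_y: y_train[start:end]})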