Example #1
def zero_one_normal(tar=None, out=None, ref=None):
    '''
    tar: the target file for zero-one normalization
    out: the output file after zero-one normalization
    ref: the reference file. If no reference file is given,
         zero-one normalization is done based on the target
         file itself.
    '''

    # Treat both a real None and the string 'None' (e.g. as passed in
    # from a command line) as "no reference file".
    if ref is None or ref == 'None':
        tar_data = PCLfile(tar, skip_col=0)
        tar_data.zero_one_normalization()
        tar_data.write_pcl(out)
    else:
        ref_data = PCLfile(ref, skip_col=0)
        tar_data = PCLfile(tar, skip_col=0)
        # Normalize each target row with the min/max of the matching row
        # in the reference file (assumes every row has a nonzero range).
        for i in xrange(ref_data.data_matrix.shape[0]):
            row_minimum = ref_data.data_matrix[i, :].min()
            row_maximum = ref_data.data_matrix[i, :].max()
            row_range = row_maximum - row_minimum
            tar_data.data_matrix[i, :] = \
                (tar_data.data_matrix[i, :] - row_minimum) / row_range
            # Bound the values to be between 0 and 1
            tar_data.data_matrix[i, :] = \
                numpy.clip(tar_data.data_matrix[i, :], 0, 1)
        tar_data.write_pcl(out)
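
A minimal usage sketch (the .pcl file names below are hypothetical, and the project's PCLfile helper is assumed to be importable):

# Hypothetical file names, for illustration only.
# Normalize a file against its own per-row min/max:
zero_one_normal(tar='train.pcl', out='train_zeroone.pcl')
# Normalize a test file using the row ranges of the training file,
# so both files end up on the same scale:
zero_one_normal(tar='test.pcl', out='test_zeroone.pcl', ref='train.pcl')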
Example #2
def find_enriched_node(geneList_folder=None, data_file=None, gold_std=None, out_file=None):
    '''
    geneList_folder: the folder that stores the high-weight gene file of each node
    data_file: the microarray data file; its gene ids define the background gene set
    gold_std: the gold standard file containing a list of genes
    out_file: the output file that stores each node and its corresponding q value
    '''

    datasets = PCLfile(data_file, skip_col=0)
    gene_id = datasets.id_list

    gold_fh = open(gold_std, 'r')
    gold_set = []
    for line in gold_fh:
        gene = line.strip().split('\t')[0]
        gold_set.append(gene)
    gold_fh.close()

    p_all_node = []
    geneList_files = glob.glob(geneList_folder + '/Node*.txt')  # all high-weight gene files under geneList_folder
    for i in xrange(len(geneList_files)):
        gene_fh = open(geneList_folder + '/Node' + str(i + 1) + '.txt', 'r')
        gene_fh.next()  # skip the header line
        geneset = []  # geneset stores the high-weight genes of one node
        for line in gene_fh:
            gene = line.strip().split('\t')[0]
            geneset.append(gene)
        gene_fh.close()
        # Build the contingency table
        all_overlap_genes = set(gold_set).intersection(set(gene_id))
        selected_overlap_genes = set(gold_set).intersection(set(geneset))
        a = len(selected_overlap_genes)
        b = len(all_overlap_genes) - len(selected_overlap_genes)
        c = len(geneset) - len(selected_overlap_genes)
        d = len(gene_id) - a - b - c
        table = [[a, b], [c, d]]
        # Calculate the p-value using Fisher's exact test
        oddsratio, pvalue = stats.fisher_exact(table)
        p_all_node.append(pvalue)

    # Multiple-hypothesis correction (Benjamini-Hochberg FDR)
    result_adj_pvalue = multipletests(p_all_node, alpha=0.05, method='fdr_bh')[1]
    all_node = ['Node' + str(x + 1) for x in xrange(len(p_all_node))]  # one label per node file
    # Find the node with the lowest q value and write the output
    qvalue_small = 1
    node_small = None
    out_fh = open(out_file, 'w')
    for node, qvalue in zip(all_node, result_adj_pvalue):
        out_fh.write(node + '\t' + str(qvalue) + '\n')
        if qvalue < qvalue_small:
            qvalue_small = qvalue
            node_small = node
    out_fh.close()
    print node_small + ' is most significantly associated with this gene set with a q value of ' + str(qvalue_small)
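
To make the 2x2 table concrete, here is a toy run of the same test, using the scipy and statsmodels calls from above (the counts are made up for illustration):

from scipy import stats
from statsmodels.stats.multitest import multipletests

# Toy background of 5000 genes; the node's high-weight set has 100 genes,
# 20 of which are gold-standard genes; 60 gold-standard genes overlap the
# background in total.
a = 20                  # gold-standard genes among the high-weight genes
b = 60 - 20             # gold-standard genes outside the high-weight set
c = 100 - 20            # high-weight genes not in the gold standard
d = 5000 - a - b - c    # everything else
oddsratio, pvalue = stats.fisher_exact([[a, b], [c, d]])
# Benjamini-Hochberg correction over several such p-values:
qvalues = multipletests([pvalue, 0.2, 0.01], alpha=0.05, method='fdr_bh')[1]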
Example #3
def read_weight_matrix(data_file, network_file):
    '''
    This function reads the first-layer weight matrix from a network
    structure file and the corresponding gene ids from the data file.
    '''
    datasets = PCLfile(data_file, skip_col=0)
    gene_id = datasets.id_list
    network_fh = open(network_file, 'r')
    input_size = len(gene_id)
    network_fh.next()  # skip the layer count line
    network_fh.next()  # skip 'weight matrix' line
    W = []
    input_count = 0
    for line in network_fh:
        line = line.strip().split('\t')
        W.append(line)
        input_count += 1
        if input_count == input_size:  # stop after one row per input gene
            break
    network_fh.close()
    W = numpy.array(W, dtype=float)  # rows: genes; columns: first-layer nodes
    return gene_id, W
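
A short call sketch (the file paths are hypothetical; the network file is the one written by train_SdA in Example #4):

# Hypothetical paths, for illustration only.
gene_id, W = read_weight_matrix('compendium.pcl', 'compendium_SdA_net.txt')
# W has one row per gene and one column per first-layer node, so the
# weights feeding node 1 can be paired with their gene ids:
node1_weights = dict(zip(gene_id, W[:, 0]))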
Example #4
def train_SdA(training_epochs=15,
              train_lr=0.001,
              data_file=None,
              skip_col=2,
              batch_size=1,
              random_seed_1=89677,
              random_seed_2=123,
              net_structure=[1000, 1000, 1000],
              corruption_levels=[.1, .2, .3],
              output_file=None,
              net_file=None):

    logging.basicConfig(filename=output_file.replace('activity_SdA.txt',
                                                     'SdA.log'),
                        level=logging.INFO)
    logging.info('Training the dataset:' + data_file)
    logging.info('The structure of the network:' + str(net_structure))
    logging.info('Training epochs:' + str(training_epochs) + '\t' +
                 'Batch size:' + str(batch_size) + '\t' + 'Learning rate:' +
                 str(train_lr) + '\t' + 'Corruption levels:' +
                 str(corruption_levels) + '\n' + 'Random seed for training:' +
                 str(random_seed_1) + '\t' +
                 'Random seed for permuting sample order:' +
                 str(random_seed_2))

    datasets = PCLfile(data_file, skip_col)
    train_set_x, sample_id = datasets.get_permuted_sample(
        seed=random_seed_2)  # permute the sample order using random_seed_2
    print '... finished reading the data'

    train_set_x = theano.shared(train_set_x, borrow=True)

    # compute the number of minibatches for training (integer division;
    # leftover samples beyond a full batch are not used)
    train_size = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches = train_size / batch_size

    # numpy random generator
    numpy_rng = numpy.random.RandomState(random_seed_1)

    # the number of input nodes
    input_node = len(datasets.id_list)

    print '... building the model'
    # construct the stacked denoising autoencoder class
    sda = SdA(numpy_rng=numpy_rng,
              n_ins=input_node,
              hidden_layers_sizes=net_structure)

    ######################
    # TRAINING THE MODEL #
    ######################
    print '... getting the training functions'
    training_fns = sda.training_functions(train_set_x=train_set_x,
                                          batch_size=batch_size)

    print '... training the model'
    start_time = time.clock()
    ## Train layer-wise
    for i in xrange(sda.n_layers):
        # go through training epochs
        for epoch in xrange(training_epochs):
            # go through the training set
            c = []
            for batch_index in xrange(n_train_batches):
                c.append(training_fns[i](index=batch_index,
                                         corruption=corruption_levels[i],
                                         lr=train_lr))
            print 'Training layer %i, epoch %d, cost ' % (i, epoch),
            print numpy.mean(c)
            logging.info('Training layer %i, epoch %d, cost %f ' %
                         (i, epoch, numpy.mean(c)))

    end_time = time.clock()

    logging.info('The training code for file ' + os.path.split(__file__)[1] +
                 ' ran for %.2fm' % ((end_time - start_time) / 60.))
    print '... training finished.'

    ##############################################################
    # Return the final activity value and raw activity value
    # for each node of each input sample
    ##############################################################
    output_fh = open(output_file, 'w')
    raw_output_fh = open(output_file.replace('activity', 'rawActivity'), 'w')
    each_layer_output = sda.return_activity(train_set_x=train_set_x)
    each_layer_raw_output = sda.return_raw_activity(train_set_x=train_set_x)
    for i in xrange(sda.n_layers):
        output_fh.write('layer %i \n' % (i + 1))
        raw_output_fh.write('layer %i \n' % (i + 1))
        for train_sample in xrange(train_size):
            node_activation = each_layer_output[i](train_sample)
            node_raw_activation = each_layer_raw_output[i](train_sample)
            output_fh.write(sample_id[train_sample] + '\t')
            raw_output_fh.write(sample_id[train_sample] + '\t')
            numpy.savetxt(output_fh,
                          node_activation,
                          fmt='%.8f',
                          delimiter='\t')
            numpy.savetxt(raw_output_fh,
                          node_raw_activation,
                          fmt='%.8f',
                          delimiter='\t')
    output_fh.close()
    raw_output_fh.close()

    ##############################################################
    # Return weight matrix and bias vectors of the final network #
    ##############################################################
    net_fh = open(net_file, 'w')
    weight_output, bias_output, bias_prime_output = sda.return_network()
    for i in xrange(len(weight_output)):
        net_fh.write('layer %i \n' % (i + 1))
        net_fh.write('weight matrix \n')
        numpy.savetxt(net_fh, weight_output[i], fmt='%.8f', delimiter='\t')
        net_fh.write('hidden bias vector \n')
        numpy.savetxt(net_fh, bias_output[i], fmt='%.8f', delimiter='\t')
        net_fh.write('visible bias vector \n')
        numpy.savetxt(net_fh,
                      bias_prime_output[i],
                      fmt='%.8f',
                      delimiter='\t')
    net_fh.close()
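
A call sketch under stated assumptions (the file names are hypothetical; SdA, PCLfile, theano, and numpy must be importable; output_file has to contain 'activity_SdA.txt' so the log file name can be derived from it):

# Hypothetical file names; trains the default 3-layer network.
train_SdA(training_epochs=15,
          train_lr=0.001,
          data_file='compendium.pcl',
          skip_col=2,
          batch_size=10,
          net_structure=[1000, 1000, 1000],
          corruption_levels=[.1, .2, .3],
          output_file='compendium_activity_SdA.txt',
          net_file='compendium_SdA_net.txt')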
Example #5
def SdA_test(data_file, skip_col, network_file, net_structure):

    activity_file = data_file.replace(
        '.pcl', '_activity') + '_with_' + network_file.split('/')[-1]
    raw_activity_file = data_file.replace(
        '.pcl', '_rawActivity') + '_with_' + network_file.split('/')[-1]
    datasets = PCLfile(data_file, skip_col)

    input_data = datasets.get_sample()
    input_data = numpy.matrix(input_data)
    sample_id = datasets.sample_list

    network_fh = open(network_file, 'r')
    input_size = input_data.shape[1]  # input_size is the number of genes
    layer_para = []
    for each_layer in xrange(len(net_structure)):
        network_fh.next()  # skip the layer count line
        network_fh.next()  # skip the 'weight matrix' line

        # Get the weight matrix
        W = []
        input_count = 0
        for line in network_fh:
            line = line.strip().split('\t')
            W.append(line)
            input_count += 1
            if input_count == input_size:  # stop after one row per input
                break
        W = numpy.matrix(W, dtype=float)
        network_fh.next()  # skip the 'hidden bias vector' line

        # Get the bias vector of the hidden layer
        h_bias = []
        output_count = 0
        for line in network_fh:
            line = line.strip().split('\t')
            h_bias.append(line)
            output_count += 1
            if output_count == int(net_structure[each_layer]):
                # stop after one row per hidden node
                break
        h_bias = numpy.matrix(h_bias, dtype=float)
        network_fh.next()  # skip the 'visible bias vector' line

        # Get the bias vector of the visible output layer (read but unused:
        # the weight matrix and hidden bias vector are enough for
        # calculating activities)
        v_bias = []
        input_count = 0
        for line in network_fh:
            v_bias.append(line.strip().split('\t'))
            input_count += 1
            if input_count == input_size:
                break
        v_bias = numpy.matrix(v_bias, dtype=float)

        layer_para.append((W, h_bias))
        # the output of this layer is the input of the next one
        input_size = int(net_structure[each_layer])
    network_fh.close()

    activity_fh = open(activity_file, 'w')
    raw_activity_fh = open(raw_activity_file, 'w')
    for each_layer in xrange(len(net_structure)):
        activity_fh.write('layer %i \n' % (each_layer + 1))
        raw_activity_fh.write('layer %i \n' % (each_layer + 1))
        # raw activity: the linear combination before the sigmoid transformation
        temp = input_data * layer_para[each_layer][0]
        # activity after the sigmoid transformation ('logit' is the
        # project's sigmoid helper, despite the name)
        output = logit(temp + layer_para[each_layer][1].T)
        for each_sample in xrange(output.shape[0]):
            activity_fh.write(sample_id[each_sample] + '\t')
            raw_activity_fh.write(sample_id[each_sample] + '\t')
            numpy.savetxt(activity_fh,
                          output[each_sample, :],
                          fmt='%.8f',
                          delimiter='\t')
            numpy.savetxt(raw_activity_fh,
                          temp[each_sample, :],
                          fmt='%.8f',
                          delimiter='\t')
        input_data = output
    activity_fh.close()
    raw_activity_fh.close()
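
A usage sketch with hypothetical paths (net_structure must match the network file written during training):

# Encode a new dataset with a previously trained network.
SdA_test(data_file='new_samples.pcl',
         skip_col=0,
         network_file='compendium_SdA_net.txt',
         net_structure=[1000, 1000, 1000])
# This writes new_samples_activity_with_compendium_SdA_net.txt and
# new_samples_rawActivity_with_compendium_SdA_net.txt.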