Example #1
    def __init__( self,
                  datafile,
                  k,
                  covarfile = None,
                  t = 500,
                  num_components = None,
                  stdth = 0.02,
                  out = "refactor"
                ):

        # load methylation data file
        self.meth_data = MethylationData(datafile)
        
        # validate and process all variables
        self.k =                          self._validate_k(k)
        self.t =                          self._validate_t(t)
        self.num_components =             self._validate_num_comp(num_components)
        self.ranked_output_filename =     out + self.RANKED_FILENAME
        self.components_output_filename = out + self.COMPONENTS_FILENAME

        self.run(covarfile, stdth)
Example #2
    def meth_data(self):
        return MethylationData(self.site_imputation, self.imputed_samples,
                               self.imputed_sites_names)
Example #3
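# NOTE: module-level imports are omitted from this snippet. Judging from the calls below,
# the class needs roughly: os; loadtxt, dot, linalg, sqrt and log from numpy;
# preprocessing from sklearn; matplotlib.pyplot imported as plot; plus the package's own
# MethylationData, pca and Regress helpers (their exact import paths are not shown here).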
class Refactor( object ):
    
    RANKED_FILENAME =       '.out.rankedlist.txt'
    COMPONENTS_FILENAME =   '.out.components.txt'
    VERSION = 1.0 

    def __init__( self,
                  datafile,
                  k,
                  covarfile = None,
                  t = 500,
                  num_components = None,
                  stdth = 0.02,
                  out = "refactor"
                ):

        # load methylation data file
        self.meth_data = MethylationData(datafile)
        
        # validate and process all variables
        self.k =                          self._validate_k(k)
        self.t =                          self._validate_t(t)
        self.num_components =             self._validate_num_comp(num_components)
        self.ranked_output_filename =     out + self.RANKED_FILENAME
        self.components_output_filename = out + self.COMPONENTS_FILENAME

        self.run(covarfile, stdth)


    def _validate_file_path(self, filepath):
        if not os.path.exists(filepath) :
            print("ERROR: The file '%s' doesn't exist. Exiting" % filepath)
            self._terminate_refactor()

    def _validate_k(self,k):
        if not (k >= 2 and k <= self.meth_data.samples_size):
            print("ERROR: k must be at least 2 and smaller than the number of samples. k = %s, samples = %s" % (k, self.meth_data.samples_size))
            self._terminate_refactor()

        return k

    def _validate_t(self,t):
        if t > self.meth_data.sites_size or t < self.k : 
            print("ERROR: t cannot be greater than the number of sites or smaller than k . t = %s, sites = %s, k = %s" % (t, self.meth_data.sites_size, self.k))
            self._terminate_refactor()

        return t

    def _validate_num_comp(self,num_comp):
        if num_comp and not (num_comp >= self.k and num_comp <= self.meth_data.samples_size):
            print("ERROR: the number of components must be at least k and smaller than the number of samples. num_comp = %s, samples = %s, k = %s" % (num_comp, self.meth_data.samples_size, self.k))
            self._terminate_refactor()

        return num_comp if num_comp else self.k

    def _load_and_validate_covarfile(self, filepath):
        covardata = loadtxt(filepath, dtype = str)
        ids = covardata[:,0].astype(str)
        if len(ids) != len(self.meth_data.samples_ids) or sum(self.meth_data.samples_ids[i] == ids[i] for i in range(len(ids))) < len(ids):
            print("ERROR: The order of the samples in the covariates file must be the same as the order in the data file")
            self._terminate_refactor()
        covs = covardata[:,1:].astype(float)
        return covs

    def _adjust_data(self, covs):
        data_adj = self.meth_data._copy()
        for i in range(self.meth_data.sites_size):
            data_adj.data[i,:] = Regress.regress(self.meth_data.data[i,:],covs)
        return data_adj

    def run(self, covarfile, stdth):
        print("Excluding sites with low variance (std < %s)..." % str(stdth))
        before = self.meth_data.sites_size
        self.meth_data._filter_sites_by_std(stdth)
        print("%d sites were excluded due to low variance..." % (before - self.meth_data.sites_size) )

        if covarfile is not None:
            print('Adjusting the data for covariates...')
            covs = self._load_and_validate_covarfile(covarfile)
            self.meth_data = self._adjust_data(covs)

        print('Starting ReFACTor v%s...' % self.VERSION)
        self.components, self.ranked_sites, self.standard_pca = self._refactor()
        print('ReFACTor is done!')

   
    def _write_file( self, filepath, data):   
        if  os.path.exists(filepath):
            os.remove(filepath) 
        with open(filepath, 'w') as f:
            f.write(data)

    def _refactor( self ):
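        # Overview of the steps below: (1) run a standard PCA on the transposed data,
        # (2) form a rank-k approximation of the data from the first k PCs,
        # (3) mean-center each column and scale it to unit norm in both matrices,
        # (4) rank the sites by the Euclidean distance between data and approximation,
        # (5) run PCA on the t best-ranked sites; its PCs are the ReFACTor components.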
        print('Running a standard PCA...')
        pca_out1 = pca.PCA(self.meth_data.data.transpose()) 

        print('Computing a low rank approximation of the input data and ranking sites...')
        x = self._low_rank_approximation(pca_out1.P, pca_out1.U, self.k)
        
        An = preprocessing.StandardScaler( with_mean = True, with_std = False ).fit(self.meth_data.data.transpose()).transform(self.meth_data.data.transpose())
        Bn = preprocessing.StandardScaler( with_mean = True, with_std = False ).fit(x).transform(x)
        # ** Python 2/3 division note: sqrt returns a float on both Python 2.7 and 3.5, and division by a float behaves the same on both versions
        An = An * ( 1 / sqrt((An**2).sum(axis=0)) ) 
        Bn = Bn * ( 1 / sqrt((Bn**2).sum(axis=0)) )

        distances = self._euclidean_distance(An, Bn)
        ranked_list = distances.argsort()

        print('Computing the ReFACTor components...')
        sites = ranked_list[0:self.t]

        pca_out2 = pca.PCA(self.meth_data.data[sites,:].transpose())
        score = pca_out2.P

        print('Saving a ranked list of the data features...')
        #data = '\n'.join(['%s\t%s'% (index+1, self.meth_data.cpgnames[index]) for index in ranked_list])
        data = '\n'.join(['%s'% self.meth_data.cpgnames[index] for index in ranked_list])
        self._write_file(self.ranked_output_filename, data)

        print('Saving the ReFACTor components...')
        data = '\n'.join(['\t'.join([str(i) for i in line]) for line in score[:,0:self.num_components]])
        self._write_file(self.components_output_filename, data)
        
        return score[:,0:self.num_components], ranked_list, pca_out1.P[:,0:self.k]


    def _low_rank_approximation(self, A, B, i):
        return dot(A[:,0:i], B[:,0:i].transpose())

    def _euclidean_distance(self, A, B):
        return sqrt(((A - B)**2).sum(axis=0))

    def _terminate_refactor(self):
        print("ReFACTor was terminated.")
        exit(2)


    @staticmethod
    def estimate_k(methylation_data, max_k):

        min_k = 2

        # Find the eigenvalues of the covariance matrix
        eigs = sorted(linalg.eigvals(dot(methylation_data.data.transpose(),methylation_data.data)),key=lambda x: -x)

        # For each eigenvalue i compute its score: -log of the ratio between the i-th eigenvalue and the (i-1)-th eigenvalue.
        scores = [0 for i in range(max_k-min_k+1)]
        counter = 0
        # compute a score for every k in [min_k, max_k]; the plot below expects max_k - min_k + 1 points
        for i in range(min_k,max_k+1):
            scores[counter] = -log(eigs[i-1] / eigs[i-2])
            counter += 1

        # Plot #eigenvalue vs. scores
        fig, axes = plot.subplots(nrows=1, ncols=1)
        plot.plot([i for i in range(min_k,max_k+1)], scores)
        plot.xlabel('# eigenvalue')
        plot.ylabel('score')
        filename = "estimate_k_results.png"
        plot.savefig(filename)
        print("Plotted and saved the results into %s" % filename)


    @staticmethod
    def estimate_t(methylation_data, k, numsites):

        span = 9 # parameter for the moving average (the window size for constructing the average); must be an odd number.

        # Compute a low rank approximation of the data
        pca_res = pca.PCA(methylation_data.data.transpose()) 
        x = dot(pca_res.P[:,0:k], pca_res.U[:,0:k].transpose())        
        
        # Compute the distance of each site from its low rank approximation
        An = preprocessing.StandardScaler( with_mean = True, with_std = False ).fit(methylation_data.data.transpose()).transform(methylation_data.data.transpose())
        Bn = preprocessing.StandardScaler( with_mean = True, with_std = False ).fit(x).transform(x)
        # ** Python 2/3 division note: sqrt returns a float on both Python 2.7 and 3.5, and division by a float behaves the same on both versions
        An = An * ( 1 / sqrt((An**2).sum(axis=0)) ) 
        Bn = Bn * ( 1 / sqrt((Bn**2).sum(axis=0)) )
        distances = sorted(sqrt(((An - Bn)**2).sum(axis=0)))

        # Compute a score for each site i of the sorted distances list: the moving average of dist(i) - dist(i-1)
        distances_diff = [distances[0]] + [distances[i] - distances[i-1] for i in range(1,numsites)]

        scores = [0 for i in range(numsites)]
        mid = (span-1) // 2  # integer division keeps the window bounds below as ints under Python 3
        for i in range(0,numsites):
            l = [distances_diff[j] for j in range(max(0,i-mid),min(i+mid,numsites))]
            scores[i] = sum(l) / float(len(l))
        
        # Plot sites vs. scores
        fig, axes = plot.subplots(nrows=1, ncols=1)
        plot.plot([i for i in range(1,numsites+1)], scores)
        plot.xlabel('site')
        plot.ylabel('score')
        filename = "estimate_t_results.png"
        plot.savefig(filename)
        print("Plotted and saved the results into %s" % filename)