Пример #1
0
    def removeUnPhenotypedSNPData(
        clf, snpData, header_phen, strain_acc_list_phen, data_matrix_phen, phenotype_method_id_ls
    ):
        """
        Drop the SNP rows whose ecotype has no usable phenotype value.

        2010-2-25
            remove un-phenotyped ecotypes from the SNP data in order to keep the snp dataset small

        Arguments:
            snpData -- SNP dataset to filter; element 0 of every entry in its row_id_ls is
                read as the ecotype id.
            header_phen, strain_acc_list_phen, data_matrix_phen -- pieces from which the
                phenotype SNPData is built; cells are compared against the raw string "NA".
            phenotype_method_id_ls -- phenotype methods to inspect; when empty or None every
                phenotype column is inspected.

        Returns whatever SNPData.keepRowsByRowID() produces for the rows to keep.
        Progress messages are written to stderr.
        """
        sys.stderr.write("Removing un-phenotyped ecotypes from the SNP data ...")
        phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
        if not phenotype_method_id_ls:
            # nothing requested explicitly: look at all phenotype columns
            phenotype_columns = range(len(phenData.col_id_ls))
        else:
            phenotype_columns = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(phenotype_method_id_ls))

        # An ecotype counts as phenotyped as soon as one inspected column is not "NA"
        # (2010-2-25 phenotype values are in raw string).
        ecotypes_with_phenotype = set()
        for row_index, phen_ecotype_id in enumerate(phenData.row_id_ls):
            if any(phenData.data_matrix[row_index][column] != "NA" for column in phenotype_columns):
                ecotypes_with_phenotype.add(phen_ecotype_id)

        total_row_count = len(snpData.row_id_ls)
        # 2010-2-21 per the original note, a SNP row id is (ecotype_id, array_id); match on the ecotype part
        row_ids_to_be_kept = set(
            snp_row_id for snp_row_id in snpData.row_id_ls if snp_row_id[0] in ecotypes_with_phenotype
        )
        snpData = SNPData.keepRowsByRowID(snpData, row_ids_to_be_kept)
        sys.stderr.write("%s removed. Done.\n" % (total_row_count - len(row_ids_to_be_kept)))
        return snpData
Пример #2
0
    def readInData(
        cls, phenotype_fname, input_fname, eigen_vector_fname, phenotype_method_id_ls, test_type=1, report=0
    ):
        """
        Read the SNP and phenotype files and assemble the data bundle for the tests.

        2010-2-25
            call removeUnPhenotypedSNPData() to shrink the snp dataset by removing un-phenotyped ecotypes
        2009-3-20
            refactored out of run(), easy for MpiAssociation.py to call

        Arguments:
            phenotype_fname -- phenotype file, read with read_data(turn_into_integer=0).
            input_fname -- SNP file, read with read_data().
            eigen_vector_fname -- when given, principal components are loaded through
                cls.getPCFromFile().
            phenotype_method_id_ls -- phenotype methods to select; when empty or None every
                phenotype column is selected.
            test_type -- only the value 4 matters here: without an eigen vector file the
                principal components are then computed with pca_module.PCA_svd().
            report -- handed to snpData.convertSNPAllele2Index().

        Returns a PassingData carrying snpData (the allele-index converted SNP data), phenData,
        PC_matrix (None when neither loaded nor computed), which_phenotype_ls and
        phenotype_method_id_ls.
        """
        header, strain_acc_list, category_list, data_matrix = read_data(input_fname)
        snpData = SNPData(
            header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix
        )

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            phenotype_fname, turn_into_integer=0
        )
        snpData = cls.removeUnPhenotypedSNPData(
            snpData, header_phen, strain_acc_list_phen, data_matrix_phen, phenotype_method_id_ls
        )

        # 0 (NA) or -2 (untouched) is all converted to -2 as 0 is used to denote allele
        newSnpData, allele2index_ls = snpData.convertSNPAllele2Index(report)
        newSnpData.header = snpData.header

        # The phenotype rows must follow the SNP rows that survived the filtering above.
        # Using the strain_acc_list read from the input file would align them to the
        # unfiltered rows and leave the phenotype matrix longer than the SNP matrix.
        # NOTE(review): relies on row_id[0] being the strain accession (ecotype id), the same
        # assumption removeUnPhenotypedSNPData() makes -- confirm against SNPData.
        kept_strain_acc_list = [row_id[0] for row_id in snpData.row_id_ls]
        data_matrix_phen = cls.get_phenotype_matrix_in_data_matrix_order(
            kept_strain_acc_list, strain_acc_list_phen, data_matrix_phen
        )
        phenData = SNPData(header=header_phen, strain_acc_list=kept_strain_acc_list, data_matrix=data_matrix_phen)

        PC_matrix = None
        if eigen_vector_fname:
            PC_matrix = cls.getPCFromFile(eigen_vector_fname).PC_matrix
        elif test_type == 4:  # eigen_vector_fname not given for this test_type. calculate PCs.
            import pca_module

            T, P, explained_var = pca_module.PCA_svd(newSnpData.data_matrix, standardize=False)
            PC_matrix = T

        del snpData  # only the converted copy is handed on
        if phenotype_method_id_ls:
            which_phenotype_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(phenotype_method_id_ls))
        else:  # if not available, take all phenotypes
            which_phenotype_ls = range(len(phenData.col_id_ls))
        return PassingData(
            snpData=newSnpData,
            phenData=phenData,
            PC_matrix=PC_matrix,
            which_phenotype_ls=which_phenotype_ls,
            phenotype_method_id_ls=phenotype_method_id_ls,
        )
Пример #3
0
    def inputNodePrepare(self, snp_info=None):
        """
        Read the SNP data, the pickled SNP context and the phenotype data, then pickle
        the pieces that get handed on.

        2009-2-16
            get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
        2009-2-11
            refactored out of run()

        Arguments:
            snp_info -- passed through untouched into the PassingData objects built here.

        Side effect: sets self.phenotype_index_ls (falls back to every phenotype column
        when no column matches self.phenotype_method_id_ls).

        Returns a PassingData with snpData_pickle, other_data_pickle,
        output_node_data_pickle and params_ls.
        """
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
        # category_list is not used to facilitate row-id matching
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix, turn_into_array=1)

        # Close the file explicitly so the handle is released even when unpickling fails.
        # NOTE: unpickling executes arbitrary code; snps_context_fname must be a trusted file.
        picklef = open(self.snps_context_fname)
        try:
            snps_context_wrapper = cPickle.load(picklef)
        finally:
            picklef.close()
        gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
        del snps_context_wrapper
        gene_id_ls = sorted(gene_id2snps_id_ls.keys())

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0
        )
        phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix
        )
        phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)  # 2009-2-16

        self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set(self.phenotype_method_id_ls)
        )
        if not self.phenotype_index_ls:
            # nothing matched: use every phenotype column
            self.phenotype_index_ls = range(len(phenData.col_id_ls))

        pdata = PassingData(
            gene_id_ls=gene_id_ls,
            gene_id2snps_id_ls=gene_id2snps_id_ls,
            phenotype_index_ls=self.phenotype_index_ls,
            snp_info=snp_info,
        )
        params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)

        other_data = PassingData(
            gene_id2snps_id_ls=gene_id2snps_id_ls,
            gene_id_ls=pdata.gene_id_ls,
            phenData=phenData,
            phenotype_index_ls=self.phenotype_index_ls,
            snp_info=snp_info,
        )
        other_data_pickle = cPickle.dumps(other_data, -1)
        del other_data  # only the pickled form is needed from here on

        output_node_data = PassingData(
            phenotype_label_ls=phenData.col_id_ls, phenotype_index_ls=self.phenotype_index_ls
        )
        output_node_data_pickle = cPickle.dumps(output_node_data, -1)

        snpData_pickle = cPickle.dumps(snpData, -1)
        del snpData, data_matrix  # drop the unpickled SNP data once it has been serialized
        return PassingData(
            snpData_pickle=snpData_pickle,
            other_data_pickle=other_data_pickle,
            output_node_data_pickle=output_node_data_pickle,
            params_ls=params_ls,
        )
Пример #4
0
	def inputNodePrepare(self, snp_info=None):
		"""
		Read the SNP data, the pickled SNP context and the phenotype data, then pickle
		the pieces that get handed on.

		2009-2-16
			get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
		2009-2-11
			refactored out of run()

		Arguments:
			snp_info -- passed through untouched into the PassingData objects built here.

		Side effect: sets self.phenotype_index_ls (falls back to every phenotype column
		when no column matches self.phenotype_method_id_ls).

		Returns a PassingData with snpData_pickle, other_data_pickle,
		output_node_data_pickle and params_ls.
		"""
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
		#category_list is not used to facilitate row-id matching
		snpData = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix, turn_into_array=1)
		
		#close the file explicitly so the handle is released even when unpickling fails
		#NOTE: unpickling executes arbitrary code; snps_context_fname must be a trusted file
		picklef = open(self.snps_context_fname)
		try:
			snps_context_wrapper = cPickle.load(picklef)
		finally:
			picklef.close()
		gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
		del snps_context_wrapper
		gene_id_ls = sorted(gene_id2snps_id_ls.keys())
		
		header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
		phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
		phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
		phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)	#2009-2-16
		
		self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(self.phenotype_method_id_ls))
		if not self.phenotype_index_ls:
			#nothing matched: use every phenotype column
			self.phenotype_index_ls = range(len(phenData.col_id_ls))
		
		pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls,
			phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
		params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)
		
		other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls, phenData=phenData,
			phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
		other_data_pickle = cPickle.dumps(other_data, -1)
		del other_data	#only the pickled form is needed from here on
		
		output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
			phenotype_index_ls=self.phenotype_index_ls)
		output_node_data_pickle = cPickle.dumps(output_node_data, -1)
		
		snpData_pickle = cPickle.dumps(snpData, -1)
		del snpData, data_matrix	#drop the unpickled SNP data once it has been serialized
		return PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,
			output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
Пример #5
0
    def run(self):
        """
        Export the SNP matrix (plus, optionally, one phenotype) to text files.

        2008-12-02

        Files written, all named after self.output_fname_prefix:
            <prefix>_<phenotype label>.pheno -- one phenotype value per line ('/' in the label
                is replaced by '_'); only written when both self.phenotype_fname and
                self.phenotype_method_id are set. NaN becomes 9 for binary phenotypes and
                -100.0 otherwise.
            <prefix>.geno -- one line per SNP, one character per accession: '0' for allele 0,
                '2' for allele 1, '9' for anything else.
            <prefix>.ind -- tab-delimited, one line per accession: id, 'U', 'Case'.
            <prefix>.snp -- tab-delimited, one line per SNP: id, chromosome, 0.0, position,
                allele 1, allele 2.

        Every output file is closed explicitly, even when an error interrupts the export.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        delimiter = figureOutDelimiter(self.input_fname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)

        if self.array_id_2nd_column:
            snpData = SNPData(
                header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix
            )
        else:
            # ignore category_list
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)

        newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

        if self.phenotype_fname and self.phenotype_method_id:
            header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
                self.phenotype_fname, turn_into_integer=0
            )
            # row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
            phenData = SNPData(
                header=header_phen, strain_acc_list=newSnpData.strain_acc_list, data_matrix=data_matrix_phen
            )
            # tricky, using strain_acc_list_phen
            phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
                newSnpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix
            )

            phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
                phenData, Set([self.phenotype_method_id])
            )[0]
            phenotype_label = phenData.col_id_ls[phenotype_col_index]
            phenotype_f = open('%s_%s.pheno' % (self.output_fname_prefix, phenotype_label.replace('/', '_')), 'w')
            try:
                for phenotype_value in phenData.data_matrix[:, phenotype_col_index]:
                    if self.phenotype_is_binary:  # binary and non-binary have different NA designator
                        if numpy.isnan(phenotype_value):
                            phenotype_value = 9
                        else:
                            phenotype_value = int(phenotype_value)
                    elif numpy.isnan(phenotype_value):
                        phenotype_value = -100.0
                    phenotype_f.write('%s\n' % phenotype_value)
            finally:
                phenotype_f.close()

        # Track every file that was opened so all of them get closed, whatever happens.
        output_files = []
        try:
            genotype_f = open('%s.geno' % self.output_fname_prefix, 'w')
            output_files.append(genotype_f)
            ind_f = open('%s.ind' % self.output_fname_prefix, 'w')
            output_files.append(ind_f)
            snp_f = open('%s.snp' % self.output_fname_prefix, 'w')
            output_files.append(snp_f)
            ind_writer = csv.writer(ind_f, delimiter='\t')
            snp_writer = csv.writer(snp_f, delimiter='\t')

            # transpose it; rows are then indexed like allele_index2allele_ls
            newSnpData = transposeSNPData(newSnpData)

            no_of_rows = len(newSnpData.data_matrix)
            no_of_cols = len(newSnpData.data_matrix[0])

            # write out the accessions (once; they do not depend on the SNP)
            for j in range(no_of_cols):
                ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])

            for i in range(no_of_rows):
                snp_id = newSnpData.row_id_ls[i]
                chromosome, pos = snp_id.split('_')
                allele1 = allele_index2allele_ls[i][0]  # major allele
                allele2 = allele_index2allele_ls[i][1]  # minor allele
                snp_writer.writerow([snp_id, chromosome, 0.0, pos, allele1, allele2])

                genotype_codes = []
                for j in range(no_of_cols):
                    allele = newSnpData.data_matrix[i][j]
                    if allele == 0:
                        genotype_codes.append('0')
                    elif allele == 1:
                        genotype_codes.append('2')
                    else:
                        genotype_codes.append('9')
                genotype_f.write(''.join(genotype_codes) + '\n')
        finally:
            for output_f in output_files:
                output_f.close()
Пример #6
0
	def run(self):
		"""
		Export the SNP matrix (plus, optionally, one phenotype) to text files.

		2008-12-02

		Files written, all named after self.output_fname_prefix:
			<prefix>_<phenotype label>.pheno -- one phenotype value per line ('/' in the label
				is replaced by '_'); only written when both self.phenotype_fname and
				self.phenotype_method_id are set. NaN becomes 9 for binary phenotypes and
				-100.0 otherwise.
			<prefix>.geno -- one line per SNP, one character per accession: '0' for allele 0,
				'2' for allele 1, '9' for anything else.
			<prefix>.ind -- tab-delimited, one line per accession: id, 'U', 'Case'.
			<prefix>.snp -- tab-delimited, one line per SNP: id, chromosome, 0.0, position,
				allele 1, allele 2.

		Every output file is closed explicitly, even when an error interrupts the export.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		delimiter = figureOutDelimiter(self.input_fname, report=self.report)
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
		
		if self.array_id_2nd_column:
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
				data_matrix=data_matrix)
		else:
			#ignore category_list
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
		
		newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)
		
		if self.phenotype_fname and self.phenotype_method_id:
			header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
			#row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
			phenData = SNPData(header=header_phen, strain_acc_list=newSnpData.strain_acc_list, data_matrix=data_matrix_phen)
			#tricky, using strain_acc_list_phen
			phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(newSnpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)
			
			phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id]))[0]
			phenotype_label = phenData.col_id_ls[phenotype_col_index]
			phenotype_f = open('%s_%s.pheno'%(self.output_fname_prefix, phenotype_label.replace('/', '_')), 'w')
			try:
				for phenotype_value in phenData.data_matrix[:,phenotype_col_index]:
					if self.phenotype_is_binary:	#binary and non-binary have different NA designator
						if numpy.isnan(phenotype_value):
							phenotype_value = 9
						else:
							phenotype_value = int(phenotype_value)
					elif numpy.isnan(phenotype_value):
						phenotype_value = -100.0
					phenotype_f.write('%s\n'%phenotype_value)
			finally:
				phenotype_f.close()
		
		#track every file that was opened so all of them get closed, whatever happens
		output_files = []
		try:
			genotype_f = open('%s.geno'%self.output_fname_prefix, 'w')
			output_files.append(genotype_f)
			ind_f = open('%s.ind'%self.output_fname_prefix, 'w')
			output_files.append(ind_f)
			snp_f = open('%s.snp'%self.output_fname_prefix, 'w')
			output_files.append(snp_f)
			ind_writer = csv.writer(ind_f, delimiter='\t')
			snp_writer = csv.writer(snp_f, delimiter='\t')
			
			#transpose it; rows are then indexed like allele_index2allele_ls
			newSnpData = transposeSNPData(newSnpData)
			
			no_of_rows = len(newSnpData.data_matrix)
			no_of_cols = len(newSnpData.data_matrix[0])
			
			#write out the accessions (once; they do not depend on the SNP)
			for j in range(no_of_cols):
				ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])
			
			for i in range(no_of_rows):
				snp_id = newSnpData.row_id_ls[i]
				chromosome, pos = snp_id.split('_')
				allele1 = allele_index2allele_ls[i][0]	#major allele
				allele2 = allele_index2allele_ls[i][1]	#minor allele
				snp_writer.writerow([snp_id, chromosome, 0.0, pos, allele1, allele2])
				
				genotype_codes = []
				for j in range(no_of_cols):
					allele = newSnpData.data_matrix[i][j]
					if allele==0:
						genotype_codes.append('0')
					elif allele==1:
						genotype_codes.append('2')
					else:
						genotype_codes.append('9')
				genotype_f.write(''.join(genotype_codes) + '\n')
		finally:
			for output_f in output_files:
				output_f.close()