def run(self):
        """
		2008-06-02
		"""
        if self.debug:
            import pdb
            pdb.set_trace()
        if self.row_matching_by_which_value == 0:
            snpData1 = SNPData(input_fname=self.input_fname1,
                               turn_into_array=1,
                               ignore_2nd_column=1)
        else:
            snpData1 = SNPData(input_fname=self.input_fname1,
                               turn_into_array=1)
        snpData2 = SNPData(input_fname=self.input_fname2, turn_into_array=1)

        if self.row_matching_by_which_value == 1 or self.row_matching_by_which_value == 2:
            row_matching_by_which_value = self.row_matching_by_which_value - 1
        else:
            row_matching_by_which_value = None
        twoSNPData = TwoSNPData(
            SNPData1=snpData1,
            SNPData2=snpData2,
            debug=self.debug,
            row_matching_by_which_value=row_matching_by_which_value)
        newSnpData = twoSNPData.order2ndSNPDataRowsSameAs1stSNPData()
        newSnpData.tofile(self.output_fname)
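
A minimal standalone sketch of the reordering flow above, assuming the pymodule import path for SNPData/TwoSNPData used by these scripts; the file names are hypothetical.

from pymodule import SNPData, TwoSNPData  #assumed import path

snpData1 = SNPData(input_fname='matrix1.tsv', turn_into_array=1, ignore_2nd_column=1)
snpData2 = SNPData(input_fname='matrix2.tsv', turn_into_array=1)
#row_matching_by_which_value=None: rows are matched on the whole row id
twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2,
                        row_matching_by_which_value=None)
twoSNPData.order2ndSNPDataRowsSameAs1stSNPData().tofile('matrix2_reordered.tsv')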
Example #2
    def inputNodePrepare(self, snp_info=None):
        """
		2009-2-16
			get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
		2009-2-11
			refactored out of run()
		"""
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname)
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
            data_matrix=data_matrix, turn_into_array=1) #category_list is not used, to facilitate row-id matching

        picklef = open(self.snps_context_fname, 'rb')  #binary mode for the pickled wrapper
        snps_context_wrapper = cPickle.load(picklef)
        del picklef
        gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
        del snps_context_wrapper
        gene_id_ls = gene_id2snps_id_ls.keys()
        gene_id_ls.sort()

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        phenData = SNPData(header=header_phen,
                           strain_acc_list=strain_acc_list_phen,
                           data_matrix=data_matrix_phen)
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
        phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(
            phenData)  #2009-2-16

        self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set(self.phenotype_method_id_ls))

        if not self.phenotype_index_ls:
            self.phenotype_index_ls = range(len(phenData.col_id_ls))

        pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls, \
            phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
        params_ls = self.generate_params(self.gene_id_fname, pdata,
                                         self.block_size)

        other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls, phenData=phenData, \
              phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
        other_data_pickle = cPickle.dumps(other_data, -1)
        del other_data

        output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls, \
              phenotype_index_ls=self.phenotype_index_ls)
        output_node_data_pickle = cPickle.dumps(output_node_data, -1)

        snpData_pickle = cPickle.dumps(snpData, -1)
        del snpData, data_matrix
        return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,\
              output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
        return return_data
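
The pickled payloads built above are later shipped to other MPI nodes and unpickled there (see the MPI run() examples below). A minimal sketch of that round trip, assuming pymodule's PassingData; the field values are made up.

import cPickle
from pymodule import PassingData  #assumed import path

payload = PassingData(phenotype_label_ls=['LD', 'FT_10C'], phenotype_index_ls=[0, 1])
payload_pickle = cPickle.dumps(payload, -1)  #protocol -1: highest (binary) pickle protocol
restored = cPickle.loads(payload_pickle)  #what the receiving node does
assert restored.phenotype_index_ls == [0, 1]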
Example #3
    def run(self):
        """
		2008-08-11
			the database interface changed in variation.src.dbsnp
		2008-05-06
		"""
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        curs = conn.cursor()
        if self.debug:
            import pdb
            pdb.set_trace()

        db = DBSNP(username=self.user,
                   password=self.passwd,
                   hostname=self.hostname,
                   database=self.dbname)
        session = db.session
        session.begin()
        #transaction = session.create_transaction()

        snps_name2possible_mappings, snps_name2snps_id = self.get_snps_name2possible_mappings(
            db)

        from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix.read_data(
            self.input_fname1)
        snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix, \
             col_id2id=snps_name2snps_id, snps_table='dbsnp.snps')

        header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix.read_data(
            self.input_fname2)
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix,\
            snps_table='stock_250k.snps')

        twoSNPData = TwoSNPData384(SNPData1=snpData1,
                                   SNPData2=snpData2,
                                   curs=curs,
                                   user=self.user)

        readme = formReadmeObj(sys.argv, self.ad, README)
        session.save(readme)
        session.flush()
        twoSNPData.figureOutABMapping(session, readme,
                                      snps_name2possible_mappings)
        if self.commit:
            curs.execute("commit")
            session.commit()
        else:
            session.rollback()
Example #4
    def run(self):
        """
		2008-5-12
		"""
        if self.debug:
            import pdb
            pdb.set_trace()

        #database connection etc.
        db = self.db_250k

        session = db.session
        session.begin()

        delimiter = figureOutDelimiter(self.inputFname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.inputFname, delimiter=delimiter)

        if self.snp_id_type == 1:
            #2011-2-27 translate the db_id into chr_pos because the new StrainXSNP dataset uses db_id to identify SNPs.
            # but if col-id is already chr_pos, it's fine.
            new_header = header[:2]
            data_matrix_col_index_to_be_kept = []
            for i in xrange(2, len(header)):
                snp_id = header[i]
                chr_pos = db.get_chr_pos_given_db_id2chr_pos(snp_id, )
                if chr_pos is not None:
                    data_matrix_col_index_to_be_kept.append(i - 2)
                    new_header.append(chr_pos)
            # to remove no-db_id columns from data matrix
            data_matrix = numpy.array(data_matrix)
            data_matrix = data_matrix[:, data_matrix_col_index_to_be_kept]
            header = new_header

        if self.array_id_2nd_column:
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,\
                data_matrix=data_matrix)
        else:
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list,\
                data_matrix=data_matrix) #ignore category_list

        rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData,
                                                need_transposeSNPData=1,
                                                report=self.report)
        chromosomes = [
            rawSnpsData.chromosome for rawSnpsData in rawSnpsData_ls
        ]
        snpsdata.writeRawSnpsDatasToFile(self.outputFname,
                                         rawSnpsData_ls,
                                         chromosomes=chromosomes,
                                         deliminator=',',
                                         withArrayIds=self.array_id_2nd_column)
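
The snp_id_type==1 branch above keeps only the columns whose db_id maps to a chr_pos by fancy-indexing the numpy matrix. The same idiom in isolation, with made-up values:

import numpy

data_matrix = numpy.array([[1, 2, 3],
                           [4, 5, 6]])
cols_to_keep = [0, 2]  #plays the role of data_matrix_col_index_to_be_kept
data_matrix = data_matrix[:, cols_to_keep]  #drops the unmapped column
#data_matrix is now [[1, 3], [4, 6]]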
Example #5
    def run(self):
        """
		2009-2-12
		"""
        if self.debug:
            import pdb
            pdb.set_trace()
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.db_user,
                               passwd=self.db_passwd)
        curs = conn.cursor()

        chr2CNV_probe_ls_pickle_fname = '/tmp/chr2CNV_probe_ls.pickle'
        if not os.path.isfile(chr2CNV_probe_ls_pickle_fname):
            chr2CNV_probe_ls = self.get_chr2CNV_probe_ls(
                curs, self.probes_table)
            picklef = open(chr2CNV_probe_ls_pickle_fname, 'wb')  #binary mode: cPickle protocol -1 is binary
            cPickle.dump(chr2CNV_probe_ls, picklef, -1)
            del picklef
        else:
            picklef = open(chr2CNV_probe_ls_pickle_fname, 'rb')
            chr2CNV_probe_ls = cPickle.load(picklef)
            del picklef
        snpData = SNPData(input_fname=self.input_fname,
                          turn_into_array=1,
                          ignore_2nd_column=1)

        probeData = self.get_probe_id2snp_id_ls(chr2CNV_probe_ls,
                                                snpData.col_id_ls)
        SNP2Col_allele = self.get_SNP2Col_allele(snpData)

        cnvIntensityData = SNPData(input_fname=self.cnv_input_fname,
                                   turn_into_array=1,
                                   ignore_2nd_column=1,
                                   matrix_data_type=float)

        cnvQCData = self.getCNVQCMatrix(probeData.probe_id2snp_id_ls,
                                        probeData.snp_id2tup, snpData,
                                        SNP2Col_allele, cnvIntensityData)
        plotdata_pickle_fname = '/tmp/CNV_plot_data.pickle'
        picklef = open(plotdata_pickle_fname, 'wb')  #binary mode for the binary pickle protocol
        cPickle.dump(cnvQCData.plotData, picklef, -1)
        del picklef
        cnvQCData.mismatchData.tofile('%s_mismatch.tsv' %
                                      self.output_fname_prefix)
        cnvQCData.insertionData.tofile('%s_insertion.tsv' %
                                       self.output_fname_prefix)
        cnvQCData.deletionData.tofile('%s_deletion.tsv' %
                                      self.output_fname_prefix)
        cnvQCData.qcData.tofile('%s_qc.tsv' % self.output_fname_prefix)
Example #6
    def run(self):
        cnvIntensityData = SNPData(input_fname=self.input_fname,
                                   turn_into_array=1,
                                   ignore_2nd_column=1,
                                   matrix_data_type=float)
        probe_pos_ls = []
        avg_intensity_ls = []

        if self.run_type == 1:
            newDataMatrix = numpy.ones(cnvIntensityData.data_matrix.shape,
                                       numpy.int)

        for j in range(cnvIntensityData.data_matrix.shape[1]):
            probe_id = cnvIntensityData.col_id_ls[j]
            probe_id = probe_id.split('_')
            probe_id = map(int, probe_id)
            probe_pos_ls.append(probe_id[1])
            avg_intensity_ls.append(
                numpy.sum(cnvIntensityData.data_matrix[:, j]))
            if self.run_type == 1:
                for i in range(cnvIntensityData.data_matrix.shape[0]):
                    if cnvIntensityData.data_matrix[i][
                            j] <= self.max_del_intensity:
                        newDataMatrix[i][j] = -1

        if self.run_type == 1:
            newData = SNPData(row_id_ls=cnvIntensityData.row_id_ls,
                              col_id_ls=cnvIntensityData.col_id_ls,
                              data_matrix=newDataMatrix)
            newData.tofile(self.output_fname)
        elif self.run_type == 2:
            block_size = 1000
            no_of_probes = len(probe_pos_ls)
            #ceiling division so the last partial block is plotted too; plain
            #integer division would silently drop the tail probes
            no_of_blocks = (no_of_probes + block_size - 1) / block_size
            for i in range(no_of_blocks):
                start_index = i * block_size
                end_index = min((i + 1) * block_size, no_of_probes)
                #end_index - 1: last probe in this block (end_index itself can equal no_of_probes)
                fname = '%s_%s_%s.png' % (self.output_fname,
                                          probe_pos_ls[start_index],
                                          probe_pos_ls[end_index - 1])
                pylab.clf()
                pylab.plot(probe_pos_ls[start_index:end_index],
                           avg_intensity_ls[start_index:end_index],
                           '.',
                           markersize=4,
                           alpha=0.4)
                pylab.xlabel('chromosome position')
                pylab.ylabel('sum intensity')
                pylab.savefig(fname, dpi=300)
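
The nested per-cell loop in the run_type==1 branch above could be collapsed into one vectorized numpy operation with the same semantics (intensities at or below the threshold become -1, the rest stay 1). A sketch with made-up values:

import numpy

max_del_intensity = -0.5  #hypothetical threshold
intensity = numpy.array([[-0.8, 0.2], [0.1, -0.6]])
calls = numpy.ones(intensity.shape, numpy.int)
calls[intensity <= max_del_intensity] = -1  #boolean-mask assignment replaces the double loop
#calls is now [[-1, 1], [1, -1]]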
Example #7
	def loadDataStructure(self, gene_annotation_picklef, min_MAF=0, min_distance=20000, \
						list_type_id=None, snp_matrix_fname=None, snp_matrix_data_type=1):
		"""
		2009-5-30
			add argument snp_matrix_fname
		2008-11-25
		2008-10-01
			wrap a few functions up, convenient for both run() and drawSNPRegion()
		"""
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		self.db = db
		snp_info = self.getSNPInfo(db)
		gene_annotation = self.dealWithGeneAnnotation(gene_annotation_picklef)
		if list_type_id:
			candidate_gene_list = self.getGeneList(list_type_id)
			candidate_gene_set = Set(candidate_gene_list)
		else:
			candidate_gene_set = Set()
		
		if snp_matrix_fname:
			if snp_matrix_data_type==3:
				matrix_data_type=float		#2009-3-23 for CNV amplitude file
			else:
				matrix_data_type=int
			snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1, turn_into_array=1, ignore_2nd_column=1, matrix_data_type=matrix_data_type)			
			#2008-12-05 fake a snp_info for findSNPsInRegion
			self.construct_chr_pos2index_forSNPData(snpData)
		else:
			snpData = None
			
		return_data = PassingData(gene_annotation=gene_annotation, snp_info=snp_info, \
								candidate_gene_set=candidate_gene_set, snpData=snpData)
		return return_data
Example #8
    def qcDataMatrixVSsnpData(self, pdata, snps_name2snps_id, snpData2, curs,
                              session, readme):
        """
		2008-08-16
			split from run() to enable one_by_one option
		"""
        #swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData. now strain_acc_list=ecotype_id_ls
        snpData1 = SNPData(header=pdata.header, strain_acc_list=pdata.ecotype_id_ls, category_list=pdata.call_info_id_ls, data_matrix=pdata.data_matrix, \
            min_probability=self.min_probability, call_method_id=self.call_method_id, col_id2id=snps_name2snps_id,\
            max_call_info_mismatch_rate=self.max_call_info_mismatch_rate, snps_table='stock_250k.snps')
        #snps_table is set to the stock_250k snps_table

        twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs, \
             QC_method_id=self.QC_method_id, user=self.user, row_matching_by_which_value=0, debug=self.debug)

        if self.run_type == 1:
            row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        elif self.run_type == 2:
            twoSNPData.save_col_wise(session, readme)
            row_id2NA_mismatch_rate = {}
        else:
            sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
            sys.exit(5)
        passingdata = PassingData()
        passingdata.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate
        passingdata.row_id12row_id2 = twoSNPData.row_id12row_id2
        return passingdata
Example #9
    def run(self):
        """
		"""

        if self.debug:
            import pdb
            pdb.set_trace()

        snpData = SNPData(input_fname=self.inputFname,
                          turn_into_array=1,
                          ignore_2nd_column=1)
        snpData = SNPData.removeMonomorphicCols(snpData, NA_set=set([]))
        if self.min_MAF and self.min_MAF > 0:
            snpData = SNPData.removeColsByMAF(snpData,
                                              min_MAF=self.min_MAF,
                                              NA_set=set([]))

        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.writer.makeupHeaderFromSampleIDList(
            sampleIDList=snpData.row_id_ls)
        self.writer.writeMetaAndHeader()

        counter = 0
        for j in xrange(len(snpData.col_id_ls)):
            snp_id = snpData.col_id_ls[j]
            chromosome, start = snp_id.split('_')[:2]
            genotype_ls = snpData.data_matrix[:, j]
            genotype_ls = utils.dict_map(number2di_nt, genotype_ls)
            genotype_ls_vcf = []
            alleleNucleotide2Number = {}
            alleleNumber2Nucleotide = {}
            for genotype in genotype_ls:
                if genotype == 'NA':
                    genotype_ls_vcf.append("./.")
                elif len(genotype) == 2:
                    for allele in genotype:
                        if allele not in alleleNucleotide2Number:
                            alleleNumber = len(alleleNucleotide2Number)
                            alleleNucleotide2Number[allele] = alleleNumber
                            alleleNumber2Nucleotide[alleleNumber] = allele
                    genotype_ls_vcf.append(
                        "%s/%s" % (alleleNucleotide2Number[genotype[0]],
                                   alleleNucleotide2Number[genotype[1]]))

                else:
                    genotype_ls_vcf.append("./.")
            #assumes at least one sample carries a called genotype; otherwise this raises KeyError
            refAllele = alleleNumber2Nucleotide[0]
            if 1 not in alleleNumber2Nucleotide:
                altAllele = refAllele
            else:
                altAllele = alleleNumber2Nucleotide[1]
            row = [
                chromosome, start, ".", refAllele, altAllele, 999, 'PASS',
                "DP=100", "GT"
            ] + genotype_ls_vcf
            self.writer.writerow(row)
            counter += 1
        sys.stderr.write("  %s records.\n" % (counter))
        self.writer.close()
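
The allele-numbering loop above assigns 0 to the first allele encountered at a SNP and 1 to the next, so the REF allele depends on input order rather than on a reference genome. The same logic as a standalone helper (the function name is mine):

def number_genotypes(genotype_ls):
    """Map diploid genotypes like 'AT' to VCF-style '0/1'; 'NA' or non-diploid calls become './.'."""
    allele2number = {}
    number2allele = {}
    vcf_genotype_ls = []
    for genotype in genotype_ls:
        if genotype == 'NA' or len(genotype) != 2:
            vcf_genotype_ls.append('./.')
            continue
        for allele in genotype:
            if allele not in allele2number:
                number = len(allele2number)
                allele2number[allele] = number
                number2allele[number] = allele
        vcf_genotype_ls.append('%s/%s' % (allele2number[genotype[0]],
                                          allele2number[genotype[1]]))
    return vcf_genotype_ls, number2allele

#number_genotypes(['AA', 'AT', 'NA']) == (['0/0', '0/1', './.'], {0: 'A', 1: 'T'})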
Example #10
    def create_init_data(self):
        """
		2009-6-5
			add argument ignore_het=1 to snpData_2010_149_384 & snpData_perlegen
		2008-05-12
			initial data loading on node 0
		"""
        init_data = PassingData()
        init_data.snpData_250k = SNPData(input_fname=self.input_fname,
                                         turn_into_array=1)
        init_data.snpData_2010_149_384 = SNPData(
            input_fname=self.fname_2010_149_384,
            turn_into_array=1,
            ignore_2nd_column=1,
            ignore_het=1)
        init_data.snpData_perlegen = SNPData(input_fname=self.fname_perlegen,
                                             turn_into_array=1,
                                             ignore_2nd_column=1,
                                             ignore_het=1)
        param_d = self.generate_parameters(self.parameter_names)
        init_data.param_d = param_d
        return init_data
Example #11
    def run(self):
        """
		2009-5-28
		"""
        if self.debug:
            import pdb
            pdb.set_trace()
        db = Stock_250kDB(drivername=self.drivername,
                          username=self.db_user,
                          password=self.db_passwd,
                          hostname=self.hostname,
                          database=self.dbname,
                          schema=self.schema)
        db.setup(create_tables=False)

        nativename2tg_ecotypeid_set = getNativename2TgEcotypeIDSet(
            db.metadata.bind, turnUpperCase=True)
        ecotype_id_set_250k_in_pipeline = get_ecotype_id_set_250k_in_pipeline(
            ArrayInfo)
        ecotypeid2tg_ecotypeid = get_ecotypeid2tg_ecotypeid(db.metadata.bind)

        #turn_into_integer=2 because it's not nucleotides
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.input_fname, turn_into_integer=2, matrix_data_type=float)
        data_matrix_phen = numpy.array(data_matrix_phen)

        #2009-8-19 bug here. strain_acc_list_phen is not unique for each row. causing replicates to have the same value
        #from Association import Association
        #data_matrix_phen = Association.get_phenotype_matrix_in_data_matrix_order(strain_acc_list_phen, strain_acc_list_phen, data_matrix_phen)

        phenData = SNPData(header=header_phen,
                           strain_acc_list=strain_acc_list_phen,
                           data_matrix=data_matrix_phen)

        ecotype_id_ls = self.straightenEcotypeID(phenData.row_id_ls, nativename2tg_ecotypeid_set, ecotypeid2tg_ecotypeid, \
                  ecotype_id_set_250k_in_pipeline)

        session = db.session
        session.begin()
        if self.run_type == 1:
            self.putPhenotypeIntoDB(db, phenData, ecotype_id_ls)
        elif self.run_type == 2:
            self.putReplicatePhenotypeIntoDB(db, phenData, ecotype_id_ls)
        else:
            sys.stderr.write("Unsupported run type: %s.\n" % (self.run_type))
        if self.commit:
            session.commit()
Example #12
	def run(self):
		"""
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		cnvIntensityData = self.getBeforeGADAIntensityData(self.input_fname)
		#cnvIntensityData = SNPData(input_fname=self.input_fname, turn_into_array=1, ignore_2nd_column=1, matrix_data_type=float)
		
		qcData = SNPData(input_fname=self.qc_fname, turn_into_array=1, ignore_2nd_column=1)
		if not os.path.isdir(self.output_dir):
			os.makedirs(self.output_dir)
		
		for probe_id in qcData.col_id_ls:
			if probe_id in cnvIntensityData.col_id2col_index:
				cnv_col_index = cnvIntensityData.col_id2col_index[probe_id]
				qc_col_index = qcData.col_id2col_index[probe_id]
				count_ls = []
				intensity_ls = []
				for i in range(len(qcData.row_id_ls)):
					row_id = qcData.row_id_ls[i]
					if qcData.data_matrix[i][qc_col_index]>=0 and row_id in cnvIntensityData.row_id2row_index:
						cnv_row_index = cnvIntensityData.row_id2row_index[row_id]
						count = qcData.data_matrix[i][qc_col_index]
						count_ls.append(count)
						intensity_ls.append(cnvIntensityData.data_matrix[cnv_row_index][cnv_col_index])
				count_set = set(count_ls)
				if len(count_set)>0 and count_set!=set([0]):
					pylab.clf()
					ax = pylab.axes([0.1, 0.1, 0.8, 0.8], frameon=False)
					ax.grid(True, alpha=0.3)
					pylab.plot(count_ls, intensity_ls, '.', markersize=5, alpha=0.4)
					pylab.xlabel('count')
					pylab.ylabel('CNV probe intensity')
					pylab.ylim([-1,1])
					xlim = list(ax.get_xlim())
					xlim[0] -= 1
					xlim[1] += 1
					ax.set_xlim(xlim)
					pylab.title(probe_id)
					pylab.savefig(os.path.join(self.output_dir, '%s.png'%probe_id), dpi=300)
Example #13
	def run(self):
		"""
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		snpData = SNPData(input_fname=self.inputFname, turn_into_array=1, ignore_2nd_column=1)
		snpData = SNPData.removeMonomorphicCols(snpData, NA_set=set([]))
		if self.min_MAF>0:
			snpData = SNPData.removeColsByMAF(snpData,min_MAF=self.min_MAF, NA_set=set([]))
		snpData.col_id_ls = map(int, snpData.col_id_ls)
		snpData.row_id_ls = map(int, snpData.row_id_ls)
		f = h5py.File(self.outputFname, 'w')
		import numpy
		#snpData.data_matrix.dtype = numpy.int16
		dset = f.create_dataset("data_matrix", data=snpData.data_matrix, maxshape=(None, None))	#numpy.array(snpData.data_matrix, dtype=numpy.int64)
		col_id_ls_dset = f.create_dataset('col_id_ls', data=snpData.col_id_ls, maxshape=(None,))
		row_id_ls_dset = f.create_dataset('row_id_ls', data=snpData.row_id_ls, maxshape=(None,))
		f.close()
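
Reading the three datasets back out of the HDF5 file is symmetric; a sketch assuming a file written as above (the file name is hypothetical):

import h5py

f = h5py.File('snp_matrix.h5', 'r')
data_matrix = f['data_matrix'][:]  #slicing loads the whole dataset into a numpy array
col_id_ls = f['col_id_ls'][:].tolist()
row_id_ls = f['row_id_ls'][:].tolist()
f.close()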
Example #14
    def getPhenotypeData(cls, curs, phenotype_avg_table=None, phenotype_method_table=None, ecotype_table='stock.ecotype', get_raw_data=1,\
         getPublicPhenotype=False):
        """
		2012.9.28
			add argument getPublicPhenotype
		2009-2-2
			wrap up all other 3 methods
		"""
        phenotype_info = cls.get_phenotype_method_id_info(curs, phenotype_avg_table=phenotype_avg_table, \
                phenotype_method_table=phenotype_method_table, getPublicPhenotype=getPublicPhenotype)
        ecotype_id2index, ecotype_id_ls, ecotype_name_ls = cls.get_ecotype_id2info(curs, phenotype_avg_table=phenotype_avg_table,\
                      ecotype_table=ecotype_table, getPublicPhenotype=getPublicPhenotype)
        data_matrix = cls.get_matrix(curs, phenotype_avg_table, ecotype_id2index=ecotype_id2index, phenotype_info=phenotype_info, \
              get_raw_data=get_raw_data, phenotype_method_table=phenotype_method_table,\
              getPublicPhenotype=getPublicPhenotype)
        pheno_data = SNPData(col_id_ls=phenotype_info.phenotype_id_ls,
                             row_id_ls=ecotype_id_ls,
                             data_matrix=data_matrix)
        pheno_data.row_label_ls = ecotype_name_ls
        pheno_data.col_label_ls = phenotype_info.method_id_name_ls
        return pheno_data
Example #15
    def getHaploGroupSNPMatrix(self):
        """
		2009-4-18
		"""
        sys.stderr.write("Getting HaploGroup SNP matrix ...")

        col_id_ls = []
        row_id_ls = []
        if self.debug:
            no_of_rows = 10
        else:
            no_of_rows = StockDB.HaploGroup.query.count()

        col_id2col_index = {}
        for row in StockDB.SNPs.query.order_by(
                StockDB.SNPs.chromosome).order_by(StockDB.SNPs.position):
            col_id_ls.append(row.id)
            col_id2col_index[row.id] = len(col_id2col_index)

        no_of_cols = len(col_id2col_index)

        data_matrix = numpy.zeros([no_of_rows, no_of_cols], numpy.int8)
        rows = StockDB.HaploGroup.query.all()
        row_index = 0
        for row in rows:
            data_rows = StockDB.FilteredCalls.query.filter_by(
                ecotypeid=row.ref_ecotypeid)
            row_index = len(row_id_ls)
            for one_call in data_rows:
                nt_number = nt2number[one_call.allele]
                col_index = col_id2col_index[one_call.snpid]
                data_matrix[row_index][col_index] = nt_number
            row_id_ls.append(row.id)
            if self.debug and row_index == no_of_rows - 1:
                break
        snpData = SNPData(col_id_ls=col_id_ls,
                          row_id_ls=row_id_ls,
                          data_matrix=data_matrix)
        sys.stderr.write("Done.\n")
        return snpData
Example #16
    def run(self):
        """
		2008-9-7
		"""
        if self.debug:
            import pdb
            pdb.set_trace()

        delimiter = figureOutDelimiter(self.input_fname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname, delimiter=delimiter)

        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,\
            data_matrix=data_matrix)
        newSnpData, allele_index2allele_ls = snpData.convert2Binary(
            self.report)

        if self.mapping_fname:  #output allele_index2allele_ls
            self.output_allele2index_ls(snpData, allele_index2allele_ls,
                                        self.mapping_fname)

        newSnpData.tofile(self.output_fname)
Example #17
	def getBeforeGADAIntensityData(self, input_fname):
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		session = db.session
		
		data_matrix, probe_id_ls, chr_pos_ls, header = CNVNormalize.get_input(input_fname)
		
		col_id_ls = []
		for chr_pos in chr_pos_ls:
			col_id_ls.append('%s_%s'%(chr_pos[0], chr_pos[1]))
		
		ecotype_id_ls = []
		for array_id in header[1:-2]:
			array = Stock_250kDB.ArrayInfo.get(int(array_id))
			if array:
				ecotype_id = array.maternal_ecotype_id
				
			else:
				ecotype_id = -1
			ecotype_id_ls.append('%s'%ecotype_id)
		cnvIntensityData = SNPData(row_id_ls=ecotype_id_ls, col_id_ls=col_id_ls, data_matrix=data_matrix.transpose())
		return cnvIntensityData
Example #18
    def run(self):
        """
		
		"""
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.user,
                                       password=self.passwd,
                                       hostname=self.hostname,
                                       database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        if self.debug:
            import pdb
            pdb.set_trace()
        chr_pos2ancestral_allele = self.get_chr_pos2ancestral_allele(
            self.ancestral_allele_fname)
        pheno_data = SNPData(input_fname=self.phenotype_fname,
                             turn_into_integer=0,
                             ignore_2nd_column=1)
        pheno_data = self.process_phenotype_data(pheno_data)

        geno_data = SNPData(input_fname=self.genotype_fname,
                            turn_into_array=1,
                            matrix_data_type=int,
                            ignore_2nd_column=1)

        query = Stock_250kDB.ResultsMethod.query.filter_by(
            call_method_id=self.call_method_id).filter_by(
                analysis_method_id=self.analysis_method_id).filter_by(
                    phenotype_method_id=self.phenotype_method_id)
        if query.count() == 1:
            rm = query.first()
        elif query.count() > 1:
            sys.stderr.write(
                "Warning: more than 1 results_method for call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n"
                % (self.call_method_id, self.analysis_method_id,
                   self.phenotype_method_id))
            rm = query.first()
        else:
            sys.stderr.write(
                "Error: no results_method for call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n"
                % (self.call_method_id, self.analysis_method_id,
                   self.phenotype_method_id))
            sys.exit(3)

        phenotype_ls_data = self.get_phenotype_ls(rm, self.no_of_top_snps, chr_pos2ancestral_allele, pheno_data, geno_data, \
                  self.min_MAF, results_directory=self.input_dir)

        import pylab
        pylab.clf()
        hist_patch_ls = []
        legend_ls = []
        if len(phenotype_ls_data.ancestral_allele_phenotype_ls) > 2:
            n1 = pylab.hist(phenotype_ls_data.ancestral_allele_phenotype_ls,
                            100,
                            alpha=0.4,
                            normed=1)
            hist_patch_ls.append(
                n1[2][0])  #first patch in all patches of a histogram
            legend_ls.append('ancestral allele')
        if len(phenotype_ls_data.derived_allele_phenotype_ls) > 2:
            n2 = pylab.hist(phenotype_ls_data.derived_allele_phenotype_ls,
                            100,
                            alpha=0.4,
                            normed=1,
                            facecolor='r')
            hist_patch_ls.append(n2[2][0])
            legend_ls.append('derived allele')
        pylab.legend(hist_patch_ls, legend_ls)
        if self.output_fname_prefix:
            pylab.savefig('%s.svg' % self.output_fname_prefix, dpi=300)
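
pylab.hist() returns (counts, bins, patches); the n1[2][0] indexing above takes the first bar patch of each histogram so the two overlaid series can share one legend. The trick in isolation, with made-up data:

import pylab

pylab.clf()
n1 = pylab.hist([0.1, 0.2, 0.2, 0.3], 10, alpha=0.4)
n2 = pylab.hist([0.4, 0.5, 0.5, 0.6], 10, alpha=0.4, facecolor='r')
#n?[2] is the list of bar patches; one representative patch per series suffices
pylab.legend([n1[2][0], n2[2][0]], ['series 1', 'series 2'])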
Example #19
    def run(self):
        if self.debug:
            import pdb
            pdb.set_trace()
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.db_user,
                                       password=self.db_passwd,
                                       hostname=self.hostname,
                                       database=self.dbname,
                                       schema=self.schema)
        db.setup(create_tables=False)
        session = db.session

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        phenData = SNPData(
            header=header_phen,
            strain_acc_list=strain_acc_list_phen,
            data_matrix=data_matrix_phen
        )  #row label is that of the SNP matrix, because the phenotype matrix is going to be re-ordered that way
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            phenData.row_id_ls, strain_acc_list_phen,
            phenData.data_matrix)  #tricky, using strain_acc_list_phen

        phenotype_col_index1 = self.findOutWhichPhenotypeColumn(
            phenData, Set([self.phenotype_method_id1]))[0]
        phenotype_col_index2 = self.findOutWhichPhenotypeColumn(
            phenData, Set([self.phenotype_method_id2]))[0]

        x_ls = []
        y_ls = []
        for i in range(phenData.data_matrix.shape[0]):
            if not numpy.isnan(
                    phenData.data_matrix[i]
                [phenotype_col_index1]) and not numpy.isnan(
                    phenData.data_matrix[i][phenotype_col_index2]):
                x_ls.append(phenData.data_matrix[i][phenotype_col_index1])
                y_ls.append(phenData.data_matrix[i][phenotype_col_index2])

        pylab.clf()
        pylab.title('Phenotype Contrast')
        pylab.plot(x_ls, y_ls, '.', alpha=0.6)
        pylab.grid(alpha=0.3)
        phenotype_method1 = Stock_250kDB.PhenotypeMethod.get(
            self.phenotype_method_id1)
        phenotype_method2 = Stock_250kDB.PhenotypeMethod.get(
            self.phenotype_method_id2)
        pylab.xlabel(phenotype_method1.short_name)
        pylab.ylabel(phenotype_method2.short_name)

        #draw diagonal line to show perfect correlation
        max_min_value = max(min(x_ls), min(y_ls))
        min_max_value = min(max(x_ls), max(y_ls))
        pylab.plot([max_min_value, min_max_value],
                   [max_min_value, min_max_value],
                   c='g',
                   alpha=0.7)

        png_output_fname = '%s.png' % self.output_fname_prefix
        pylab.savefig(png_output_fname, dpi=400)
        pylab.savefig('%s.svg' % self.output_fname_prefix)
Example #20
    def doFilter(self, snpData, snpData_qc_strain, snpData_qc_snp, min_call_probability, max_call_mismatch_rate, max_call_NA_rate,\
       max_snp_mismatch_rate, max_snp_NA_rate, npute_window_size , output_dir=None):
        """
		2009-10-11
			replace imputeData() with NPUTE.samplingImpute(..., no_of_accessions_per_sampling=300, coverage=3) to avoid memory blowup. 
		2008-12-22
			replace '=' and ',' with '_' in the output filename
		2008-05-19
			matrix_ls has to be of length >0 before concatenation
		2008-05-19
			use SNPData structure
		2008-05-18
			add onlyCommon=True to FilterAccessions.filterByError()
		2008-05-17
			add argument output_dir. if it's available, output data matrix before and after imputation
		2008-05-12
			add
			qcdata.no_of_accessions_filtered_by_mismatch
			qcdata.no_of_accessions_filtered_by_na
			qcdata.no_of_snps_filtered_by_mismatch
			qcdata.no_of_snps_filtered_by_na
			qcdata.no_of_monomorphic_snps_removed
		
		2008-05-11
			split up from computing_node_handler
		"""
        qcdata = PassingData()
        twoSNPData = TwoSNPData(SNPData1=snpData, SNPData2=snpData_qc_strain, \
            row_matching_by_which_value=0, debug=self.debug)
        row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        del twoSNPData

        newSnpData = SNPData.removeRowsByMismatchRate(snpData,
                                                      row_id2NA_mismatch_rate,
                                                      max_call_mismatch_rate)
        qcdata.no_of_accessions_filtered_by_mismatch = newSnpData.no_of_rows_removed

        newSnpData = SNPData.removeRowsByNARate(newSnpData, max_call_NA_rate)
        qcdata.no_of_accessions_filtered_by_na = newSnpData.no_of_rows_removed

        twoSNPData = TwoSNPData(SNPData1=newSnpData, SNPData2=snpData_qc_snp, \
            row_matching_by_which_value=0, debug=self.debug)
        col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
        del twoSNPData
        newSnpData = SNPData.removeColsByMismatchRate(newSnpData,
                                                      col_id2NA_mismatch_rate,
                                                      max_snp_mismatch_rate)
        qcdata.no_of_snps_filtered_by_mismatch = newSnpData.no_of_cols_filtered_by_mismatch

        newSnpData = SNPData.removeColsByNARate(newSnpData, max_snp_NA_rate)
        qcdata.no_of_snps_filtered_by_na = newSnpData.no_of_cols_filtered_by_na

        twoSNPData = TwoSNPData(SNPData1=newSnpData, SNPData2=snpData_qc_snp, \
            row_matching_by_which_value=0, debug=self.debug)
        newSnpData = twoSNPData.mergeTwoSNPData(priority=2)
        del twoSNPData
        #MergeSnpsData.merge(snpsd_250k_tmp, snpsd_ls_qc_snp, unionType=0, priority=2)

        newSnpData = SNPData.removeMonomorphicCols(newSnpData)
        qcdata.no_of_monomorphic_snps_removed = newSnpData.no_of_monomorphic_cols

        #FilterSnps.filterMonomorphic(snpsd_250k_tmp)

        if output_dir:
            #output data here
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            output_fname_prefix_ls = ['min_oligo_call_probability_%s'%min_call_probability,\
                  'max_array_mismatch_rate_%s'%max_call_mismatch_rate,\
                  'max_array_NA_rate_%s'%max_call_NA_rate,\
                  'max_snp_mismatch_rate_%s'%max_snp_mismatch_rate,\
                  'max_snp_NA_rate_%s'%max_snp_NA_rate,\
                  'npute_window_size_%s'%npute_window_size]
            output_fname = os.path.join(
                output_dir,
                '_'.join(output_fname_prefix_ls + ['before_imputation.tsv']))
            newSnpData.tofile(output_fname)
            #chromosomes = [snpsd_250k_tmp[i].chromosome for i in range(len(snpsd_250k_tmp))]
            #snpsdata.writeRawSnpsDatasToFile(output_fname, snpsd_250k_tmp, chromosomes=chromosomes, deliminator=',', withArrayIds = True)
        """
		qcdata.no_of_snps_filtered_by_mismatch = 0
		qcdata.no_of_snps_filtered_by_na = 0
		qcdata.no_of_monomorphic_snps_removed = 0
		for snpsd in snpsd_250k_tmp:
			qcdata.no_of_snps_filtered_by_mismatch += snpsd.no_of_snps_filtered_by_mismatch
			qcdata.no_of_snps_filtered_by_na += snpsd.no_of_snps_filtered_by_na
			qcdata.no_of_monomorphic_snps_removed += snpsd.no_of_monomorphic_snps_removed
		"""

        #snpData0 = RawSnpsData_ls2SNPData(snpsd_250k_tmp)

        twoSNPData0 = TwoSNPData(SNPData1=newSnpData, SNPData2=snpData_qc_strain, \
            row_matching_by_which_value=0)
        row_id2NA_mismatch_rate0 = twoSNPData0.cmp_row_wise()
        col_id2NA_mismatch_rate0 = twoSNPData0.cmp_col_wise()
        del twoSNPData0

        result = []
        #for npute_window_size in npute_window_size_ls:
        #snpsd_250k_tmp_1 = copy.deepcopy(snpsd_250k_tmp)	#deepcopy, otherwise snpsd_250k_tmp_1[i].snps = [] would clear snpsd_250k_tmp up as well
        if len(newSnpData.row_id_ls) > 5:
            snps_name_ls = newSnpData.col_id_ls
            ## 2009-10-8 use NPUTE.samplingImpute()
            imputed_matrix, new_snps_name_ls = NPUTE.samplingImpute(snps_name_ls, newSnpData.data_matrix, \
                         input_file_format=1, input_NA_char=0, lower_case_for_imputation=False,\
                         npute_window_size=int(npute_window_size), \
                         no_of_accessions_per_sampling=300, coverage=3)
            snpData_imputed = SNPData(row_id_ls=newSnpData.row_id_ls,
                                      col_id_ls=new_snps_name_ls,
                                      data_matrix=imputed_matrix)
            """
			## 2009-10-8 use NPUTE.samplingImpute() instead. comment out below
			chr2no_of_snps = NPUTE.get_chr2no_of_snps(snps_name_ls)
			chr_ls = chr2no_of_snps.keys()
			chr_ls.sort()
			snpData_imputed = SNPData(row_id_ls = newSnpData.row_id_ls, col_id_ls=[])
			matrix_ls = []
			for chromosome in chr_ls:
				if chr2no_of_snps[chromosome]>5:	#enough for imputation
					npute_data_struc = NPUTESNPData(snps_name_ls=snps_name_ls, data_matrix=newSnpData.data_matrix, chromosome=chromosome, \
									input_file_format=1, input_NA_char=0)
					imputeData(npute_data_struc, int(npute_window_size))
					matrix_ls.append(npute_data_struc.snps)
					snpData_imputed.col_id_ls += npute_data_struc.chosen_snps_name_ls
			if len(matrix_ls)>0:
				snpData_imputed.data_matrix = numpy.transpose(numpy.concatenate(matrix_ls))
			"""
            if output_dir:  #2008-05-16 write the data out if output_fname is available
                #chromosomes = [snpsd_250k_tmp[i].chromosome for i in range(len(snpsd_250k_tmp))]	#already produced in the previous before_imputation output
                output_fname = os.path.join(
                    output_dir, '_'.join(output_fname_prefix_ls +
                                         ['after_imputation.tsv']))
                #snpsdata.writeRawSnpsDatasToFile(output_fname, snpsd_250k_tmp, chromosomes=chromosomes, deliminator=',', withArrayIds = True)
                snpData_imputed.tofile(output_fname)

            twoSNPData1 = TwoSNPData(SNPData1=snpData_imputed, SNPData2=snpData_qc_strain, \
                row_matching_by_which_value=0)
            qcdata.row_id2NA_mismatch_rate1 = twoSNPData1.cmp_row_wise()
            qcdata.col_id2NA_mismatch_rate1 = twoSNPData1.cmp_col_wise()
            del twoSNPData1, snpData_imputed
        else:
            snpData_imputed = None
            #qcdata.row_id2NA_mismatch_rate1 = {}
            #qcdata.col_id2NA_mismatch_rate1 = {}
        del newSnpData
        """
		for i in range(len(snpsd_250k_tmp)):
			#snpsd_250k_tmp_1[i].snps = []	#clear it up
			
			if len(snpsd_250k_tmp[i].accessions)>5 and len(snpsd_250k_tmp[i].positions)>5:	#not enough for imputation
				npute_data_struc = NPUTESNPData(inFile=snpsd_250k_tmp[i], input_NA_char='NA', input_file_format=4, lower_case_for_imputation=0)
				imputeData(npute_data_struc, int(npute_window_size))
				snpsd_250k_tmp[i].snps = npute_data_struc.snps
				del npute_data_struc
			"""
        qcdata.row_id2NA_mismatch_rate0 = row_id2NA_mismatch_rate0
        qcdata.col_id2NA_mismatch_rate0 = col_id2NA_mismatch_rate0

        qcdata.min_call_probability = min_call_probability
        qcdata.max_call_mismatch_rate = max_call_mismatch_rate
        qcdata.max_call_NA_rate = max_call_NA_rate
        qcdata.max_snp_mismatch_rate = max_snp_mismatch_rate
        qcdata.max_snp_NA_rate = max_snp_NA_rate
        qcdata.npute_window_size = npute_window_size
        result.append(qcdata)
        return result
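
Condensed, the QC cascade above alternates mismatch-rate comparison against a reference dataset with thresholded removal, rows first and then columns. A sketch of that skeleton as one function, assuming the same pymodule API; the threshold defaults are hypothetical:

from pymodule import SNPData, TwoSNPData  #assumed import path

def qc_cascade(snpData, snpData_qc_strain, snpData_qc_snp,
               max_row_mismatch=0.10, max_row_NA=0.40,
               max_col_mismatch=0.10, max_col_NA=0.40):
    #rows (accessions) first: compare to the strain reference, then filter
    twoSNPData = TwoSNPData(SNPData1=snpData, SNPData2=snpData_qc_strain,
                            row_matching_by_which_value=0)
    snpData = SNPData.removeRowsByMismatchRate(snpData, twoSNPData.cmp_row_wise(),
                                               max_row_mismatch)
    snpData = SNPData.removeRowsByNARate(snpData, max_row_NA)
    #columns (SNPs) next: compare to the SNP reference, then filter
    twoSNPData = TwoSNPData(SNPData1=snpData, SNPData2=snpData_qc_snp,
                            row_matching_by_which_value=0)
    snpData = SNPData.removeColsByMismatchRate(snpData, twoSNPData.cmp_col_wise(),
                                               max_col_mismatch)
    snpData = SNPData.removeColsByNARate(snpData, max_col_NA)
    return SNPData.removeMonomorphicCols(snpData)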
Example #21
    def run(self):
        """
		2008-09-06
		"""
        if self.debug:
            #for one-node testing purpose
            import pdb
            pdb.set_trace()
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname)
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
                data_matrix=data_matrix, turn_into_array=1) #category_list is not used, to facilitate row-id matching

            picklef = open(self.snps_context_fname, 'rb')  #binary mode for the pickled wrapper
            snps_context_wrapper = cPickle.load(picklef)
            del picklef
            gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(
                snps_context_wrapper)
            gene_id_ls = gene_id2snps_id_ls.keys()
            gene_id_ls.sort()

            header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
                self.phenotype_fname, turn_into_integer=0)
            phenData = SNPData(
                header=header_phen,
                strain_acc_list=strain_acc_list,
                data_matrix=data_matrix_phen
            )  #row label is that of the SNP matrix, because the phenotype matrix is going to be re-ordered that way
            phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
                snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)

            other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls,
                                     gene_id_ls=gene_id_ls,
                                     phenData=phenData)
            other_data_pickle = cPickle.dumps(other_data, -1)
            phenotype_label_ls_pickle = cPickle.dumps(phenData.col_id_ls, -1)
            snpData_pickle = cPickle.dumps(snpData, -1)
            sys.exit(2)

        self.communicator = MPI.world.duplicate()
        node_rank = self.communicator.rank
        free_computing_nodes = range(1, self.communicator.size -
                                     1)  #exclude the 1st and last node
        free_computing_node_set = Set(free_computing_nodes)
        output_node_rank = self.communicator.size - 1

        if node_rank == 0:
            dstruc = self.inputNodePrepare()
            params_ls = dstruc.params_ls
            #send the output node the phenotype_label_ls
            self.communicator.send(dstruc.output_node_data_pickle,
                                   output_node_rank, 0)
            del dstruc.output_node_data_pickle

            for node in free_computing_nodes:  #send it to the computing_node
                sys.stderr.write(
                    "passing initial data to nodes from %s to %s ... " %
                    (node_rank, node))
                self.communicator.send(dstruc.snpData_pickle, node, 0)
                self.communicator.send(dstruc.other_data_pickle, node, 0)
                sys.stderr.write(".\n")
            del dstruc

        elif node_rank in free_computing_node_set:
            data, source, tag = self.communicator.receiveString(0, 0)
            snpData = cPickle.loads(data)
            del data
            data, source, tag = self.communicator.receiveString(0, 0)
            other_data = cPickle.loads(data)
            del data
            self.phenotype_index_ls = other_data.phenotype_index_ls
        else:
            data, source, tag = self.communicator.receiveString(0, 0)
            output_node_data_pickle = cPickle.loads(data)
            phenotype_label_ls = output_node_data_pickle.phenotype_label_ls
            self.phenotype_index_ls = output_node_data_pickle.phenotype_index_ls

        self.synchronize()
        if node_rank == 0:
            param_obj = PassingData(params_ls=params_ls,
                                    output_node_rank=output_node_rank,
                                    report=self.report,
                                    counter=0)
            self.inputNode(param_obj,
                           free_computing_nodes,
                           param_generator=params_ls)
            #self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=1)
        elif node_rank in free_computing_node_set:
            computing_parameter_obj = PassingData(snpData=snpData, gene_id_ls=other_data.gene_id_ls, \
                     gene_id2snps_id_ls=other_data.gene_id2snps_id_ls, phenData=other_data.phenData,
                     phenotype_index_ls=self.phenotype_index_ls, min_data_point=self.min_data_point,
                     test_type=self.test_type)
            self.computing_node(computing_parameter_obj,
                                self.computing_node_handler)
        else:
            self.general_output_node(self.output_dir, self.phenotype_index_ls,
                                     phenotype_label_ls, free_computing_nodes)
        self.synchronize()  #to avoid some node early exits
Example #22
	def run(self):
		self.communicator = MPI.world.duplicate()
		node_rank = self.communicator.rank
		free_computing_nodes = range(1, self.communicator.size-1)	#exclude the 1st and last node
		free_computing_node_set = set(free_computing_nodes)
		output_node_rank = self.communicator.size-1
		
		if node_rank == 0:
			if self.debug:
				#for one-node testing purpose
				import pdb
				pdb.set_trace()
			
			db_250k = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
						password=self.db_passwd, hostname=self.hostname, database=self.dbname, 
						schema=self.schema)
			db_250k.setup(create_tables=False)
			session = db_250k.session
			
			# 2010-9-30 get total number of arrays in this CNV method 
			non_duplicate_array_id_ls = CNVMergeAcrossArrays.getNonDuplicateArraysWithHighestMedianIntensity(db_250k, \
										self.cnv_method_id, table_name=Stock_250kDB.CNVArrayCall.table.name)
			non_duplicate_array_id_set = set(non_duplicate_array_id_ls)
			no_of_total_arrays = len(non_duplicate_array_id_ls)
			
			# read in the SNP set with only arrays in the CNV method set
			snpData = SNPData(input_fname=self.input_fname, turn_into_array=1)
			row_index_to_be_kept = []
			for row_id, row_index in snpData.row_id2row_index.iteritems():
				array_id = int(row_id[1])
				if array_id in non_duplicate_array_id_set:
					row_index_to_be_kept.append(row_index)
			snpData = snpData.keepRowsByRowIndex(snpData, row_index_to_be_kept)
			# a map between array_id and its row index in the SNP dataset
			array_id2row_index = {}
			for row_id, row_index in snpData.row_id2row_index.iteritems():
				array_id = int(row_id[1])
				array_id2row_index[array_id] = row_index
			
			# create a map (RBDict) between each CNV and its nearby SNPs
			# get all CNVs from db
			CNVRBdict = self.createCNVRBDict(db_250k, self.cnv_method_id, self.max_CNV_SNP_dist, array_id2row_index = array_id2row_index, \
											snp_id_ls = snpData.col_id_ls)
			snpData.array_id2row_index = array_id2row_index	# passed to computer node later
			
			snpData_pickle = cPickle.dumps(snpData, -1)
			snpData_pickle = zlib.compress(snpData_pickle)
			for node in free_computing_nodes:	#send it to the computing_node
				sys.stderr.write("passing initial data to nodes from %s to %s ... "%(node_rank, node))
				self.communicator.send(snpData_pickle, node, 0)
				sys.stderr.write(".\n")
			del snpData_pickle
			del snpData
			params_ls = self.generate_params(CNVRBdict,)
		elif node_rank in free_computing_node_set:
			db_250k = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
						password=self.db_passwd, hostname=self.hostname, database=self.dbname, 
						schema=self.schema)
			db_250k.setup(create_tables=False)
			session = db_250k.session
			
			data, source, tag = self.communicator.receiveString(0, 0)
			data = zlib.decompress(data)	# 2010-10-1 decompress
			snpData =  cPickle.loads(data)
			del data
		else:
			pass
		
		self.synchronize()
		if node_rank == 0:
			param_obj = PassingData(params_ls=params_ls, output_node_rank=output_node_rank, report=self.report, counter=0)
			self.inputNode(param_obj, free_computing_nodes, param_generator = params_ls, message_size=self.message_size)
		elif node_rank in free_computing_node_set:
			computing_parameter_obj = PassingData(snpData=snpData, min_LD_to_output=self.min_LD_to_output, \
									min_MAF=self.min_MAF, discard_perc=self.discard_perc, db_250k=db_250k, \
									array_id2row_index=snpData.array_id2row_index)
			self.computing_node(computing_parameter_obj, self.computing_node_handler)
		else:
			if getattr(self, 'output_fname', None):
				writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t')
			else:
				writer = None
			param_obj = PassingData(writer=writer, is_header_written=False)
			self.output_node(free_computing_nodes, param_obj, self.output_node_handler)
			del writer
		self.synchronize()	#to avoid some node early exits
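
The SNP dataset pickle above is zlib-compressed before the MPI send and decompressed on the computing node. That round trip in isolation, with a made-up payload:

import cPickle
import zlib

payload = {'array_id2row_index': {1: 0, 2: 1}}  #stand-in for the real snpData object
blob = zlib.compress(cPickle.dumps(payload, -1))
restored = cPickle.loads(zlib.decompress(blob))
assert restored['array_id2row_index'][2] == 1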
Example #23
    def run(self):

        if self.debug:
            import pdb
            pdb.set_trace()

        db = StockDB.StockDB(drivername=self.drivername,
                             username=self.db_user,
                             password=self.db_passwd,
                             hostname=self.hostname,
                             database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        session.begin()

        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.cmp_data_filename)
        strain_acc_list = map(
            int, strain_acc_list
        )  #it's ecotypeid; cast to integer to be compatible with the ecotype_id_ls fetched from db later
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
             data_matrix=data_matrix) #category_list is not used.

        readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
        session.save(readme)

        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.db_user,
                               passwd=self.db_passwd)
        curs = conn.cursor()
        from dbSNP2data import dbSNP2data
        snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
            curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
        strain_info_data = self.get_strain_id_info(self.QC_method_id)
        data_matrix = self.get_data_matrix(db,
                                           strain_info_data.strain_id2index,
                                           snp_id2index,
                                           StockDB.Calls.table.name)
        strain_acc_list = [
            strain_info_data.strain_id2acc[strain_id]
            for strain_id in strain_info_data.strain_id_list
        ]
        category_list = [
            strain_info_data.strain_id2category[strain_id]
            for strain_id in strain_info_data.strain_id_list
        ]
        header = ['ecotypeid', 'strainid']
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            header.append(snp_name)
        snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix, \
            snps_table='stock.snps') #snps_table is set to the stock_250k snps_table

        twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs, \
             QC_method_id=self.QC_method_id, user=self.db_user, row_matching_by_which_value=0, debug=self.debug)
        if self.run_type == 1:
            row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        elif self.run_type == 2:
            #twoSNPData.save_col_wise(session, readme)	#2008-08-18 need to implement a new one for 149SNP
            row_id2NA_mismatch_rate = {}
        else:
            sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
            sys.exit(5)
        if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate,
                                                self.output_fname)

        if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #row_id2NA_mismatch_rate might be None if it's method 0.
            self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.db_user, \
                 twoSNPData.row_id12row_id2, readme)
        if self.commit:
            session.commit()
        else:
            session.rollback()
Example #24
	def loadDataStructure(self, db_250k=None, association_locus_id=None, association_landscape_type_id=None, \
						locusExtensionDistance=5000,\
						data_dir=None, list_type_id_list=None, gene_annotation_pickleFname=None, \
						snpInfoPickleFname=None, locus_type_id=1, snp_matrix_fname=None, snp_matrix_data_type=None, \
						phenotype_fname=None):
		"""
		2012.11.14
		"""
		sys.stderr.write("Fetching GWAS landscape for association-locus %s, landscape type %s ..."%(association_locus_id, association_landscape_type_id))
		# fetch the associationLocus
		associationLocus = Stock_250kDB.AssociationLocus.get(association_locus_id)
		associationLandscapeType = Stock_250kDB.AssociationLandscapeType.get(association_landscape_type_id)
		
		# fetch all result-peaks
		landscape_gwr_ls = []
		# fetch landscape within this interval
		start = max(1, associationLocus.start-locusExtensionDistance)
		stop = associationLocus.stop + locusExtensionDistance
		pd = PassingData(min_MAF=associationLandscapeType.min_MAF, data_dir=data_dir, \
						need_chr_pos_ls=0, chromosome=associationLocus.chromosome, \
						start=start, stop=stop, report=False)	#report controls whether getResultMethodContent() will report progress.
		association_landscape_id_set = set()
		
		for association_peak in associationLocus.association_peak_ls:
			association_landscape = db_250k.getAssociationLandscape(result_id=association_peak.result_id, association_landscape_type_id=associationLandscapeType.id)
			if association_landscape and association_landscape.id not in association_landscape_id_set:
				association_landscape_id_set.add(association_landscape.id)
				genome_wide_result = db_250k.getResultMethodContent(association_landscape=association_landscape, data_dir=data_dir, \
												construct_chr_pos2index=True, pdata=pd)
				landscape_gwr_ls.append(genome_wide_result)
				sys.stderr.write(" %s%s "%('\x08'*80, len(landscape_gwr_ls)))
		sys.stderr.write("%s landscapes.\n"%(len(landscape_gwr_ls)))
		
		centralLocus = SNPPassingData(chromosome=associationLocus.chromosome, position=start, \
						snps_id=associationLocus.id, start=start, stop=stop,
						fileNamePrefix="")
		
		LD_info = None
		gene_annotation = DrawSNPRegion.dealWithGeneAnnotation(gene_annotation_pickleFname)
		if snpInfoPickleFname:
			snp_info = db_250k.dealWithSNPInfo(snpInfoPickleFname, locus_type_id=locus_type_id)	#2012.3.8
		else:
			snp_info = None
		
		candidate_gene_set = set()
		if list_type_id_list:
			for list_type_id in list_type_id_list:
				candidate_gene_list = db_250k.getGeneList(list_type_id)
				candidate_gene_set |= set(candidate_gene_list)
		
		if snp_matrix_fname and phenotype_fname:
			if snp_matrix_data_type==3:
				matrix_data_type=float		#2009-3-23 for CNV amplitude file
			else:
				matrix_data_type=int
			snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1, turn_into_array=1, ignore_2nd_column=1,\
							matrix_data_type=matrix_data_type)
			if snpData.data_matrix is None:
				sys.stderr.write("Error. snpData.data_matrix is None.\n")
				sys.exit(3)
			header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(phenotype_fname, turn_into_integer=0)
			
			phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list, data_matrix=data_matrix_phen)
			#row label is that of the SNP matrix, because the phenotype matrix is going to be re-ordered that way
			
			phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, \
																		strain_acc_list_phen, phenData.data_matrix)
			#tricky: strain_acc_list_phen (the phenotype file's original row order) is passed here, not phenData.strain_acc_list
			
			#2008-12-05 fake a snp_info for findSNPsInRegion
			DrawSNPRegion.construct_chr_pos2index_forSNPData(snpData, snp_info=snp_info)
			ecotype_info = getEcotypeInfo(db_250k)
		else:
			snpData = None
			phenData = None
			ecotype_info = None
		
		return_data = PassingData(associationLocus=associationLocus, associationLandscapeType=associationLandscapeType, \
								landscape_gwr_ls=landscape_gwr_ls, \
								gene_annotation=gene_annotation, snp_info=snp_info, LD_info=LD_info, \
								candidate_gene_set=candidate_gene_set, snpData=snpData, phenData=phenData,\
								ecotype_info=ecotype_info, centralLocus=centralLocus)
		return return_data
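	# --- illustrative aside ---
	# The phenotype matrix above is re-ordered so its rows follow
	# snpData.row_id_ls. A hedged sketch of that re-ordering; the real
	# Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order may differ.
	# Assumes phen_matrix is a float numpy array so missing rows can be NaN-filled.
	def reorderPhenotypeRowsSketch(snp_row_id_ls, phen_row_id_ls, phen_matrix):
		import numpy
		phen_row_id2index = dict((rid, i) for i, rid in enumerate(phen_row_id_ls))
		new_matrix = numpy.full((len(snp_row_id_ls), phen_matrix.shape[1]), numpy.nan)
		for i, row_id in enumerate(snp_row_id_ls):
			j = phen_row_id2index.get(row_id)
			if j is not None:
				new_matrix[i] = phen_matrix[j]
		return new_matrix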
Example #25
    def getCNVQCMatrix(self, probe_id2snp_id_ls, snp_id2tup, snpData,
                       SNP2Col_allele, cnvIntensityData):
        """
		2009-2-12
		"""
        sys.stderr.write("Getting CNV QC matricies ...")
        mismatch_matrix = numpy.zeros(
            [len(snpData.row_id_ls),
             len(probe_id2snp_id_ls)], numpy.int)
        mismatch_matrix[:] = -2
        insertion_matrix = numpy.zeros(mismatch_matrix.shape, numpy.int)
        insertion_matrix[:] = -2
        deletion_matrix = numpy.zeros(mismatch_matrix.shape, numpy.int)
        deletion_matrix[:] = -2
        qc_matrix = numpy.zeros(mismatch_matrix.shape, numpy.int)
        qc_matrix[:] = -2
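        # -2 is a sentinel: any cell still at -2 after the loop below was
        # never evaluated (the probe is NA for that accession)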

        cnv_probe_ls = probe_id2snp_id_ls.keys()
        cnv_probe_ls.sort()
        cnv_probe2index = dict(zip(cnv_probe_ls, range(len(cnv_probe_ls))))

        total_disp_pos_ls = []
        total_intensity_ls = []
        total_mismatch_ls = []
        total_insertion_ls = []
        total_deletion_ls = []
        total_mis_ls = []
        for i in range(mismatch_matrix.shape[0]):
            row_id = snpData.row_id_ls[i]
            if row_id in cnvIntensityData.row_id2row_index:
                cnv_row_index = cnvIntensityData.row_id2row_index[row_id]
                for probe_id, snp_id_ls in probe_id2snp_id_ls.iteritems():
                    col_index = cnv_probe2index[probe_id]
                    probe_id_label = '%s_%s' % (probe_id[0], probe_id[1])
                    cnv_col_index = cnvIntensityData.col_id2col_index[
                        probe_id_label]

                    no_of_mismatches = 0
                    no_of_deletions = 0
                    no_of_insertions = 0
                    is_this_probe_NA = 1
                    disp_pos_ls = []
                    for snp_id, disp_pos in snp_id_ls:
                        snp_id_tup = snp_id2tup[snp_id]
                        disp_pos_ls.append(disp_pos)
                        snp_col_index = snpData.col_id2col_index[snp_id]
                        allele = snpData.data_matrix[i][snp_col_index]
                        col_allele = SNP2Col_allele[snp_id]
                        if allele == -2 or allele == 0:
                            continue
                        else:
                            is_this_probe_NA = 0
                            if snp_id_tup[2] != 0:  #the offset is not 0
                                if allele != -1:  #if it's deleted, then it's nothing
                                    no_of_insertions += 1
                            elif allele == -1:
                                no_of_deletions += 1
                            elif col_allele == -2 or col_allele == 0:
                                sys.stderr.write("allele for this accession %s at snp %s is %s while reference allele is NA: %s.\n"%\
                                    (snpData.row_id_ls[i], snp_id, allele, col_allele))
                            elif allele != col_allele:
                                no_of_mismatches += 1
                    if not is_this_probe_NA:
                        mean_disp_pos = numpy.mean(disp_pos_ls)
                        mismatch_matrix[i][col_index] = no_of_mismatches
                        insertion_matrix[i][col_index] = no_of_insertions
                        deletion_matrix[i][col_index] = no_of_deletions
                        total_mis_count = no_of_mismatches + no_of_insertions + no_of_deletions
                        qc_matrix[i][col_index] = total_mis_count
                        total_disp_pos_ls.append(mean_disp_pos)
                        total_intensity_ls.append(
                            cnvIntensityData.data_matrix[cnv_row_index]
                            [cnv_col_index])
                        total_mismatch_ls.append(no_of_mismatches)
                        total_insertion_ls.append(no_of_insertions)
                        total_deletion_ls.append(no_of_deletions)
                        total_mis_ls.append(total_mis_count)
        plotData = PassingData(total_disp_pos_ls=total_disp_pos_ls, total_intensity_ls=total_intensity_ls,\
             total_mismatch_ls=total_mismatch_ls, total_insertion_ls=total_insertion_ls, total_deletion_ls=total_deletion_ls,\
             total_mis_ls=total_mis_ls)
        mismatchData = SNPData(row_id_ls=snpData.row_id_ls,
                               col_id_ls=cnv_probe_ls,
                               data_matrix=mismatch_matrix)
        insertionData = SNPData(row_id_ls=snpData.row_id_ls,
                                col_id_ls=cnv_probe_ls,
                                data_matrix=insertion_matrix)
        deletionData = SNPData(row_id_ls=snpData.row_id_ls,
                               col_id_ls=cnv_probe_ls,
                               data_matrix=deletion_matrix)
        qcData = SNPData(row_id_ls=snpData.row_id_ls,
                         col_id_ls=cnv_probe_ls,
                         data_matrix=qc_matrix)
        sys.stderr.write("Done.\n")
        return PassingData(mismatchData=mismatchData,
                           insertionData=insertionData,
                           deletionData=deletionData,
                           qcData=qcData,
                           plotData=plotData)
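    # --- illustrative aside ---
    # Hedged restatement of the per-SNP branch in the loop above, with the
    # encodings that loop assumes: -2/0 = missing call, -1 = deletion,
    # offset != 0 = the probe position is an inserted base.
    def classify_call_sketch(allele, col_allele, offset):
        if allele == -2 or allele == 0:
            return 'NA'
        if offset != 0:
            # an inserted base only counts when the accession isn't deleted there
            return 'insertion' if allele != -1 else 'uncounted'
        if allele == -1:
            return 'deletion'
        if col_allele == -2 or col_allele == 0:
            return 'reference-NA'  # the loop above only writes a warning here
        return 'mismatch' if allele != col_allele else 'uncounted'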
Example #26
    def outputArray(cls, session, curs, output_dir=None, array_info_table=None, snps=None, \
       probes=None, array_id_ls=[], \
       xy_ls=[], chr_pos_ls=[], probes_id_ls=[],\
       call_method_id=0, run_type=1, array_file_directory=None, outputCNVIntensity=True,\
       returnArrayIntensityData=False):
        """
		2010-5-10
			curs could be elixirdb.metadata.bind or MySQLdb.connect
		2010-5-5
			changed to classmethod
			add argument outputCNVIntensity: whether to output CNV intensity data, default=True.
				returnArrayIntensityData: whether to return array CNV intensity data in a SNPData structure
		2009-10-9
			add argument array_file_directory.
		2009-3-11
			add run_type=3
				calculate the intensity median of all probes in the array and store the value in db
			array_id_ls is a list of array_ids in str type
		2009-3-5
			skip if no probes (if one_snp.probes_id_ls == [-1]*4:) for that SNP (fake SNP in the SNP table)
		2008-12-09
			add option run_type
		2008-07-12
			add option array_id
		2008-04-08
		"""
        sys.stderr.write("Outputting arrays ... \n")
        import rpy
        rpy.r.library('affy')
        array_width = None
        if run_type != 3 and output_dir and not os.path.isdir(
                output_dir):  #2010-5-5 create output_dir if it's given but doesn't exist yet
            os.makedirs(output_dir)

        sql_query = cls.generateSQLQueryToGetArrays(array_info_table, array_id_ls=array_id_ls, \
                  call_method_id=call_method_id, run_type=run_type)
        print sql_query
        rows = curs.execute(sql_query)
        is_elixirdb = 1  # 2010-5-10 By default, assume curs is elixirdb.metadata.bind
        if hasattr(curs, 'fetchall'):  # 2010-5-10 curs is MySQLdb.connect
            rows = curs.fetchall()
            is_elixirdb = 0
            no_of_objects = len(rows)
        else:
            no_of_objects = int(rows.rowcount)

        if run_type == 2:  #2008-12-09 don't initialize the data_matrix if run_type is not 2 (CNV probe).
            data_matrix = numpy.zeros([len(probes_id_ls), no_of_objects],
                                      numpy.float32)
        array_id_avail_ls = []
        array_label_ls = []
        i = 0
        for row in rows:
            if is_elixirdb:
                array_id = row.array_id
                filename = row.filename
                ecotype_id = row.maternal_ecotype_id
            else:
                array_id, filename, ecotype_id = row[:3]
            array_id_avail_ls.append(array_id)
            array_label_ls.append('%s_%s' % (array_id, ecotype_id))

            if array_file_directory and os.path.isdir(array_file_directory):
                filename = os.path.join(array_file_directory,
                                        os.path.split(filename)[1])

            sys.stderr.write("\t%d/%d: Extracting intensity from %s ... \n" %
                             (i + 1, no_of_objects, filename))

            if run_type == 1:  #output SNP probe intensity within the loop
                output_fname = os.path.join(
                    output_dir, '%s_array_intensity.tsv' % (array_id))
                if os.path.isfile(output_fname):
                    sys.stderr.write("\tFile %s already exists. Ignore.\n" %
                                     (output_fname))
                    continue

            #read array by calling R
            if array_width is None:
                returnData = cls.getArrayWidth(filename)
                intensity_array = returnData.intensity_array
                array = returnData.array
                array_width = returnData.array_width
            else:
                array = rpy.r.read_affybatch(filenames=filename)
                intensity_array = rpy.r.intensity(
                    array)  #returns a length x 1 2-dimensional array.

            if run_type == 2:  #CNV probe
                for j in range(len(xy_ls)):
                    xpos, ypos = xy_ls[j]
                    #chromosome, position = chr_pos_ls[j]
                    intensity_array_index = array_width * (array_width - xpos -
                                                           1) + ypos
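                    # flat index into the 1-D CEL intensity vector; xpos is
                    # mirrored (array_width - xpos - 1) to match the probe
                    # layout this code assumes for the chip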
                    #output_row = [chromosome, position]
                    intensity = math.log10(
                        intensity_array[intensity_array_index][0])
                    #output_row.append(intensity)
                    #writer.writerow(output_row)
                    data_matrix[j][i] = intensity
            elif run_type == 1:  #SNP probe intensity
                writer = csv.writer(open(output_fname, 'w'), delimiter='\t')
                header = ['sense1', 'sense2', 'antisense1', 'antisense2']

                func = lambda x: '%s_%s' % (array_id, x)
                header = map(func, header)
                header = ['SNP_ID'] + header
                writer.writerow(header)
                for snps_id in snps.snps_id_ls:
                    one_snp = snps.get_one_snp(snps_id)
                    output_row = [one_snp.snpid]
                    if one_snp.probes_id_ls == [-1] * 4:  #2009-3-5 skip if no probes for that SNP (fake SNP in the SNP table)
                        continue
                    for probes_id in one_snp.probes_id_ls:
                        one_probe = probes.get_one_probe(probes_id)
                        intensity_array_index = array_width * (
                            array_width - one_probe.xpos - 1) + one_probe.ypos
                        output_row.append(
                            intensity_array[intensity_array_index][0])
                    writer.writerow(output_row)
                del writer
            elif run_type == 3:  #calculate the intensity median of all probes and store it in db
                median_intensity = numpy.median(intensity_array)
                array_info_entry = Stock_250kDB.ArrayInfo.get(array_id)
                array_info_entry.median_intensity = median_intensity
                session.add(array_info_entry)
            else:
                sys.stderr.write("Error: run_type %s is not supported.\n" %
                                 run_type)
                sys.exit(3)

            del intensity_array, array
            i += 1

        if run_type == 2 and outputCNVIntensity:
            #2008-11-13 output in Roger's multi-sample format
            header = ['probes_id'] + array_id_avail_ls + ['chromosome', 'position']
            output_fname = os.path.join(
                output_dir,
                'call_method_%s_CNV_intensity.tsv' % (call_method_id))

            writer = csv.writer(open(output_fname, 'w'), delimiter='\t')
            writer.writerow(header)
            for i in range(data_matrix.shape[0]):
                data_row = [probes_id_ls[i]] + list(data_matrix[i]) + list(
                    chr_pos_ls[i])
                writer.writerow(data_row)
            del writer
        sys.stderr.write("Done.\n")
        if returnArrayIntensityData:  #2010-5-5
            arrayIntensityData = SNPData(row_id_ls=xy_ls,
                                         col_id_ls=array_label_ls,
                                         data_matrix=data_matrix)
            return arrayIntensityData
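    # --- illustrative aside ---
    # A hedged reader for the multi-sample CNV intensity file written above,
    # assuming the run_type == 2 header layout: 'probes_id', one column per
    # array id, then 'chromosome' and 'position'.
    def read_cnv_intensity_sketch(fname):
        import csv
        reader = csv.reader(open(fname), delimiter='\t')
        header = next(reader)
        array_id_ls = header[1:-2]  # columns between probes_id and chromosome/position
        probes_id_ls, data_rows, chr_pos_ls = [], [], []
        for row in reader:
            probes_id_ls.append(row[0])
            data_rows.append([float(v) for v in row[1:-2]])
            chr_pos_ls.append((row[-2], row[-1]))
        return probes_id_ls, array_id_ls, data_rows, chr_pos_ls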
Example #27
    def prepareTwoSNPData(self,
                          db,
                          max_mismatch_rate=0.25,
                          min_no_of_non_NA_pairs=40,
                          report=0):
        """
		2009-9-23
			add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData.
			However, they are useless for controlling what gets inserted into the db: TwoSNPData.qc_cross_match_table is
			not defined, and even if it were, the table it would create wouldn't match the one in the 149SNP db.
		2008-09-10
			if self.input_fname is given, get the 149SNP data from it instead of from the database
		2008-8-28
			split out of run() so that MpiQC149CrossMatch could call this easily
		"""
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.db_user,
                               passwd=self.db_passwd)
        curs = conn.cursor()
        if self.input_fname:
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname)
        else:
            from dbSNP2data import dbSNP2data
            snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
                curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
            strain_info_data = self.get_strain_id_info(
                self.QC_method_id, ignore_strains_with_qc=False)
            data_matrix = self.get_data_matrix(
                db, strain_info_data.strain_id2index, snp_id2index,
                StockDB.Calls.table.name)
            strain_acc_list = [
                strain_info_data.strain_id2acc[strain_id]
                for strain_id in strain_info_data.strain_id_list
            ]  #tg_ecotypeid
            category_list = [
                strain_info_data.strain_id2category[strain_id]
                for strain_id in strain_info_data.strain_id_list
            ]  #strainid
            header = ['ecotypeid', 'strainid']
            for snp_id in snp_id_list:
                snp_name, chromosome, position = snp_id2info[snp_id]
                header.append(snp_name)
        snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix, \
            snps_table='stock.snps') #snps_table points at the stock (149SNP) db's snps table
        if self.QC_method_id == 4:
            snpData2 = snpData1
        else:
            self.cmp_data_filename = self.findOutCmpDataFilename(
                self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.cmp_data_filename)
            strain_acc_list = map(
                int, strain_acc_list
            )  #it's the ecotypeid; cast to integer to be compatible with the ecotype_id_ls fetched later from the db
            snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
                data_matrix=data_matrix) #category_list is not used to facilitate row-id matching


        twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, \
             QC_method_id=self.QC_method_id, user=self.db_user, row_matching_by_which_value=0, debug=self.debug,\
             max_mismatch_rate=max_mismatch_rate, min_no_of_non_NA_pairs=min_no_of_non_NA_pairs, report=report)
        return twoSNPData
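    # --- illustrative aside ---
    # Hedged restatement of what the two thresholds passed to TwoSNPData
    # above are meant for: a cross-match result is only worth storing when
    # enough informative (non-NA) call pairs exist and the mismatch rate is
    # low enough.
    def should_store_cross_match_sketch(mismatch_rate, no_of_non_NA_pairs,
                                        max_mismatch_rate=0.25,
                                        min_no_of_non_NA_pairs=40):
        return (no_of_non_NA_pairs >= min_no_of_non_NA_pairs
                and 0 <= mismatch_rate <= max_mismatch_rate)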
Example #28
    def run(self):
        """
		2008-04-25
			return None if QC_method_id==0
		2008-04-20
			for plone to call it just to get row_id2NA_mismatch_rate
		"""
        #database connection and etc
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.user,
                                       password=self.passwd,
                                       hostname=self.hostname,
                                       database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        session.begin()
        #transaction = session.create_transaction()

        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, self.QCMethod_class)
        qm = self.QCMethod_class.query.get(self.QC_method_id)  #2009-5-20

        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        curs = conn.cursor()
        self.curs = curs
        if self.debug:
            import pdb
            pdb.set_trace()

        readme = formReadmeObj(sys.argv, self.ad, Stock_250kDB.README)
        session.add(readme)

        QC_method_id2snps_table = self.QC_method_id2snps_table

        if self.QC_method_id == 0:
            self.cal_independent_NA_rate(db, self.min_probability, readme)
            row_id2NA_mismatch_rate = None
        else:
            #from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.cmp_data_filename, ignore_het=qm.ignore_het)
            strain_acc_list = map(
                int, strain_acc_list
            )  #it's the ecotypeid; cast to integer to be compatible with the ecotype_id_ls fetched later from the db
            snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
                data_matrix=data_matrix, snps_table=QC_method_id2snps_table.get(self.QC_method_id),\
                ignore_het=qm.ignore_het) #category_list is not used. 05/20/09 ignore_het is useless because data_matrix is provided.
            """
			if self.input_dir and os.path.isdir(self.input_dir):
				#04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
				#no submission to db
				call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
			"""
            if self.input_dir and os.path.isfile(self.input_dir):  #it's file
                call_info_id2fname = None
            else:
                if self.run_type == 2:  #no filtering on call_info entries that have been QCed.
                    filter_calls_QCed = 0
                elif self.run_type == 1:
                    filter_calls_QCed = 1
                    self.max_call_info_mismatch_rate = 1  #don't use this when doing accession-wise QC
                else:
                    sys.stderr.write("run_type=%s is not supported.\n" %
                                     self.run_type)
                    sys.exit(5)
                call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
                          filter_calls_QCed, self.max_call_info_mismatch_rate, self.debug,\
                          min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs, input_dir=self.input_dir)
                call_info_id2fname = call_data.call_info_id2fname
                call_info_ls_to_return = call_data.call_info_ls_to_return
            if self.run_type == 2:
                snps_name2snps_id = self.get_snps_name2snps_id(db)
            else:
                snps_name2snps_id = None

            if call_info_id2fname:
                db_id2chr_pos = db.getSNPID2ChrPos()  #2011-22
                from DB_250k2data import DB_250k2Data
                db_id2index = DB_250k2Data.getSNPID2index(
                    call_info_id2fname.values()[0][1], db_id2chr_pos)
                if self.one_by_one and self.run_type == 1:  #one_by_one only for QC by accession
                    row_id2NA_mismatch_rate = {}
                    row_id12row_id2 = {}
                    counter = 0
                    for call_info_id, value in call_info_id2fname.iteritems():
                        counter += 1
                        print "No", counter
                        tmp_dict = {}
                        tmp_dict[call_info_id] = value
                        pdata = self.read_call_matrix(
                            tmp_dict,
                            self.min_probability,
                            db_id2chr_pos=db_id2chr_pos,
                            db_id2index=db_id2index)
                        #05/20/09 no need for qm.ignore_het because 250k is all homozygous
                        passingdata = self.qcDataMatrixVSsnpData(
                            pdata, snps_name2snps_id, snpData2, curs, session,
                            readme)
                        row_id2NA_mismatch_rate.update(
                            passingdata.row_id2NA_mismatch_rate)
                        row_id12row_id2.update(passingdata.row_id12row_id2)
                        del pdata

                        if self.debug and counter == 10:
                            break
                else:
                    pdata = self.read_call_matrix(call_info_id2fname,
                                                  self.min_probability,
                                                  db_id2chr_pos=db_id2chr_pos,
                                                  db_id2index=db_id2index)
                    #05/20/09 no need for qm.ignore_het because 250k is all homozygous
                    passingdata = self.qcDataMatrixVSsnpData(
                        pdata, snps_name2snps_id, snpData2, curs, session,
                        readme)
                    row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                    row_id12row_id2 = passingdata.row_id12row_id2
                    del pdata
            else:
                #input file is SNP by strain format. double header (1st two lines)
                header, snps_name_ls, category_list, data_matrix = read_data(
                    self.input_dir, double_header=1, ignore_het=qm.ignore_het)
                pdata = PassingData()
                pdata.ecotype_id_ls = header[0][2:]
                pdata.call_info_id_ls = header[1][2:]
                data_matrix = numpy.array(data_matrix)
                pdata.data_matrix = data_matrix.transpose()
                pdata.header = ['', ''] + snps_name_ls  #fake a header for SNPData
                passingdata = self.qcDataMatrixVSsnpData(
                    pdata, snps_name2snps_id, snpData2, curs, session, readme)
                row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                row_id12row_id2 = passingdata.row_id12row_id2
                del pdata

        if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate,
                                                self.output_fname)

        if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #row_id2NA_mismatch_rate might be None if it's method 0.
            self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.user, self.min_probability, \
                 row_id12row_id2, self.call_method_id, readme)
        if self.commit:
            curs.execute("commit")
            session.commit()
        else:
            session.rollback()

        self.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate  #for plone to get the data structure
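    # --- illustrative aside ---
    # The "double header" SNP-by-strain layout parsed in the else-branch
    # above, with made-up values:
    #
    #   name    pos    4001   4002    <- ecotype ids   (header[0][2:])
    #   name    pos    101    102     <- call_info ids (header[1][2:])
    #   1_657   657    1      2
    #   1_3102  3102   3      3
    #
    # read_data(..., double_header=1) returns both header rows; the matrix
    # comes back SNP x strain, hence the transpose() before QC.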
Example #29
    def run(self):
        self.communicator = MPI.world.duplicate()
        node_rank = self.communicator.rank
        free_computing_nodes = range(1, self.communicator.size - 1)  #exclude rank 0 (input node) and the last rank (output node)
        free_computing_node_set = Set(free_computing_nodes)
        output_node_rank = self.communicator.size - 1
        """
		if node_rank!=output_node_rank:
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
							data_matrix=data_matrix)	#category_list is not used to facilitate row-id matching
		"""
        if node_rank == 0:
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname)
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
                data_matrix=data_matrix) #category_list is not used to facilitate row-id matching
            snpData_pickle = cPickle.dumps(snpData, -1)
            for node in free_computing_nodes:  #send it to the computing_node
                sys.stderr.write(
                    "passing initial data to nodes from %s to %s ... " %
                    (node_rank, node))
                self.communicator.send(snpData_pickle, node, 0)
                sys.stderr.write(".\n")
            del snpData_pickle
            params_ls = self.generate_params(len(snpData.col_id_ls),
                                             self.block_size)
            del snpData
        elif node_rank in free_computing_node_set:
            data, source, tag = self.communicator.receiveString(0, 0)
            snpData = cPickle.loads(data)
            del data
        else:
            pass

        self.synchronize()
        if node_rank == 0:
            param_obj = PassingData(params_ls=params_ls,
                                    output_node_rank=output_node_rank,
                                    report=self.report,
                                    counter=0)
            self.inputNode(param_obj,
                           free_computing_nodes,
                           param_generator=params_ls)
            #self.input_node(param_obj, free_computing_nodes, input_handler=self.input_handler, message_size=1)
            #self.input_node(param_obj, free_computing_nodes, self.message_size)
        elif node_rank in free_computing_node_set:
            computing_parameter_obj = PassingData(
                snpData=snpData,
                min_LD_to_output=self.min_LD_to_output,
                min_MAF=self.min_MAF,
                discard_perc=self.discard_perc)
            self.computing_node(computing_parameter_obj,
                                self.computing_node_handler)
        else:
            if getattr(self, 'output_fname', None):
                writer = csv.writer(open(self.output_fname, 'w'),
                                    delimiter='\t')
                #header_row = ['snp1_id', 'snp2_id', 'r2', 'D', "D'", "no_of_pairs"]
                #writer.writerow(header_row)
            else:
                writer = None
            param_obj = PassingData(writer=writer, is_header_written=False)
            self.output_node(free_computing_nodes, param_obj,
                             self.output_node_handler)
            del writer
        self.synchronize()  #so that no node exits early
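    # --- illustrative aside ---
    # generate_params(len(snpData.col_id_ls), self.block_size) presumably
    # partitions the SNP columns into block pairs so that each computing
    # node handles one chunk of the pairwise-LD matrix; a hedged sketch:
    def generate_params_sketch(no_of_cols, block_size):
        starts = range(0, no_of_cols, block_size)
        for s1 in starts:
            for s2 in starts:
                if s2 >= s1:  # upper triangle only; (i, j) duplicates (j, i)
                    yield (s1, min(s1 + block_size, no_of_cols),
                           s2, min(s2 + block_size, no_of_cols))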
Example #30
    def plone_run(self, min_call_info_mismatch_rate=0.1):
        """
		2009-6-9
			pass self.max_mismatch_rate, self.min_no_of_non_NA_pairs to TwoSNPData to filter entries stored in db.
		2009-4-13
			add min_call_info_mismatch_rate
		2009-2-5
			add "create_tables=False" to db.setup()
		2008-07-02
			fix a bug that caused the program to continue reading data even when call_info_id2fname is empty and input_dir is null.
		2008-07-01
			adjust to the newest functions in QC_250k.py
		2008-04-25
			return None if QC_method_id==0
		2008-04-20
			for plone to call it just to get row_id2NA_mismatch_rate
		"""

        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        curs = conn.cursor()
        self.curs = curs

        #database connection and etc
        db = Stock_250kDB.Stock_250kDB(username=self.user,
                                       password=self.passwd,
                                       hostname=self.hostname,
                                       database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        session.begin()
        #transaction = session.create_transaction()
        # if cmp_data_filename is not specified, try to find it in the data_description column of table QC_method.
        qm = QCMethod.query.get(self.QC_method_id)
        if not self.cmp_data_filename and self.QC_method_id != 0:
            if qm.data_description:
                data_description_ls = qm.data_description.split('=')
                if len(data_description_ls) > 1:
                    self.cmp_data_filename = data_description_ls[1].strip()

        #if cmp_data_filename is still empty after the db query, exit the program.
        if not self.cmp_data_filename and self.QC_method_id != 0:
            sys.stderr.write(
                "cmp_data_filename is still empty even after the db query. Please specify it on the command line.\n"
            )
            sys.exit(3)

        #from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.cmp_data_filename)
        strain_acc_list = map(
            int, strain_acc_list
        )  #it's the ecotypeid; cast to integer to be compatible with the ecotype_id_ls fetched later from the db
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
            data_matrix=data_matrix, snps_table=self.QC_method_id2snps_table.get(self.QC_method_id), ignore_het=qm.ignore_het)
        #category_list is not used.

        if self.input_dir:
            #04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #no submission to db
            call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
        else:
            #call_info_id2fname = self.get_call_info_id2fname(curs, self.call_info_table, self.call_QC_table, self.QC_method_id)
            call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
             filter_calls_QCed=0, max_call_info_mismatch_rate=1, min_call_info_mismatch_rate=min_call_info_mismatch_rate,\
             debug=self.debug)
            call_info_id2fname = call_data.call_info_id2fname
            call_info_ls_to_return = call_data.call_info_ls_to_return

        #2008-07-01 pick the call_info_ids to be handled
        new_call_info_id2fname = {}
        for call_info_id_wanted in self.call_info_id_ls:
            if call_info_id_wanted in call_info_id2fname:
                new_call_info_id2fname[
                    call_info_id_wanted] = call_info_id2fname[
                        call_info_id_wanted]
            elif self.report:
                sys.stderr.write("%s not in call_info_id2fname.\n" %
                                 (call_info_id_wanted))
        call_info_id2fname = new_call_info_id2fname

        if call_info_id2fname:
            pdata = self.read_call_matrix(call_info_id2fname,
                                          self.min_probability)
            header = pdata.header
            call_info_id_ls = pdata.call_info_id_ls
            array_id_ls = pdata.array_id_ls
            ecotype_id_ls = pdata.ecotype_id_ls
            data_matrix = pdata.data_matrix
        elif self.input_dir:  #2008-07-02
            #input file is SNP by strain format. double header (1st two lines)
            header, snps_name_ls, category_list, data_matrix = FilterStrainSNPMatrix.read_data(
                self.input_dir, double_header=1)
            ecotype_id_ls = header[0][2:]
            call_info_id_ls = header[1][2:]
            data_matrix = numpy.array(data_matrix)
            data_matrix = data_matrix.transpose()
            header = ['', ''] + snps_name_ls  #fake a header for SNPData
        else:  #2008-07-02
            sys.stderr.write("No good arrays.\n")
            return None

        snps_name2snps_id = None

        #swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData. now strain_acc_list=ecotype_id_ls
        snpData1 = SNPData(header=header, strain_acc_list=ecotype_id_ls, category_list= call_info_id_ls, data_matrix=data_matrix, \
            min_probability=self.min_probability, call_method_id=self.call_method_id, col_id2id=snps_name2snps_id,\
            max_call_info_mismatch_rate=self.max_call_info_mismatch_rate, snps_table='stock_250k.snps') #snps_table is set to the stock_250k snps_table

        twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs, \
             QC_method_id=self.QC_method_id, user=self.user, row_matching_by_which_value=0, debug=self.debug,\
             max_mismatch_rate=self.max_mismatch_rate, min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs)
        #2009-6-9 cross-matching results whose mismatch_rates are below max_mismatch_rate will be put into the db.

        row_id2NA_mismatch_rate = None

        #2008-05-01 create a cross match table temporarily
        twoSNPData.qc_cross_match_table = 'qc_cross_match'
        twoSNPData.new_QC_cross_match_table = self.new_QC_cross_match_table
        twoSNPData.cal_row_id2pairwise_dist()  #database submission is done along the way.
        return row_id2NA_mismatch_rate
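    # --- illustrative aside ---
    # cal_row_id2pairwise_dist() cross-matches rows of snpData1 against rows
    # of snpData2 and, per the 2009-6-9 note above, stores pairs whose
    # mismatch rate is below max_mismatch_rate. A hedged sketch of the
    # per-pair core, assuming 0 encodes NA:
    def pairwise_mismatch_rate_sketch(row1, row2):
        no_of_pairs = no_of_mismatches = 0
        for a, b in zip(row1, row2):
            if a != 0 and b != 0:
                no_of_pairs += 1
                if a != b:
                    no_of_mismatches += 1
        return float(no_of_mismatches) / no_of_pairs if no_of_pairs else None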