예제 #1
0
	def get_data_matrix(self, db, phenotype_info, list_type_analysis_method_info, where_condition):
		"""
		Build the -log10(pvalue) matrix of (list type, analysis method) rows by phenotype columns.

		Arguments:
			db: database handle; the query goes through db.metadata.bind.execute().
			phenotype_info: its phenotype_method_id2index maps a phenotype_method_id
				to a column index.
			list_type_analysis_method_info: its list_type_id_analysis_method_id2index
				maps a (list_type_id, analysis_method_id) tuple to a row index.
			where_condition: SQL text spliced verbatim after "from"; it has to define
				the r and c aliases the select list refers to. It is not escaped, so it
				must never carry untrusted input.

		Returns:
			PassingData with data_matrix, min_value and max_value. A cell holds -1 when
			no row was returned for it, -2 when the pvalue is not positive, otherwise
			-log10(pvalue). min_value/max_value only cover the -log10 values and stay
			None if there were none.

		Raises:
			KeyError: a returned row refers to an id that is missing from the index maps.
		"""
		sys.stderr.write("Getting data matrix ...")
		no_of_rows = len(list_type_analysis_method_info.list_type_id_analysis_method_id2index)
		no_of_cols = len(phenotype_info.phenotype_method_id2index)
		# plain float instead of num.float: same dtype, and the num.float alias is gone
		# from NumPy >= 1.24 (assuming num is numpy)
		data_matrix = num.zeros([no_of_rows, no_of_cols], float)
		data_matrix[:] = -1	# -1 marks "no result for this cell"
		rows = db.metadata.bind.execute("select r.analysis_method_id, r.phenotype_method_id, c.* from %s order by analysis_method_id"\
								%(where_condition))
		min_value = None
		max_value = None
		for row in rows:
			tup = (row.list_type_id, row.analysis_method_id)
			row_index = list_type_analysis_method_info.list_type_id_analysis_method_id2index[tup]
			col_index = phenotype_info.phenotype_method_id2index[row.phenotype_method_id]
			if row.pvalue>0:
				data_value = -math.log10(row.pvalue)
				if min_value is None or data_value<min_value:
					min_value = data_value
				if max_value is None or data_value>max_value:
					max_value = data_value
			else:
				data_value = -2	#0 pvalue, log10 is undefined
			data_matrix[row_index, col_index] = data_value
		sys.stderr.write("Done.\n")
		return_data = PassingData()
		return_data.data_matrix = data_matrix
		return_data.min_value = min_value
		return_data.max_value = max_value
		return return_data
예제 #2
0
    def get_data_matrix(self,
                        db,
                        strain_id_info,
                        target_id_info,
                        QC_method_id,
                        max_mismatch_rate,
                        min_no_of_non_NAs=20):
        """
        2008-08-29

        Fill a strain x target matrix with mismatch rates from StockDB.QCCrossMatch.

        Arguments:
            db: not used in this method.
            strain_id_info: strain_id_ls sizes the rows, strain_id2index maps
                row.strainid to a row index.
            target_id_info: strain_id_ls sizes the columns, strain_id2index maps
                row.target_id to a column index.
            QC_method_id: only entries with this qc_method_id are read.
            max_mismatch_rate: entries with a larger mismatch_rate are left out.
            min_no_of_non_NAs: entries need strictly more non-NA pairs than this.

        Returns:
            PassingData with data_matrix (-1 where nothing was read), min_value
            (smallest non-negative rate) and max_value (largest rate of any sign);
            both are None when no entry contributed.

        Raises:
            KeyError: an entry's strainid/target_id is missing from the index maps.
        """
        sys.stderr.write("Getting data matrix ... \n")
        # plain float instead of num.float: same dtype, and the num.float alias is gone
        # from NumPy >= 1.24 (assuming num is numpy)
        data_matrix = num.zeros(
            [len(strain_id_info.strain_id_ls),
             len(target_id_info.strain_id_ls)], float)
        data_matrix[:] = -1
        block_size = 10000
        # NOTE(review): the threshold is exclusive (>) here, while
        # get_data_matrixFromFile uses >= for the same parameter - confirm which is meant.
        query = StockDB.QCCrossMatch.query.filter_by(
            qc_method_id=QC_method_id).filter(
                StockDB.QCCrossMatch.no_of_non_NA_pairs > min_no_of_non_NAs
            ).filter(StockDB.QCCrossMatch.mismatch_rate <= max_mismatch_rate)
        min_value = None
        max_value = None
        i = 0  # rows consumed so far, doubles as the offset of the next block
        # .all() fetches a block with a single query; the former rows.count() test
        # issued an extra COUNT query before every fetch.
        # NOTE(review): offset/limit paging without an ORDER BY relies on the
        # database returning rows in a stable order.
        rows = query.offset(i).limit(block_size).all()
        while rows:
            for row in rows:
                row_index = strain_id_info.strain_id2index[row.strainid]
                col_index = target_id_info.strain_id2index[row.target_id]
                data_value = row.mismatch_rate
                # only non-negative rates may become the minimum ...
                if data_value >= 0 and (min_value is None
                                        or data_value < min_value):
                    min_value = data_value
                # ... whereas the maximum is tracked for every value
                if max_value is None or data_value > max_value:
                    max_value = data_value
                data_matrix[row_index, col_index] = data_value
                i += 1
            if self.report:
                sys.stderr.write("%s\t%s" % ('\x08' * 40, i))
            rows = query.offset(i).limit(block_size).all()
        sys.stderr.write("Done.\n")
        return_data = PassingData()
        return_data.data_matrix = data_matrix
        return_data.min_value = min_value
        return_data.max_value = max_value
        return return_data
    def get_data_matrix(self, db, phenotype_info,
                        list_type_analysis_method_info, where_condition):
        """
        Build the -log10(pvalue) matrix of (list type, analysis method) rows by phenotype columns.

        Arguments:
            db: database handle; the query goes through db.metadata.bind.execute().
            phenotype_info: its phenotype_method_id2index maps a phenotype_method_id
                to a column index.
            list_type_analysis_method_info: its list_type_id_analysis_method_id2index
                maps a (list_type_id, analysis_method_id) tuple to a row index.
            where_condition: SQL text spliced verbatim after "from"; it has to define
                the r and c aliases the select list refers to. It is not escaped, so
                it must never carry untrusted input.

        Returns:
            PassingData with data_matrix, min_value and max_value. A cell holds -1
            when no row was returned for it, -2 when the pvalue is not positive,
            otherwise -log10(pvalue). min_value/max_value only cover the -log10
            values and stay None if there were none.

        Raises:
            KeyError: a returned row refers to an id missing from the index maps.
        """
        sys.stderr.write("Getting data matrix ...")
        row_id2index = list_type_analysis_method_info.list_type_id_analysis_method_id2index
        col_id2index = phenotype_info.phenotype_method_id2index
        # plain float instead of num.float: same dtype, and the num.float alias is gone
        # from NumPy >= 1.24 (assuming num is numpy)
        data_matrix = num.zeros([len(row_id2index), len(col_id2index)], float)
        data_matrix[:] = -1  # -1 marks "no result for this cell"
        rows = db.metadata.bind.execute("select r.analysis_method_id, r.phenotype_method_id, c.* from %s order by analysis_method_id"\
              %(where_condition))
        min_value = None
        max_value = None
        for row in rows:
            tup = (row.list_type_id, row.analysis_method_id)
            row_index = row_id2index[tup]
            col_index = col_id2index[row.phenotype_method_id]
            if row.pvalue > 0:
                data_value = -math.log10(row.pvalue)
                if min_value is None or data_value < min_value:
                    min_value = data_value
                if max_value is None or data_value > max_value:
                    max_value = data_value
            else:
                data_value = -2  #0 pvalue, log10 is undefined
            data_matrix[row_index, col_index] = data_value
        sys.stderr.write("Done.\n")
        return_data = PassingData()
        return_data.data_matrix = data_matrix
        return_data.min_value = min_value
        return_data.max_value = max_value
        return return_data
예제 #4
0
	def get_data_matrix(self, db, strain_id_info, target_id_info,  QC_method_id, max_mismatch_rate, min_no_of_non_NAs=20):
		"""
		2008-08-29

		Fill a strain x target matrix with mismatch rates from StockDB.QCCrossMatch.

		Arguments:
			db: not used in this method.
			strain_id_info: strain_id_ls sizes the rows, strain_id2index maps
				row.strainid to a row index.
			target_id_info: strain_id_ls sizes the columns, strain_id2index maps
				row.target_id to a column index.
			QC_method_id: only entries with this qc_method_id are read.
			max_mismatch_rate: entries with a larger mismatch_rate are left out.
			min_no_of_non_NAs: entries need strictly more non-NA pairs than this.

		Returns:
			PassingData with data_matrix (-1 where nothing was read), min_value
			(smallest non-negative rate) and max_value (largest rate of any sign);
			both are None when no entry contributed.

		Raises:
			KeyError: an entry's strainid/target_id is missing from the index maps.
		"""
		sys.stderr.write("Getting data matrix ... \n")
		# plain float instead of num.float: same dtype, and the num.float alias is gone
		# from NumPy >= 1.24 (assuming num is numpy)
		data_matrix = num.zeros([len(strain_id_info.strain_id_ls), len(target_id_info.strain_id_ls)], float)
		data_matrix[:] = -1
		block_size = 10000
		# NOTE(review): the threshold is exclusive (>) here, while
		# get_data_matrixFromFile uses >= for the same parameter - confirm which is meant.
		query = StockDB.QCCrossMatch.query.filter_by(qc_method_id=QC_method_id).filter(StockDB.QCCrossMatch.no_of_non_NA_pairs>min_no_of_non_NAs).filter(StockDB.QCCrossMatch.mismatch_rate<=max_mismatch_rate)
		min_value = None
		max_value = None
		i = 0	# rows consumed so far, doubles as the offset of the next block
		# .all() fetches a block with a single query; the former rows.count() test
		# issued an extra COUNT query before every fetch.
		# NOTE(review): offset/limit paging without an ORDER BY relies on the
		# database returning rows in a stable order.
		rows = query.offset(i).limit(block_size).all()
		while rows:
			for row in rows:
				row_index = strain_id_info.strain_id2index[row.strainid]
				col_index = target_id_info.strain_id2index[row.target_id]
				data_value = row.mismatch_rate
				# only non-negative rates may become the minimum ...
				if data_value>=0 and (min_value is None or data_value<min_value):
					min_value = data_value
				# ... whereas the maximum is tracked for every value
				if max_value is None or data_value>max_value:
					max_value = data_value
				data_matrix[row_index, col_index] = data_value
				i += 1
			if self.report:
				sys.stderr.write("%s\t%s"%('\x08'*40, i))
			rows = query.offset(i).limit(block_size).all()
		sys.stderr.write("Done.\n")
		return_data = PassingData()
		return_data.data_matrix = data_matrix
		return_data.min_value = min_value
		return_data.max_value = max_value
		return return_data
예제 #5
0
	def getDataMatrix(self, results_method_id2gene_set, results_method_id_info):
		"""
		Build a symmetric matrix holding the gene-set overlap of every pair of results methods.

		Arguments:
			results_method_id2gene_set: maps a results_method_id to its set of genes.
			results_method_id_info: results_method_id_ls is the ordered id list that
				sizes the matrix, results_method_id2index maps an id to its position.

		Returns:
			PassingData with data_matrix, min_value and max_value. A cell (the diagonal
			included) holds the size of the intersection of the two gene sets, -3 when
			either id is negative, or -1 when one of the ids has no gene set.
			min_value/max_value cover the intersection sizes only and are None if no
			pair had two gene sets.
		"""
		sys.stderr.write("Gettiing data matrix ...")
		results_method_id_ls = results_method_id_info.results_method_id_ls
		results_method_id2index = results_method_id_info.results_method_id2index
		no_of_results = len(results_method_id_ls)
		# plain float instead of num.float: same dtype, and the num.float alias is gone
		# from NumPy >= 1.24 (assuming num is numpy)
		data_matrix = num.zeros([no_of_results, no_of_results], float)
		data_matrix[:] = -1
		min_value = None
		max_value = None
		for i in range(no_of_results):
			results_method_id1 = results_method_id_ls[i]
			# looked up once per row rather than once per pair
			row_index = results_method_id2index.get(results_method_id1)
			if row_index is None:
				# a None index would act as a numpy newaxis and overwrite a whole
				# row/column, so an id without a position is skipped
				continue
			for j in range(i, no_of_results):	# upper triangle; the mirror cell is set below
				results_method_id2 = results_method_id_ls[j]
				col_index = results_method_id2index.get(results_method_id2)
				if col_index is None:
					continue
				if results_method_id1<0 or results_method_id2<0:
					data_value = -3
				elif results_method_id1 in results_method_id2gene_set and results_method_id2 in results_method_id2gene_set:
					data_value = len(results_method_id2gene_set[results_method_id1]&results_method_id2gene_set[results_method_id2])
					if min_value is None or data_value<min_value:
						min_value = data_value
					if max_value is None or data_value>max_value:
						max_value = data_value
				else:
					# no gene set for at least one id: the cell keeps its -1
					continue
				data_matrix[row_index, col_index] = data_value
				data_matrix[col_index, row_index] = data_value
		return_data = PassingData()
		return_data.data_matrix = data_matrix
		return_data.min_value = min_value
		return_data.max_value = max_value
		sys.stderr.write("Done.\n")
		return return_data
예제 #6
0
    def run(self):
        """
        Run the QC of call data against the comparison dataset.

        Steps, in order:
          1. open the Stock_250kDB session and a separate raw MySQLdb cursor
             (kept as self.curs);
          2. resolve self.cmp_data_filename and load the QC method entry;
          3. QC_method_id == 0: only self.cal_independent_NA_rate() is run;
             otherwise the comparison data is read into a SNPData and matched
             against the call data via self.qcDataMatrixVSsnpData();
          4. optionally write the mismatch rates to self.output_fname and submit
             them through self.submit_to_call_QC();
          5. commit or roll back depending on self.commit.

        Nothing is returned; the result is left on self.row_id2NA_mismatch_rate
        (None when QC_method_id == 0). Calls sys.exit(5) for an unsupported
        run_type.

        History:
            2008-04-25 return None if QC_method_id==0
            2008-04-20 for plone to call it just to get row_id2NA_mismatch_rate
        """
        # database connection and etc
        db = Stock_250kDB.Stock_250kDB(
            drivername=self.drivername,
            username=self.user,
            password=self.passwd,
            hostname=self.hostname,
            database=self.dbname,
        )
        db.setup()
        session = db.session
        session.begin()
        # transaction = session.create_transaction()

        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, self.QCMethod_class
        )
        qm = self.QCMethod_class.query.get(self.QC_method_id)  # 2009-5-20

        import MySQLdb

        # second, raw connection next to the ORM session; its cursor is handed to
        # qcDataMatrixVSsnpData() and committed separately at the end.
        # NOTE(review): conn is never closed in this method.
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
        curs = conn.cursor()
        self.curs = curs
        if self.debug:
            import pdb

            pdb.set_trace()

        readme = formReadmeObj(sys.argv, self.ad, Stock_250kDB.README)
        session.save(readme)

        QC_method_id2snps_table = self.QC_method_id2snps_table

        if self.QC_method_id == 0:
            self.cal_independent_NA_rate(db, self.min_probability, readme)
            row_id2NA_mismatch_rate = None
        else:
            # from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.cmp_data_filename, ignore_het=qm.ignore_het
            )
            strain_acc_list = map(
                int, strain_acc_list
            )  # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
            snpData2 = SNPData(
                header=header,
                strain_acc_list=strain_acc_list,
                data_matrix=data_matrix,
                snps_table=QC_method_id2snps_table.get(self.QC_method_id),
                ignore_het=qm.ignore_het,
            )  # category_list is not used. 05/20/09 ignore_het is useless cuz data_matrix is provided.
            """
			if self.input_dir and os.path.isdir(self.input_dir):
				#04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
				#no submission to db
				call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
			"""
            # (the string literal above is disabled code kept as a no-op statement)
            if self.input_dir and os.path.isfile(self.input_dir):  # it's file
                # a single SNP-by-strain file is read further below instead
                call_info_id2fname = None
            else:
                if self.run_type == 2:  # no filtering on call_info entries that have been QCed.
                    filter_calls_QCed = 0
                elif self.run_type == 1:
                    filter_calls_QCed = 1
                    self.max_call_info_mismatch_rate = 1  # don't use this when doing accession-wise QC
                else:
                    sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
                    sys.exit(5)
                call_data = self.get_call_info_id2fname(
                    db,
                    self.QC_method_id,
                    self.call_method_id,
                    filter_calls_QCed,
                    self.max_call_info_mismatch_rate,
                    self.debug,
                    min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs,
                    input_dir=self.input_dir,
                )
                call_info_id2fname = call_data.call_info_id2fname
                # NOTE(review): call_info_ls_to_return is not read again in this method
                call_info_ls_to_return = call_data.call_info_ls_to_return
            if self.run_type == 2:
                snps_name2snps_id = self.get_snps_name2snps_id(db)
            else:
                snps_name2snps_id = None

            if call_info_id2fname:
                if self.one_by_one and self.run_type == 1:  # one_by_one only for QC by accession
                    # QC each call file on its own and merge the per-file results
                    row_id2NA_mismatch_rate = {}
                    row_id12row_id2 = {}
                    counter = 0
                    for call_info_id, value in call_info_id2fname.iteritems():
                        counter += 1
                        print "No", counter
                        tmp_dict = {}
                        tmp_dict[call_info_id] = value
                        pdata = self.read_call_matrix(
                            tmp_dict, self.min_probability
                        )  # 05/20/09 no need for qm.ignore_het because 250k is all homozygous
                        passingdata = self.qcDataMatrixVSsnpData(
                            pdata, snps_name2snps_id, snpData2, curs, session, readme
                        )
                        row_id2NA_mismatch_rate.update(passingdata.row_id2NA_mismatch_rate)
                        row_id12row_id2.update(passingdata.row_id12row_id2)
                        del pdata

                        # in debug mode only the first 10 call files are processed
                        if self.debug and counter == 10:
                            break
                else:
                    # all call files are read into one matrix and QCed in one go
                    pdata = self.read_call_matrix(
                        call_info_id2fname, self.min_probability
                    )  # 05/20/09 no need for qm.ignore_het because 250k is all homozygous
                    passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
                    row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                    row_id12row_id2 = passingdata.row_id12row_id2
                    del pdata
            else:
                # input file is SNP by strain format. double header (1st two lines)
                header, snps_name_ls, category_list, data_matrix = read_data(
                    self.input_dir, double_header=1, ignore_het=qm.ignore_het
                )
                pdata = PassingData()
                # the first two entries of each header line are skipped
                pdata.ecotype_id_ls = header[0][2:]
                pdata.call_info_id_ls = header[1][2:]
                data_matrix = numpy.array(data_matrix)
                # transposed from SNP-by-strain as read to strain-by-SNP
                pdata.data_matrix = data_matrix.transpose()
                pdata.header = ["", ""] + snps_name_ls  # fake a header for SNPData
                passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
                row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                row_id12row_id2 = passingdata.row_id12row_id2
                del pdata

        if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)

        if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            # if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            # row_id2NA_mismatch_rate might be None if it's method 0.
            # row_id12row_id2 is only bound in the non-zero QC_method_id branch; the
            # truthiness test on row_id2NA_mismatch_rate keeps method 0 out of here.
            self.submit_to_call_QC(
                session,
                row_id2NA_mismatch_rate,
                self.QC_method_id,
                self.user,
                self.min_probability,
                row_id12row_id2,
                self.call_method_id,
                readme,
            )
        if self.commit:
            # both channels are committed: the raw cursor and the ORM session
            curs.execute("commit")
            session.commit()
        else:
            session.rollback()

        self.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate  # for plone to get the data structure
예제 #7
0
	def get_data_matrixFromFile(self, db, strain_id_info, target_id_info,  QC_method_id, input_fname, max_mismatch_rate, min_no_of_non_NAs=20):
		"""
		2008-09-10
			column in input_fname is determined on the fly

		Fill a strain x target matrix with mismatch rates read from a tab-delimited file.

		The first line of input_fname is a header; the columns strainid, target_id,
		qc_method_id, mismatch_rate and no_of_non_NA_pairs are located by name.
		A row is used when its qc_method_id equals QC_method_id, its
		no_of_non_NA_pairs >= min_no_of_non_NAs and its mismatch_rate <= max_mismatch_rate.
		Rows whose strainid or target_id is absent from the index maps are skipped.
		For QC_method_id 4 the value is also mirrored into the (target_id, strainid) cell.

		db is not used. self.report turns on a progress counter (every 100000 counted
		rows); self.debug stops reading after 1000000 counted rows.

		Returns:
			PassingData with data_matrix (-1 where nothing was read), min_value
			(smallest non-negative rate) and max_value (largest rate); both are None
			when no row qualified.
		"""
		sys.stderr.write("Getting data matrix from  %s ... \n"%input_fname)
		# plain float instead of num.float: same dtype, and the num.float alias is gone
		# from NumPy >= 1.24 (assuming num is numpy)
		data_matrix = num.zeros([len(strain_id_info.strain_id_ls), len(target_id_info.strain_id_ls)], float)
		data_matrix[:] = -1
		min_value = None
		max_value = None
		i = 0
		# the with-block closes the file even if a row fails to parse
		with open(input_fname) as inf:
			reader = csv.reader(inf, delimiter='\t')
			#figure out which variable is in which column
			header = next(reader)
			col_name2index = dict((column_name, index) for index, column_name in enumerate(header))
			for row in reader:
				strainid = int(row[col_name2index['strainid']])
				target_id = int(row[col_name2index['target_id']])
				qc_method_id = int(row[col_name2index['qc_method_id']])
				mismatch_rate = float(row[col_name2index['mismatch_rate']])
				no_of_non_NA_pairs = int(row[col_name2index['no_of_non_NA_pairs']])
				if qc_method_id == QC_method_id and no_of_non_NA_pairs>=min_no_of_non_NAs and mismatch_rate<=max_mismatch_rate:
					row_index = strain_id_info.strain_id2index.get(strainid)
					col_index = target_id_info.strain_id2index.get(target_id)
					if row_index is None or col_index is None:
						# unknown strain/target; such rows are not counted in i either
						continue
					data_value = mismatch_rate
					# only non-negative rates may become the minimum ...
					if data_value>=0 and (min_value is None or data_value<min_value):
						min_value = data_value
					# ... whereas the maximum is tracked for every value
					if max_value is None or data_value>max_value:
						max_value = data_value
					data_matrix[row_index, col_index] = data_value
					if QC_method_id==4:	#149 self-cross-match
						mirror_row_index = strain_id_info.strain_id2index.get(target_id)
						mirror_col_index = target_id_info.strain_id2index.get(strainid)
						# a None index would act as a numpy newaxis and overwrite a whole
						# row/column, so the mirror cell is only set when both ids are known
						if mirror_row_index is not None and mirror_col_index is not None:
							data_matrix[mirror_row_index, mirror_col_index] = data_value
				i += 1
				if self.report and i%100000==0:
					sys.stderr.write("%s\t%s"%('\x08'*40, i))
				if self.debug and i>1000000:
					break
		return_data = PassingData()
		return_data.data_matrix = data_matrix
		return_data.min_value = min_value
		return_data.max_value = max_value
		sys.stderr.write("Done.\n")
		return return_data
    def get_data_matrix(cls, db, row_info, col_info, from_where_clause, need_other_values=False,\
        null_distribution_type_id=2):
        """
		2008-11-04
			get data_matrix_candidate_sample_size_null & data_matrix_candidate_gw_size_null also if need_other_values=True
		"""
        sys.stderr.write("Getting data matrix ...")
        data_matrix = num.zeros(
            [len(row_info.id2index),
             len(col_info.id2index)], num.float)
        data_matrix[:] = -1
        max_no_of_null_data = 200
        if need_other_values:
            data_matrix_candidate_sample_size = num.zeros(
                [len(row_info.id2index),
                 len(col_info.id2index)], num.float)
            data_matrix_candidate_sample_size[:] = -1
            data_matrix_non_candidate_sample_size = num.zeros(
                [len(row_info.id2index),
                 len(col_info.id2index)], num.float)
            data_matrix_non_candidate_sample_size[:] = -1
            data_matrix_candidate_gw_size = num.zeros(
                [len(row_info.id2index),
                 len(col_info.id2index)], num.float)
            data_matrix_candidate_gw_size[:] = -1
            data_matrix_non_candidate_gw_size = num.zeros(
                [len(row_info.id2index),
                 len(col_info.id2index)], num.float)
            data_matrix_non_candidate_gw_size[:] = -1

            data_matrix_candidate_sample_size_null = num.zeros([
                len(row_info.id2index),
                len(col_info.id2index), max_no_of_null_data
            ], num.float)
            data_matrix_candidate_sample_size_null[:] = -1
            data_matrix_candidate_gw_size_null = num.zeros([
                len(row_info.id2index),
                len(col_info.id2index), max_no_of_null_data
            ], num.float)
            data_matrix_candidate_gw_size_null[:] = -1
        else:
            data_matrix_candidate_sample_size = None
            data_matrix_non_candidate_sample_size = None
            data_matrix_candidate_gw_size = None
            data_matrix_non_candidate_gw_size = None
            data_matrix_candidate_sample_size_null = None
            data_matrix_candidate_gw_size_null = None
        rows = db.metadata.bind.execute(
            "select t.id, t.no_of_top_snps, t.min_distance, t.pvalue, t.candidate_sample_size, \
			t.non_candidate_sample_size, t.candidate_gw_size, t.non_candidate_gw_size %s"
            % from_where_clause)
        min_value = None
        max_value = None
        for row in rows:
            row_index = row_info.id2index[row.no_of_top_snps]
            col_index = col_info.id2index[row.min_distance]
            if row.pvalue > 0:
                data_value = -math.log10(row.pvalue)
                if min_value == None:
                    min_value = data_value
                elif data_value < min_value:
                    min_value = data_value

                if max_value == None:
                    max_value = data_value
                elif data_value > max_value:
                    max_value = data_value
            else:
                data_value = -2  #0 pvalue
            data_matrix[row_index, col_index] = data_value
            if need_other_values:
                data_matrix_candidate_sample_size[
                    row_index, col_index] = row.candidate_sample_size
                data_matrix_non_candidate_sample_size[
                    row_index, col_index] = row.non_candidate_sample_size
                data_matrix_candidate_gw_size[
                    row_index, col_index] = row.candidate_gw_size
                data_matrix_non_candidate_gw_size[
                    row_index, col_index] = row.non_candidate_gw_size

                null_datas = db.metadata.bind.execute("select candidate_sample_size, candidate_gw_size from %s where observed_id=%s and null_distribution_type_id=%s"%\
                       (Stock_250kDB.TopSNPTestRMNullData.table.name, row.id, null_distribution_type_id))
                i = 0
                for null_data in null_datas:
                    data_matrix_candidate_sample_size_null[
                        row_index, col_index,
                        i] = null_data.candidate_sample_size
                    data_matrix_candidate_gw_size_null[
                        row_index, col_index, i] = null_data.candidate_gw_size
                    i += 1
                    if i >= max_no_of_null_data:  #no more than this
                        break

        sys.stderr.write("Done.\n")
        return_data = PassingData()
        return_data.data_matrix = data_matrix
        return_data.data_matrix_candidate_sample_size = data_matrix_candidate_sample_size
        return_data.data_matrix_non_candidate_sample_size = data_matrix_non_candidate_sample_size
        return_data.data_matrix_candidate_gw_size = data_matrix_candidate_gw_size
        return_data.data_matrix_non_candidate_gw_size = data_matrix_non_candidate_gw_size
        return_data.min_value = min_value
        return_data.max_value = max_value
        return_data.data_matrix_candidate_sample_size_null = data_matrix_candidate_sample_size_null
        return_data.data_matrix_candidate_gw_size_null = data_matrix_candidate_gw_size_null
        return return_data
예제 #9
0
    def get_data_matrixFromFile(self,
                                db,
                                strain_id_info,
                                target_id_info,
                                QC_method_id,
                                input_fname,
                                max_mismatch_rate,
                                min_no_of_non_NAs=20):
        """
        2008-09-10
            column in input_fname is determined on the fly

        Fill a strain x target matrix with mismatch rates read from a tab-delimited file.

        The first line of input_fname is a header; the columns strainid,
        target_id, qc_method_id, mismatch_rate and no_of_non_NA_pairs are located
        by name. A row is used when its qc_method_id equals QC_method_id, its
        no_of_non_NA_pairs >= min_no_of_non_NAs and its mismatch_rate <=
        max_mismatch_rate. Rows whose strainid or target_id is absent from the
        index maps are skipped. For QC_method_id 4 the value is also mirrored
        into the (target_id, strainid) cell.

        db is not used. self.report turns on a progress counter (every 100000
        counted rows); self.debug stops reading after 1000000 counted rows.

        Returns:
            PassingData with data_matrix (-1 where nothing was read), min_value
            (smallest non-negative rate) and max_value (largest rate); both are
            None when no row qualified.
        """
        sys.stderr.write("Getting data matrix from  %s ... \n" % input_fname)
        # plain float instead of num.float: same dtype, and the num.float alias is gone
        # from NumPy >= 1.24 (assuming num is numpy)
        data_matrix = num.zeros(
            [len(strain_id_info.strain_id_ls),
             len(target_id_info.strain_id_ls)], float)
        data_matrix[:] = -1
        min_value = None
        max_value = None
        i = 0
        # the with-block closes the file even if a row fails to parse
        with open(input_fname) as inf:
            reader = csv.reader(inf, delimiter='\t')
            #figure out which variable is in which column
            header = next(reader)
            col_name2index = dict(
                (column_name, index)
                for index, column_name in enumerate(header))
            for row in reader:
                strainid = int(row[col_name2index['strainid']])
                target_id = int(row[col_name2index['target_id']])
                qc_method_id = int(row[col_name2index['qc_method_id']])
                mismatch_rate = float(row[col_name2index['mismatch_rate']])
                no_of_non_NA_pairs = int(
                    row[col_name2index['no_of_non_NA_pairs']])
                if qc_method_id == QC_method_id and no_of_non_NA_pairs >= min_no_of_non_NAs and mismatch_rate <= max_mismatch_rate:
                    row_index = strain_id_info.strain_id2index.get(strainid)
                    col_index = target_id_info.strain_id2index.get(target_id)
                    if row_index is None or col_index is None:
                        # unknown strain/target; such rows are not counted in i either
                        continue
                    data_value = mismatch_rate
                    # only non-negative rates may become the minimum ...
                    if data_value >= 0 and (min_value is None
                                            or data_value < min_value):
                        min_value = data_value
                    # ... whereas the maximum is tracked for every value
                    if max_value is None or data_value > max_value:
                        max_value = data_value
                    data_matrix[row_index, col_index] = data_value
                    if QC_method_id == 4:  #149 self-cross-match
                        mirror_row_index = strain_id_info.strain_id2index.get(
                            target_id)
                        mirror_col_index = target_id_info.strain_id2index.get(
                            strainid)
                        # a None index would act as a numpy newaxis and overwrite a
                        # whole row/column, so the mirror cell is only set when both
                        # ids are known
                        if mirror_row_index is not None and mirror_col_index is not None:
                            data_matrix[mirror_row_index,
                                        mirror_col_index] = data_value
                i += 1
                if self.report and i % 100000 == 0:
                    sys.stderr.write("%s\t%s" % ('\x08' * 40, i))
                if self.debug and i > 1000000:
                    break
        return_data = PassingData()
        return_data.data_matrix = data_matrix
        return_data.min_value = min_value
        return_data.max_value = max_value
        sys.stderr.write("Done.\n")
        return return_data
예제 #10
0
	def get_data_matrix(cls, db, row_info, col_info, from_where_clause, need_other_values=False,\
					null_distribution_type_id=2):
		"""
		2008-11-04
			get data_matrix_candidate_sample_size_null & data_matrix_candidate_gw_size_null also if need_other_values=True

		Build the -log10(pvalue) matrix of no_of_top_snps rows by min_distance columns.

		Arguments:
			db: database handle; the query goes through db.metadata.bind.execute().
			row_info: its id2index maps a no_of_top_snps value to a row index.
			col_info: its id2index maps a min_distance value to a column index.
			from_where_clause: SQL text appended verbatim after the select list; it
				has to define the t alias. It is not escaped, so it must never carry
				untrusted input.
			need_other_values: also collect the four sample/genome-wide size matrices
				and allocate the two *_null matrices.
			null_distribution_type_id: not used in this version, because the
				null-data lookup is disabled.

		Returns:
			PassingData with data_matrix, min_value, max_value and the six extra
			matrices (each None when need_other_values is false). A data_matrix cell
			holds -1 when no row was returned for it, -2 when the pvalue is not
			positive, otherwise -log10(pvalue). The two *_null matrices are allocated
			but never filled here, so every cell of them stays -1.

		Raises:
			KeyError: a row's no_of_top_snps/min_distance is missing from id2index.
		"""
		sys.stderr.write("Getting data matrix ...")
		max_no_of_null_data = 100
		grid_shape = [len(row_info.id2index), len(col_info.id2index)]

		def new_matrix(extra_dims=()):
			# float matrix of grid_shape (+ extra_dims) with every cell preset to -1;
			# plain float instead of num.float: same dtype, and the num.float alias is
			# gone from NumPy >= 1.24 (assuming num is numpy)
			matrix = num.zeros(grid_shape + list(extra_dims), float)
			matrix[:] = -1
			return matrix

		data_matrix = new_matrix()
		if need_other_values:
			data_matrix_candidate_sample_size = new_matrix()
			data_matrix_non_candidate_sample_size = new_matrix()
			data_matrix_candidate_gw_size = new_matrix()
			data_matrix_non_candidate_gw_size = new_matrix()
			# third axis: one slot per null-distribution value
			data_matrix_candidate_sample_size_null = new_matrix([max_no_of_null_data])
			data_matrix_candidate_gw_size_null = new_matrix([max_no_of_null_data])
		else:
			data_matrix_candidate_sample_size = None
			data_matrix_non_candidate_sample_size = None
			data_matrix_candidate_gw_size = None
			data_matrix_non_candidate_gw_size = None
			data_matrix_candidate_sample_size_null = None
			data_matrix_candidate_gw_size_null = None
		rows = db.metadata.bind.execute("select t.id, t.no_of_top_snps, t.min_distance, t.pvalue, t.candidate_sample_size, \
			t.non_candidate_sample_size, t.candidate_gw_size, t.non_candidate_gw_size %s"%from_where_clause)
		min_value = None
		max_value = None
		for row in rows:
			row_index = row_info.id2index[row.no_of_top_snps]
			col_index = col_info.id2index[row.min_distance]
			if row.pvalue>0:
				data_value = -math.log10(row.pvalue)
				if min_value is None or data_value<min_value:
					min_value = data_value
				if max_value is None or data_value>max_value:
					max_value = data_value
			else:
				data_value = -2	#0 pvalue, log10 is undefined
			data_matrix[row_index, col_index] = data_value
			if need_other_values:
				data_matrix_candidate_sample_size[row_index, col_index] = row.candidate_sample_size
				data_matrix_non_candidate_sample_size[row_index, col_index] = row.non_candidate_sample_size
				data_matrix_candidate_gw_size[row_index, col_index] = row.candidate_gw_size
				data_matrix_non_candidate_gw_size[row_index, col_index] = row.non_candidate_gw_size
				# The per-row lookup of null-distribution values (one query against
				# Stock_250kDB.TopSNPTestRMNullData per observed row, capped at
				# max_no_of_null_data) is disabled in this version.
		sys.stderr.write("Done.\n")
		return_data = PassingData()
		return_data.data_matrix = data_matrix
		return_data.data_matrix_candidate_sample_size = data_matrix_candidate_sample_size
		return_data.data_matrix_non_candidate_sample_size = data_matrix_non_candidate_sample_size
		return_data.data_matrix_candidate_gw_size = data_matrix_candidate_gw_size
		return_data.data_matrix_non_candidate_gw_size = data_matrix_non_candidate_gw_size
		return_data.min_value = min_value
		return_data.max_value = max_value
		return_data.data_matrix_candidate_sample_size_null = data_matrix_candidate_sample_size_null
		return_data.data_matrix_candidate_gw_size_null = data_matrix_candidate_gw_size_null
		return return_data
예제 #11
0
    def run(self):
        """
        Run the QC comparison and store the result on self.

        Steps, in order:

        1. Open the Stock_250kDB connection, begin a session transaction, and
           open a second raw MySQLdb connection whose cursor is kept as
           self.curs.
        2. Build a README object from sys.argv and add it to the session.
        3. If QC_method_id is 0, call cal_independent_NA_rate() and leave
           row_id2NA_mismatch_rate as None.
           Otherwise read the comparison data from cmp_data_filename, obtain
           the call data (through get_call_info_id2fname(), or directly from
           input_dir when input_dir is a file) and compare both through
           qcDataMatrixVSsnpData().
        4. If output_fname is set, run_type is 1 and there is a non-empty
           result, write it out with output_row_id2NA_mismatch_rate().
        5. If run_type is 1, commit is on, input_dir is not set and there is a
           non-empty result, submit it with submit_to_call_QC().
        6. Commit (raw cursor and session) when commit is on, otherwise roll
           the session back.

        Nothing is returned; the result is stored in
        self.row_id2NA_mismatch_rate (None when QC_method_id is 0).

        Calls sys.exit(5) when run_type is neither 1 nor 2 (only checked when
        input_dir is not an existing file).

        Note: this method uses Python 2 constructs (print statement,
        dict.iteritems(), indexing dict.values()).

        2008-04-25
            return None if QC_method_id==0
        2008-04-20
            for plone to call it just to get row_id2NA_mismatch_rate
        """
        #database connection and etc
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.user,
                                       password=self.passwd,
                                       hostname=self.hostname,
                                       database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        session.begin()
        #transaction = session.create_transaction()

        # Resolve the comparison-data filename and fetch the QC method record;
        # qm.ignore_het is passed to read_data()/SNPData below.
        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, self.QCMethod_class)
        qm = self.QCMethod_class.query.get(self.QC_method_id)  #2009-5-20

        # A second, raw MySQLdb connection is opened next to the db session.
        # Its cursor is stored on self and handed to qcDataMatrixVSsnpData().
        # NOTE(review): conn is never closed inside this method.
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        curs = conn.cursor()
        self.curs = curs
        # Drop into the interactive debugger when debug is on.
        if self.debug:
            import pdb
            pdb.set_trace()

        # README entry built from the command line; added to the session so it
        # is part of the same transaction as the QC results.
        readme = formReadmeObj(sys.argv, self.ad, Stock_250kDB.README)
        session.add(readme)

        QC_method_id2snps_table = self.QC_method_id2snps_table

        if self.QC_method_id == 0:
            # Method 0: no comparison dataset is read; only
            # cal_independent_NA_rate() is called and there is no mismatch-rate
            # result.
            self.cal_independent_NA_rate(db, self.min_probability, readme)
            row_id2NA_mismatch_rate = None
        else:
            #from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.cmp_data_filename, ignore_het=qm.ignore_het)
            strain_acc_list = map(
                int, strain_acc_list
            )  #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
            snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
                data_matrix=data_matrix, snps_table=QC_method_id2snps_table.get(self.QC_method_id),\
                ignore_het=qm.ignore_het) #category_list is not used. 05/20/09 ignore_het is useless cuz data_matrix is provided.
            """
			if self.input_dir and os.path.isdir(self.input_dir):
				#04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
				#no submission to db
				call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
			"""
            if self.input_dir and os.path.isfile(self.input_dir):  #it's file
                # The input file itself is the call data; handled by the
                # "else" branch of "if call_info_id2fname" below.
                call_info_id2fname = None
            else:
                if self.run_type == 2:  #no filtering on call_info entries that have been QCed.
                    filter_calls_QCed = 0
                elif self.run_type == 1:
                    filter_calls_QCed = 1
                    self.max_call_info_mismatch_rate = 1  #don't use this when doing accession-wise QC
                else:
                    sys.stderr.write("run_type=%s is not supported.\n" %
                                     self.run_type)
                    sys.exit(5)
                call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
                          filter_calls_QCed, self.max_call_info_mismatch_rate, self.debug,\
                          min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs, input_dir=self.input_dir)
                call_info_id2fname = call_data.call_info_id2fname
                # NOTE(review): call_info_ls_to_return is assigned here but not
                # used anywhere else in this method.
                call_info_ls_to_return = call_data.call_info_ls_to_return
            # The SNP name -> id mapping is only fetched for run_type 2.
            if self.run_type == 2:
                snps_name2snps_id = self.get_snps_name2snps_id(db)
            else:
                snps_name2snps_id = None

            # NOTE(review): a None or empty call_info_id2fname takes the "else"
            # branch, which reads self.input_dir as a file. That also happens
            # when get_call_info_id2fname() returned no entries -- confirm this
            # is intended.
            if call_info_id2fname:
                db_id2chr_pos = db.getSNPID2ChrPos()  #2011-22
                from DB_250k2data import DB_250k2Data
                # Item 1 of the first value of call_info_id2fname is passed to
                # getSNPID2index(); presumably a call file name -- verify
                # against get_call_info_id2fname().
                db_id2index = DB_250k2Data.getSNPID2index(
                    call_info_id2fname.values()[0][1], db_id2chr_pos)
                if self.one_by_one and self.run_type == 1:  #one_by_one only for QC by accession
                    # Process one call_info entry per iteration and merge the
                    # per-entry results into the two dictionaries below.
                    row_id2NA_mismatch_rate = {}
                    row_id12row_id2 = {}
                    counter = 0
                    for call_info_id, value in call_info_id2fname.iteritems():
                        counter += 1
                        print "No", counter
                        # single-entry dictionary so read_call_matrix() loads
                        # only this entry
                        tmp_dict = {}
                        tmp_dict[call_info_id] = value
                        pdata = self.read_call_matrix(
                            tmp_dict,
                            self.min_probability,
                            db_id2chr_pos=db_id2chr_pos,
                            db_id2index=db_id2index)
                        #05/20/09 no need for qm.ignore_het because 250k is all homo(zygous)
                        passingdata = self.qcDataMatrixVSsnpData(
                            pdata, snps_name2snps_id, snpData2, curs, session,
                            readme)
                        row_id2NA_mismatch_rate.update(
                            passingdata.row_id2NA_mismatch_rate)
                        row_id12row_id2.update(passingdata.row_id12row_id2)
                        del pdata

                        # in debug mode stop after the first 10 entries
                        if self.debug and counter == 10:
                            break
                else:
                    # Read all call_info entries at once and QC them together.
                    pdata = self.read_call_matrix(call_info_id2fname,
                                                  self.min_probability,
                                                  db_id2chr_pos=db_id2chr_pos,
                                                  db_id2index=db_id2index)
                    #05/20/09 no need for qm.ignore_het because 250k is all homo(zygous)
                    passingdata = self.qcDataMatrixVSsnpData(
                        pdata, snps_name2snps_id, snpData2, curs, session,
                        readme)
                    row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                    row_id12row_id2 = passingdata.row_id12row_id2
                    del pdata
            else:
                #input file is SNP by strain format. double header (1st two lines)
                header, snps_name_ls, category_list, data_matrix = read_data(
                    self.input_dir, double_header=1, ignore_het=qm.ignore_het)
                pdata = PassingData()
                # the first two columns of each header line are skipped
                pdata.ecotype_id_ls = header[0][2:]
                pdata.call_info_id_ls = header[1][2:]
                # transpose the SNP-by-strain matrix before handing it over
                data_matrix = numpy.array(data_matrix)
                pdata.data_matrix = data_matrix.transpose()
                pdata.header = ['', ''
                                ] + snps_name_ls  #fake a header for SNPData
                passingdata = self.qcDataMatrixVSsnpData(
                    pdata, snps_name2snps_id, snpData2, curs, session, readme)
                row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                row_id12row_id2 = passingdata.row_id12row_id2
                del pdata

        # File output only for run_type 1 and only when there is a non-empty result.
        if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate,
                                                self.output_fname)

        # row_id12row_id2 is not assigned on the QC_method_id==0 path; the
        # trailing "and row_id2NA_mismatch_rate" test keeps this call from
        # being reached in that case.
        if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #row_id2NA_mismatch_rate might be None if it's method 0.
            self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.user, self.min_probability, \
                 row_id12row_id2, self.call_method_id, readme)
        # The raw cursor is committed explicitly next to the session; without
        # commit only the session is rolled back.
        if self.commit:
            curs.execute("commit")
            session.commit()
        else:
            session.rollback()

        self.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate  #for plone to get the data structure