예제 #1
0
    def run(self):
        """
        Flag SNP columns with a low locus log probability, write the matrix
        with those columns marked for removal, and show a histogram.

        2007-04-30
        2007-05-14
            add nt_alphabet_bits

        Steps, as coded below:
          1. read_data() loads self.input_fname; nt_alphabet_bits[0] is
             handed to it as an int.
          2. cal_strain_homo_perc_vector() and cal_snp_locus_log_prob()
             yield one log probability per SNP column.
          3. every column index whose value is <= min_log_prob is collected
             and listed on stdout with the header entry at offset 2 + index.
          4. write_data_matrix() receives the collected indices as
             cols_to_be_tossed_out and nt_alphabet_bits[1] as nt_alphabet.
          5. pylab shows a 20-bin histogram of the log probabilities.

        NOTE(review): min_log_prob is not bound anywhere in this method;
        presumably a module-level name or meant to be self.min_log_prob --
        confirm.
        """
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname, int(self.nt_alphabet_bits[0]))
        data_matrix = num.array(data_matrix)
        homo_perc_vector = self.cal_strain_homo_perc_vector(data_matrix)
        log_prob_per_snp = self.cal_snp_locus_log_prob(data_matrix,
                                                       homo_perc_vector)

        from sets import Set
        low_prob_cols = Set()
        for col_no, log_prob in enumerate(log_prob_per_snp):
            if log_prob <= min_log_prob:
                low_prob_cols.add(col_no)

        # single-argument call form prints the same text as the print statement
        print("%sSNPs removed:" % len(low_prob_cols))
        for col_no in low_prob_cols:
            print('\t%s\t%s' % (col_no, header[2 + col_no]))

        output_alphabet_flag = int(self.nt_alphabet_bits[1])
        write_data_matrix(data_matrix, self.output_fname, header,
                          strain_acc_list, category_list,
                          cols_to_be_tossed_out=low_prob_cols,
                          nt_alphabet=output_alphabet_flag)

        import pylab
        pylab.title("histogram of snp locus log probability")
        pylab.hist(log_prob_per_snp, 20)
        pylab.show()
예제 #2
0
    def run(self):
        """
        Read the call files of one call method and write them as one matrix.

        2008-05-20 read_call_matrix returns PassingData object

        Nothing is written when get_call_info_id2fname() finds no call file.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        db = self.db_250k
        session = db.session
        # value is irrelevant here; QC_250k.get_call_info_id2fname() just needs the argument
        QC_method_id = 0
        call_data = QC_250k.get_call_info_id2fname(
            db, QC_method_id, self.call_method_id,
            filter_calls_QCed=0,
            max_call_info_mismatch_rate=self.max_array_mismatch_rate,
            input_dir=self.input_dir,
            take_unique_ecotype=self.take_unique_ecotype)

        # 2008-05-18 the SNP subset is fetched only when a cutoff is below 1
        snps_name_set = None
        if self.max_snp_mismatch_rate < 1 or self.max_snp_NA_rate < 1:
            snps_name_set = self.get_snps_name_set_given_criteria(
                db, self.call_method_id, self.max_snp_mismatch_rate,
                self.max_snp_NA_rate)

        db_id2chr_pos = db.getSNPID2ChrPos()
        call_info_id2fname = call_data.call_info_id2fname
        if len(call_info_id2fname) == 0:
            return

        # element [1] of the first mapping value is handed to getSNPID2index()
        first_entry = call_info_id2fname.values()[0]
        db_id2index = self.getSNPID2index(first_entry[1], db_id2chr_pos)
        pdata = QC_250k.read_call_matrix(
            call_info_id2fname, self.min_probability, snps_name_set,
            db_id2chr_pos=db_id2chr_pos, db_id2index=db_id2index)
        write_data_matrix(pdata.data_matrix, self.outputFname, pdata.header,
                          pdata.ecotype_id_ls, pdata.array_id_ls)
예제 #3
0
	def run(self):
		"""
		Dump the polymorphic columns of one alignment into a data matrix file.

		Steps, as coded below:
			1. open an AtDB handle from the connection options on self.
			2. getAlignmentMatrix(self.alignment_id), then pickPolymorphicColumns() on its result.
			3. header is 'id', 'name', then one '%s_%s_%s' label per entry of snp_pos_ls.
			4. row ids depend on self.strain_id_type:
				1: ecotype_id looked up per accession in table accession2tg_ecotypeid
				2: the accession ids themselves
				anything else: message on stderr, exit status 2
			5. write_data_matrix() to self.output_fname, with name_ls as the last argument.

		NOTE(review): only the first row of each lookup is used and a lookup without
		any row is not handled here -- confirm every accession has a mapping.
		"""
		db = AtDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
				hostname=self.hostname, database=self.dbname, schema=self.schema)
		
		passingdata = self.getAlignmentMatrix(self.alignment_id)
		self.pickPolymorphicColumns(passingdata)
		
		header = ['id', 'name'] + ['%s_%s_%s'%snp_pos for snp_pos in passingdata.snp_pos_ls]
		
		if self.strain_id_type==1:
			query_template = "select * from %s where accession_id=%s"
			strain_acc_list = []
			for accession_id in passingdata.accession_id_ls:
				result = db.metadata.bind.execute(query_template%('accession2tg_ecotypeid', accession_id))
				matched_row = result.fetchone()
				strain_acc_list.append(matched_row.ecotype_id)
		elif self.strain_id_type==2:
			strain_acc_list = passingdata.accession_id_ls
		else:
			sys.stderr.write("strain_id_type %s not supported.\n"%(self.strain_id_type))
			sys.exit(2)
		
		write_data_matrix(passingdata.data_matrix, self.output_fname, header,
						strain_acc_list, passingdata.name_ls)
예제 #4
0
	def output_data(self, data_to_output_label_ls, data_to_output_ls, min_distance, output_fname):
		"""
		Lay out several (score_cutoff_ls, data_ls) series in one matrix and write it to output_fname.

		2008-11-11
			data_to_output_ls is a list of (score_cutoff_ls, data_ls). each score_cutoff_ls might be a bit different from each other.
			1. get score_cutoff2index out of all score_cutoffs in descending order
			2. each row is same score_cutoff. column is data_ls of one result from analysis_method on phenotype.
			3. first column is score cutoffs.
			4. 2nd column is min_distance. 3rd and so forth columns are data.

		Arguments:
			data_to_output_label_ls: column labels, one per entry of data_to_output_ls
			data_to_output_ls: list of (score_cutoff_ls, data_ls) pairs
			min_distance: value repeated on every row
			output_fname: file handed to write_data_matrix()

		Cells for which a series has no value at a given score cutoff stay at -1.
		"""
		sys.stderr.write("Outputting data matrix ...")
		score_cutoff_ls, score_cutoff2index = self.get_score_cutoff2index(data_to_output_ls)
		header = ['score_cutoff', 'min_distance'] + data_to_output_label_ls
		no_of_rows = len(score_cutoff2index)
		no_of_cols = len(data_to_output_label_ls)
		# builtin float instead of the numpy.float alias, which newer NumPy releases no longer provide
		data_matrix = numpy.zeros([no_of_rows, no_of_cols], float)
		data_matrix[:] = -1
		for j in range(no_of_cols):
			sub_score_cutoff_ls, data_ls = data_to_output_ls[j]
			for i, score_cutoff in enumerate(sub_score_cutoff_ls):
				row_index = score_cutoff2index[score_cutoff]
				data_matrix[row_index, j] = data_ls[i]
		
		category_list = [min_distance]*no_of_rows
		write_data_matrix(data_matrix, output_fname, header, score_cutoff_ls, category_list)
		sys.stderr.write("Done.\n")
예제 #5
0
	def run(self):
		"""
		Build the matrix returned by getDataMatrix() for the selected results methods,
		then write it to self.output_fname and/or draw it to self.fig_fname.

		Exits with status 3 when SNPData.isDataMatrixEmpty() reports an empty matrix.
		The same label list is used for the header, for the row ids and for both axes of the figure.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
						hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup()
		session = db.session
		
		results_method_id_info = self.getResultsMethodIDInfo(db, self.call_method_id_ls, self.min_distance,
															self.get_closest, self.min_MAF)
		results_method_id2gene_set = self.getResultsMethodID2GeneSet(db, results_method_id_info,
																	self.results_directory, self.max_rank)
		rdata = self.getDataMatrix(results_method_id2gene_set, results_method_id_info)
		
		header = ['', ''] + results_method_id_info.results_method_id_label_ls
		strain_acc_list = results_method_id_info.results_method_id_label_ls
		category_list = results_method_id_info.results_method_id_ls
		if SNPData.isDataMatrixEmpty(rdata.data_matrix):
			sys.stderr.write("Nothing fetched from database.\n")
			sys.exit(3)
		
		if self.output_fname:
			write_data_matrix(rdata.data_matrix, self.output_fname, header, strain_acc_list, category_list)
		
		if not self.fig_fname:
			return
		
		font = get_font(self.font_path, font_size=self.font_size)	#2008-08-01
		
		def value2color_func(x):
			# colour scale spans the matrix's own min/max
			return Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)
		
		im_legend = drawContinousLegend(rdata.min_value, rdata.max_value, self.no_of_ticks, value2color_func, font)
		im = drawMatrix(rdata.data_matrix, value2color_func, strain_acc_list, strain_acc_list,
						with_grid=1, font=font)
		im = combineTwoImages(im, im_legend, font=font)
		im.save(self.fig_fname)
    def output_data(self, data_to_output_label_ls, data_to_output_ls,
                    min_distance, output_fname):
        """
        Lay out several (score_cutoff_ls, data_ls) series in one matrix and
        write it to output_fname.

        2008-11-11
            data_to_output_ls is a list of (score_cutoff_ls, data_ls). each score_cutoff_ls might be a bit different from each other.
            1. get score_cutoff2index out of all score_cutoffs in descending order
            2. each row is same score_cutoff. column is data_ls of one result from analysis_method on phenotype.
            3. first column is score cutoffs.
            4. 2nd column is min_distance. 3rd and so forth columns are data.

        Arguments:
            data_to_output_label_ls: column labels, one per entry of
                data_to_output_ls
            data_to_output_ls: list of (score_cutoff_ls, data_ls) pairs
            min_distance: value repeated on every row
            output_fname: file handed to write_data_matrix()

        Cells for which a series has no value at a given score cutoff stay
        at -1.
        """
        sys.stderr.write("Outputting data matrix ...")
        score_cutoff_ls, score_cutoff2index = self.get_score_cutoff2index(
            data_to_output_ls)
        header = ['score_cutoff', 'min_distance'] + data_to_output_label_ls
        no_of_rows = len(score_cutoff2index)
        no_of_cols = len(data_to_output_label_ls)
        # builtin float instead of the numpy.float alias, which newer NumPy
        # releases no longer provide
        data_matrix = numpy.zeros([no_of_rows, no_of_cols], float)
        data_matrix[:] = -1
        for j in range(no_of_cols):
            sub_score_cutoff_ls, data_ls = data_to_output_ls[j]
            for i, score_cutoff in enumerate(sub_score_cutoff_ls):
                row_index = score_cutoff2index[score_cutoff]
                data_matrix[row_index, j] = data_ls[i]

        category_list = [min_distance] * no_of_rows
        write_data_matrix(data_matrix, output_fname, header, score_cutoff_ls,
                          category_list)
        sys.stderr.write("Done.\n")
예제 #7
0
	def run(self):
		"""
		Merge the rows of duplicated ecotypes in the input matrix and write the merged matrix.

		Steps, as coded below:
			1. connect to MySQL with the options on self.
			2. optionally load the duplicate -> target ecotype id mapping
			   (only when self.ecotype_duplicate2tg_ecotypeid_table is set).
			3. read the input matrix, detecting its delimiter with figureOutDelimiter().
			4. group row indices per target ecotype id and merge them with get_merged_matrix().
			5. write the merged matrix with the same delimiter; the second header field becomes
			   'nativename' and each row carries the native name of its target ecotype id.

		The database connection is closed on the way out, whether or not a step fails.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		import MySQLdb
		from pymodule import figureOutDelimiter
		conn = MySQLdb.connect(db=self.dbname,host=self.hostname, user=self.user, passwd = self.passwd)
		try:
			curs = conn.cursor()
			
			if self.ecotype_duplicate2tg_ecotypeid_table:
				ecotype_duplicate2tg_ecotypeid = self.get_ecotype_duplicate2tg_ecotypeid(curs, self.ecotype_duplicate2tg_ecotypeid_table)
			else:
				ecotype_duplicate2tg_ecotypeid = None
			delimiter = figureOutDelimiter(self.input_fname)
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
			
			tg_ecotypeid2ecotypeid_duplicate_index_ls = self.get_tg_ecotypeid2ecotypeid_duplicate_index_ls(strain_acc_list, category_list, ecotype_duplicate2tg_ecotypeid)
			
			ecotypeid2nativename = get_ecotypeid2nativename(curs, ecotype_table=self.ecotype_table)
			tg_ecotypeid_ls, merge_matrix = self.get_merged_matrix(tg_ecotypeid2ecotypeid_duplicate_index_ls, data_matrix, \
																ecotypeid2nativename, self.stat_output_fname)
			
			tg_nativename_ls = [ecotypeid2nativename[ecotypeid] for ecotypeid in tg_ecotypeid_ls]
			header[1] = 'nativename'
			write_data_matrix(merge_matrix, self.output_fname, header, tg_ecotypeid_ls, tg_nativename_ls, delimiter=delimiter)
		finally:
			# release the server-side connection instead of leaving it to interpreter exit
			conn.close()
예제 #8
0
    def run(self):
        """
        Write the polymorphic columns of one alignment as a data matrix file.

        The alignment comes from getAlignmentMatrix(self.alignment_id) and is
        reduced by pickPolymorphicColumns(). The header holds 'id', 'name'
        and one '%s_%s_%s' label per entry of snp_pos_ls.

        self.strain_id_type selects the row ids:
            1 -- ecotype_id fetched per accession from accession2tg_ecotypeid
            2 -- the accession ids as they are
            any other value is reported on stderr and ends with exit status 2

        NOTE(review): a lookup that returns no row is not handled here --
        confirm every accession has a mapping.
        """
        db = AtDB(drivername=self.drivername, username=self.db_user,
                  password=self.db_passwd, hostname=self.hostname,
                  database=self.dbname, schema=self.schema)

        passingdata = self.getAlignmentMatrix(self.alignment_id)
        self.pickPolymorphicColumns(passingdata)

        header = ['id', 'name']
        header.extend(
            ['%s_%s_%s' % snp_pos for snp_pos in passingdata.snp_pos_ls])

        if self.strain_id_type == 1:
            strain_acc_list = []
            for accession_id in passingdata.accession_id_ls:
                lookup = db.metadata.bind.execute(
                    "select * from %s where accession_id=%s" %
                    ('accession2tg_ecotypeid', accession_id))
                first_row = lookup.fetchone()
                strain_acc_list.append(first_row.ecotype_id)
        elif self.strain_id_type == 2:
            strain_acc_list = passingdata.accession_id_ls
        else:
            sys.stderr.write("strain_id_type %s not supported.\n" %
                             (self.strain_id_type))
            sys.exit(2)

        write_data_matrix(passingdata.data_matrix, self.output_fname, header,
                          strain_acc_list, passingdata.name_ls)
예제 #9
0
    def run(self):
        """
        Filter a data matrix by rows and columns and write what is left.

        2007-02-27
        2007-09-14
            filtering_bits

        Each stage runs only when its character in self.filtering_bits is "1":
            [0] remove_rows_with_too_many_NAs()
            [1] remove_cols_with_too_many_NAs()
            [2] remove_identity_strains()
        A skipped stage contributes an empty Set. write_data_matrix() then
        receives the union of the row sets and the column set.

        NOTE(review): col_cutoff is a bare name here while the row cutoff is
        read from self.row_cutoff; presumably a module-level name or meant to
        be self.col_cutoff -- confirm.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        delimiter = figureOutDelimiter(self.input_fname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname, int(self.nt_alphabet_bits[0]),
            delimiter=delimiter)
        data_matrix = num.array(data_matrix)

        # stage 0: rows
        if self.filtering_bits[0] != "1":
            rows_with_too_many_NAs_set = Set()
        else:
            remove_rows_data = self.remove_rows_with_too_many_NAs(
                data_matrix, self.row_cutoff)
            rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
            # read but not used further in this method
            strain_index2no_of_NAs = remove_rows_data.row_index2no_of_NAs

        # stage 1: columns (given the rows already marked)
        if self.filtering_bits[1] != "1":
            cols_with_too_many_NAs_set = Set()
        else:
            remove_cols_data = self.remove_cols_with_too_many_NAs(
                data_matrix, col_cutoff, rows_with_too_many_NAs_set)
            cols_with_too_many_NAs_set = remove_cols_data.cols_with_too_many_NAs_set

        # stage 2: identity check restricted to the surviving rows and columns
        if self.filtering_bits[2] != "1":
            identity_strains_to_be_removed = Set()
        else:
            no_of_rows, no_of_cols = data_matrix.shape
            rows_to_be_checked = Set(range(no_of_rows)) - rows_with_too_many_NAs_set
            cols_to_be_checked = Set(range(no_of_cols)) - cols_with_too_many_NAs_set
            identity_strains_to_be_removed = self.remove_identity_strains(
                data_matrix, rows_to_be_checked, cols_to_be_checked)

        rows_to_be_tossed_out = rows_with_too_many_NAs_set | identity_strains_to_be_removed
        output_alphabet_flag = int(self.nt_alphabet_bits[1])
        write_data_matrix(data_matrix, self.output_fname, header,
                          strain_acc_list, category_list,
                          rows_to_be_tossed_out, cols_with_too_many_NAs_set,
                          nt_alphabet=output_alphabet_flag,
                          delimiter=delimiter)
예제 #10
0
	def run(self):
		"""
		Fetch phenotype data through a MySQL cursor and write it as a data matrix file.

		getPhenotypeData() supplies the matrix plus its row ids, row labels and column labels;
		the header is 'ecotype id', 'nativename' followed by the column labels.
		write_data_matrix() is called with transform_to_numpy=False.

		The database connection is closed on the way out, whether or not a step fails.
		"""
		if self.debug==1:
			import pdb
			pdb.set_trace()
		
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user = self.db_user, passwd = self.db_passwd)
		try:
			curs = conn.cursor()
			
			pheno_data = self.getPhenotypeData(curs, self.phenotype_avg_table, self.phenotype_method_table, \
											self.ecotype_table, get_raw_data=self.get_raw_data)
			header = ['ecotype id', 'nativename'] + pheno_data.col_label_ls
			write_data_matrix(pheno_data.data_matrix, self.output_fname, header, pheno_data.row_id_ls, pheno_data.row_label_ls, \
							transform_to_numpy=False)
		finally:
			# release the server-side connection instead of leaving it to interpreter exit
			conn.close()
예제 #11
0
    def run(self):
        """
        Fetch phenotype data via self.db_250k and write it as a data matrix.

        getPhenotypeData() is given the engine/connection found at
        self.db_250k.metadata.bind. The header is 'ecotype id', 'nativename'
        followed by the column labels it returns. write_data_matrix() is
        called with transform_to_numpy=False.
        """
        if self.debug == 1:
            import pdb
            pdb.set_trace()

        db_bind = self.db_250k.metadata.bind
        pheno_data = self.getPhenotypeData(
            db_bind,
            self.phenotype_avg_table,
            self.phenotype_method_table,
            self.ecotype_table,
            get_raw_data=self.get_raw_data,
            getPublicPhenotype=self.getPublicPhenotype)

        header = ['ecotype id', 'nativename'] + pheno_data.col_label_ls
        write_data_matrix(pheno_data.data_matrix,
                          self.outputFname,
                          header,
                          pheno_data.row_id_ls,
                          pheno_data.row_label_ls,
                          transform_to_numpy=False)
예제 #12
0
    def run(self):
        """
        Merge the rows of duplicated ecotypes in the input matrix and write
        the merged matrix.

        Steps, as coded below:
          1. connect to MySQL with the options on self.
          2. optionally load the duplicate -> target ecotype id mapping (only
             when self.ecotype_duplicate2tg_ecotypeid_table is set).
          3. read the input matrix, detecting its delimiter with
             figureOutDelimiter().
          4. group row indices per target ecotype id and merge them with
             get_merged_matrix().
          5. write the merged matrix with the same delimiter; the second
             header field becomes 'nativename' and each row carries the
             native name of its target ecotype id.

        The database connection is closed on the way out, whether or not a
        step fails.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        import MySQLdb
        from pymodule import figureOutDelimiter
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        try:
            curs = conn.cursor()

            if self.ecotype_duplicate2tg_ecotypeid_table:
                ecotype_duplicate2tg_ecotypeid = self.get_ecotype_duplicate2tg_ecotypeid(
                    curs, self.ecotype_duplicate2tg_ecotypeid_table)
            else:
                ecotype_duplicate2tg_ecotypeid = None
            delimiter = figureOutDelimiter(self.input_fname)
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname, delimiter=delimiter)

            tg_ecotypeid2ecotypeid_duplicate_index_ls = self.get_tg_ecotypeid2ecotypeid_duplicate_index_ls(
                strain_acc_list, category_list, ecotype_duplicate2tg_ecotypeid)

            ecotypeid2nativename = get_ecotypeid2nativename(
                curs, ecotype_table=self.ecotype_table)
            tg_ecotypeid_ls, merge_matrix = self.get_merged_matrix(
                tg_ecotypeid2ecotypeid_duplicate_index_ls, data_matrix,
                ecotypeid2nativename, self.stat_output_fname)

            tg_nativename_ls = [ecotypeid2nativename[ecotypeid]
                                for ecotypeid in tg_ecotypeid_ls]
            header[1] = 'nativename'
            write_data_matrix(merge_matrix,
                              self.output_fname,
                              header,
                              tg_ecotypeid_ls,
                              tg_nativename_ls,
                              delimiter=delimiter)
        finally:
            # release the server-side connection instead of leaving it to
            # interpreter exit
            conn.close()
예제 #13
0
	def run(self):
		"""
		Filter a data matrix by rows and columns and write what is left.

		2007-02-27
		2007-09-14
			filtering_bits

		Each stage runs only when its character in self.filtering_bits is '1':
			[0] remove_rows_with_too_many_NAs()
			[1] remove_cols_with_too_many_NAs()
			[2] remove_identity_strains()
		A skipped stage contributes an empty set. write_data_matrix() then receives
		the union of the row sets and the column set.

		NOTE(review): col_cutoff is a bare name here while the row cutoff is read from
		self.row_cutoff; presumably a module-level name or meant to be self.col_cutoff -- confirm.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		delimiter = figureOutDelimiter(self.input_fname, report=self.report)
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, \
			int(self.nt_alphabet_bits[0]), delimiter=delimiter)
		data_matrix = numpy.array(data_matrix)
		
		# stage 0: rows
		if self.filtering_bits[0]!='1':
			rows_with_too_many_NAs_set = set()
		else:
			remove_rows_data = self.remove_rows_with_too_many_NAs(data_matrix, self.row_cutoff)
			rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
			# read but not used further in this method
			strain_index2no_of_NAs = remove_rows_data.row_index2no_of_NAs
		
		# stage 1: columns (given the rows already marked)
		if self.filtering_bits[1]!='1':
			cols_with_too_many_NAs_set = set()
		else:
			remove_cols_data = self.remove_cols_with_too_many_NAs(data_matrix, col_cutoff, rows_with_too_many_NAs_set)
			cols_with_too_many_NAs_set = remove_cols_data.cols_with_too_many_NAs_set
		
		# stage 2: identity check restricted to the surviving rows and columns
		if self.filtering_bits[2]!='1':
			identity_strains_to_be_removed = set()
		else:
			no_of_rows, no_of_cols = data_matrix.shape
			rows_to_be_checked = set(range(no_of_rows)) - rows_with_too_many_NAs_set
			cols_to_be_checked = set(range(no_of_cols)) - cols_with_too_many_NAs_set
			identity_strains_to_be_removed = self.remove_identity_strains(data_matrix, rows_to_be_checked, cols_to_be_checked)
		
		rows_to_be_tossed_out = rows_with_too_many_NAs_set | identity_strains_to_be_removed
		output_alphabet_flag = int(self.nt_alphabet_bits[1])
		write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, \
			rows_to_be_tossed_out, cols_with_too_many_NAs_set, nt_alphabet=output_alphabet_flag, delimiter=delimiter)
예제 #14
0
	def run(self):
		"""
		Flag SNP columns with a low locus log probability, write the matrix with
		those columns marked for removal, and show a histogram.

		2007-04-30
		2007-05-14
			add nt_alphabet_bits

		Column indices whose log probability is <= min_log_prob are listed on stdout
		(with the header entry at offset 2 + index) and passed to write_data_matrix()
		as cols_to_be_tossed_out.

		NOTE(review): min_log_prob is not bound anywhere in this method; presumably a
		module-level name or meant to be self.min_log_prob -- confirm.
		"""
		input_data = read_data(self.input_fname, int(self.nt_alphabet_bits[0]))
		header, strain_acc_list, category_list, data_matrix = input_data
		data_matrix = num.array(data_matrix)
		homo_perc_vector = self.cal_strain_homo_perc_vector(data_matrix)
		log_prob_ls = self.cal_snp_locus_log_prob(data_matrix, homo_perc_vector)
		
		from sets import Set
		bad_col_index_set = Set([col_index for col_index, log_prob in enumerate(log_prob_ls) if log_prob<=min_log_prob])
		
		# single-argument call form prints the same text as the print statement
		print("%sSNPs removed:"%(len(bad_col_index_set)))
		for col_index in bad_col_index_set:
			print('\t%s\t%s'%(col_index, header[2+col_index]))
		
		write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, \
			cols_to_be_tossed_out=bad_col_index_set, nt_alphabet=int(self.nt_alphabet_bits[1]))
		
		import pylab
		pylab.title("histogram of snp locus log probability")
		pylab.hist(log_prob_ls, 20)
		pylab.show()
예제 #15
0
	def run(self):
		"""
		Read the call files of one call method and write them as one data matrix.

		2008-05-20 read_call_matrix returns PassingData object

		The SNP name set is fetched only when one of the two SNP cutoffs is below 1;
		otherwise None is passed to read_call_matrix().
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		db_options = dict(drivername=self.drivername, username=self.user, password=self.passwd,
						hostname=self.hostname, database=self.dbname)
		db = Stock_250kDB.Stock_250kDB(**db_options)
		db.setup(create_tables=False)
		session = db.session
		
		# value is irrelevant here; QC_250k.get_call_info_id2fname() just needs the argument
		QC_method_id = 0
		call_data = QC_250k.get_call_info_id2fname(db, QC_method_id, self.call_method_id,
												filter_calls_QCed=0,
												max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
												input_dir=self.input_dir,
												take_unique_ecotype=self.take_unique_ecotype)
		
		# 2008-05-18 only do this when it's necessary
		snps_name_set = None
		if self.max_snp_mismatch_rate<1 or self.max_snp_NA_rate<1:
			snps_name_set = self.get_snps_name_set_given_criteria(db, self.call_method_id,
																self.max_snp_mismatch_rate, self.max_snp_NA_rate)
		
		pdata = QC_250k.read_call_matrix(call_data.call_info_id2fname, self.min_probability, snps_name_set)
		write_data_matrix(pdata.data_matrix, self.output_fname, pdata.header,
						pdata.ecotype_id_ls, pdata.array_id_ls)
    def run(self):
        """
        Collect candidate-gene test results into a matrix; optionally write
        it to a file and draw it as an image.

        Flow, as visible in this method:
          1. connect to Stock_250kDB and call db.setup().
          2. pick the result table name from self.test_result_type (1, 2 or
             3); any other value is reported on stderr and exits with
             status 2.
          3. build a SQL "<tables> where <conditions>" fragment from the
             option values and hand it to getListTypeInfo(),
             getAnalysisMethodInfo(), getPhenotypeInfo() and
             get_data_matrix().
          4. pass the matrix through markDataMatrixBoundary().
          5. exit with status 3 when SNPData.isDataMatrixEmpty() reports an
             empty matrix.
          6. write the matrix to self.output_fname and/or save an image to
             self.fig_fname, whichever is set.

        NOTE(review): every value in the SQL fragment is spliced in with "%"
        formatting; presumably these are trusted command-line options --
        confirm before feeding this from any other input.
        """
        if self.debug:
            import pdb
            pdb.set_trace()
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.db_user,
                                       password=self.db_passwd,
                                       hostname=self.hostname,
                                       database=self.dbname,
                                       schema=self.schema)
        db.setup()
        session = db.session

        # Select the table that holds the test results.
        # NOTE(review): for type 1 the name taken from the class is overwritten
        # on the very next line by a hard-coded, dated table name -- confirm
        # that this pin is still intended.
        if self.test_result_type == 1:
            test_result_class_table = CandidateGeneRankSumTestResult.table.name
            test_result_class_table = 'candidate_gene_rank_sum_test_result_2008_09_15'
        elif self.test_result_type == 2:
            test_result_class_table = CandidateGeneTopSNPTest.table.name
        elif self.test_result_type == 3:
            test_result_class_table = Stock_250kDB.CandidateGeneRankSumTestResultMethod.table.name
        else:
            sys.stderr.write(" test_result_type %s not supported.\n" %
                             (self.test_result_type))
            sys.exit(2)

        #the condition for min_MAF is tricky because of the floating precision.
        # (hence the abs(...)<0.00001 comparison instead of "=")
        # Types 1 and 3 use the same template; type 2 additionally joins the
        # ResultsByGene table (alias rg) between results method and test result.
        if self.test_result_type == 1:
            where_condition = "%s r, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null \
				and c.results_id=r.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001"         \
             %(ResultsMethod.table.name, test_result_class_table, GeneListType.table.name, self.get_closest, self.min_distance, self.min_MAF)
        elif self.test_result_type == 2:
            where_condition = "%s r, %s rg, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null and r.id=rg.results_method_id \
				and c.results_id=rg.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001"         \
             %(ResultsMethod.table.name, ResultsByGene.table.name, test_result_class_table, GeneListType.table.name, self.get_closest, self.min_distance, self.min_MAF)
        elif self.test_result_type == 3:
            where_condition = "%s r, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null \
				and c.results_id=r.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001"         \
             %(ResultsMethod.table.name, test_result_class_table, GeneListType.table.name, self.get_closest, self.min_distance, self.min_MAF)
        # Optional filters: each one is appended only when its option is truthy.
        # The *_id_ls options are placed verbatim inside "in (...)".
        if self.call_method_id_ls:
            where_condition += " and r.call_method_id in (%s)" % self.call_method_id_ls

        if self.analysis_method_id_ls:
            where_condition += " and r.analysis_method_id in (%s)" % self.analysis_method_id_ls
        if self.super_type_id:
            where_condition += " and g.super_type_id=%s" % self.super_type_id

        if self.test_type:
            where_condition += " and c.test_type=%s" % self.test_type

        # Type-specific filter; type 3 adds none.
        # The `pass` below is a no-op; the condition after it is still appended.
        if self.test_result_type == 1:
            pass
            where_condition += " and c.max_pvalue_per_gene=%s" % (
                self.max_pvalue_per_gene)
        elif self.test_result_type == 2:
            where_condition += " and c.no_of_top_snps=%s" % (
                self.no_of_top_snps)

        # All helpers below receive the same where_condition fragment.
        list_type_id_ls = self.getListTypeInfo(db, where_condition)
        analysis_method_id_ls = self.getAnalysisMethodInfo(db, where_condition)
        list_type_analysis_method_info = self.orderListTypeAnalysisMethodID(
            list_type_id_ls, analysis_method_id_ls)
        phenotype_info = self.getPhenotypeInfo(db, where_condition)
        rdata = self.get_data_matrix(db, phenotype_info,
                                     list_type_analysis_method_info,
                                     where_condition)

        rdata.data_matrix = self.markDataMatrixBoundary(
            rdata.data_matrix, phenotype_info, list_type_analysis_method_info)

        # Rows are labelled by list type / analysis method, columns by phenotype method.
        header = ['list_type_analysis_method', ''
                  ] + phenotype_info.phenotype_method_label_ls
        strain_acc_list = list_type_analysis_method_info.list_type_analysis_method_label_ls
        category_list = list_type_analysis_method_info.list_type_id_analysis_method_id_ls
        if SNPData.isDataMatrixEmpty(rdata.data_matrix):
            sys.stderr.write("Nothing fetched from database.\n")
            sys.exit(3)
        if self.output_fname:
            write_data_matrix(rdata.data_matrix, self.output_fname, header,
                              strain_acc_list, category_list)

        # Optional figure: matrix image combined with a continuous colour legend.
        if self.fig_fname:
            font = get_font(self.font_path,
                            font_size=self.font_size)  #2008-08-01
            # colour scale spans the matrix's own min/max
            value2color_func = lambda x: Value2Color.value2HSLcolor(
                x, rdata.min_value, rdata.max_value)
            im_legend = drawContinousLegend(rdata.min_value, rdata.max_value,
                                            self.no_of_ticks, value2color_func,
                                            font)
            #im.save('%s_legend.png'%self.fig_fname_prefix)
            im = drawMatrix(rdata.data_matrix, value2color_func, list_type_analysis_method_info.list_type_analysis_method_label_ls,\
               phenotype_info.phenotype_method_label_ls, with_grid=1, font=font)
            im = combineTwoImages(im, im_legend, font=font)
            im.save(self.fig_fname)
예제 #17
0
	def run(self):	
		"""
		Collect candidate-gene test results into a matrix; optionally write it to a file
		and draw it as an image.

		Flow, as visible in this method:
			1. connect to Stock_250kDB and call db.setup().
			2. pick the result table name from self.test_result_type (1, 2 or 3); any other
			   value is reported on stderr and exits with status 2.
			3. build a SQL "<tables> where <conditions>" fragment from the option values and hand
			   it to getListTypeInfo(), getAnalysisMethodInfo(), getPhenotypeInfo() and get_data_matrix().
			4. pass the matrix through markDataMatrixBoundary().
			5. exit with status 3 when SNPData.isDataMatrixEmpty() reports an empty matrix.
			6. write the matrix to self.output_fname and/or save an image to self.fig_fname,
			   whichever is set.

		NOTE(review): every value in the SQL fragment is spliced in with "%" formatting;
		presumably these are trusted command-line options -- confirm before feeding this
		from any other input.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
						password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup()
		session = db.session
		
		# Select the table that holds the test results.
		# NOTE(review): for type 1 the name taken from the class is overwritten on the very
		# next line by a hard-coded, dated table name -- confirm that this pin is still intended.
		if self.test_result_type==1:
			test_result_class_table = CandidateGeneRankSumTestResult.table.name
			test_result_class_table = 'candidate_gene_rank_sum_test_result_2008_09_15'
		elif self.test_result_type==2:
			test_result_class_table = CandidateGeneTopSNPTest.table.name
		elif self.test_result_type==3:
			test_result_class_table = Stock_250kDB.CandidateGeneRankSumTestResultMethod.table.name
		else:
			sys.stderr.write(" test_result_type %s not supported.\n"%(self.test_result_type))
			sys.exit(2)

		#the condition for min_MAF is tricky because of the floating precision.
		# (hence the abs(...)<0.00001 comparison instead of "=")
		# Types 1 and 3 use the same template; type 2 additionally joins the ResultsByGene
		# table (alias rg) between results method and test result.
		if self.test_result_type==1:
			where_condition = "%s r, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null \
				and c.results_id=r.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001"\
				%(ResultsMethod.table.name, test_result_class_table, GeneListType.table.name, self.get_closest, self.min_distance, self.min_MAF)
		elif self.test_result_type==2:
			where_condition = "%s r, %s rg, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null and r.id=rg.results_method_id \
				and c.results_id=rg.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001"\
				%(ResultsMethod.table.name, ResultsByGene.table.name, test_result_class_table, GeneListType.table.name, self.get_closest, self.min_distance, self.min_MAF)
		elif self.test_result_type==3:
			where_condition = "%s r, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null \
				and c.results_id=r.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001"\
				%(ResultsMethod.table.name, test_result_class_table, GeneListType.table.name, self.get_closest, self.min_distance, self.min_MAF)
		# Optional filters: each one is appended only when its option is truthy.
		# The *_id_ls options are placed verbatim inside "in (...)".
		if self.call_method_id_ls:
			where_condition += " and r.call_method_id in (%s)"%self.call_method_id_ls
		
		if self.analysis_method_id_ls:
			where_condition += " and r.analysis_method_id in (%s)"%self.analysis_method_id_ls
		if self.super_type_id:
			where_condition += " and g.super_type_id=%s"%self.super_type_id
		
		if self.test_type:
			where_condition += " and c.test_type=%s"%self.test_type
		
		# Type-specific filter; type 3 adds none.
		# The `pass` below is a no-op; the condition after it is still appended.
		if self.test_result_type==1:
			pass
			where_condition += " and c.max_pvalue_per_gene=%s"%(self.max_pvalue_per_gene)
		elif self.test_result_type==2:
			where_condition += " and c.no_of_top_snps=%s"%(self.no_of_top_snps)		
		
		# All helpers below receive the same where_condition fragment.
		list_type_id_ls = self.getListTypeInfo(db, where_condition)
		analysis_method_id_ls = self.getAnalysisMethodInfo(db, where_condition)
		list_type_analysis_method_info = self.orderListTypeAnalysisMethodID(list_type_id_ls, analysis_method_id_ls)
		phenotype_info = self.getPhenotypeInfo(db, where_condition)
		rdata = self.get_data_matrix(db, phenotype_info, list_type_analysis_method_info, where_condition)
		
		rdata.data_matrix = self.markDataMatrixBoundary(rdata.data_matrix, phenotype_info, list_type_analysis_method_info)
		
		# Rows are labelled by list type / analysis method, columns by phenotype method.
		header = ['list_type_analysis_method', ''] + phenotype_info.phenotype_method_label_ls
		strain_acc_list = list_type_analysis_method_info.list_type_analysis_method_label_ls
		category_list = list_type_analysis_method_info.list_type_id_analysis_method_id_ls
		if SNPData.isDataMatrixEmpty(rdata.data_matrix):
			sys.stderr.write("Nothing fetched from database.\n")
			sys.exit(3)
		if self.output_fname:
			write_data_matrix(rdata.data_matrix, self.output_fname, header, strain_acc_list, category_list)
		
		# Optional figure: matrix image combined with a continuous colour legend.
		if self.fig_fname:
			font = get_font(self.font_path, font_size=self.font_size)	#2008-08-01
			# colour scale spans the matrix's own min/max
			value2color_func = lambda x: Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)
			im_legend = drawContinousLegend(rdata.min_value, rdata.max_value, self.no_of_ticks, value2color_func, font)
			#im.save('%s_legend.png'%self.fig_fname_prefix)
			im = drawMatrix(rdata.data_matrix, value2color_func, list_type_analysis_method_info.list_type_analysis_method_label_ls,\
						phenotype_info.phenotype_method_label_ls, with_grid=1, font=font)
			im = combineTwoImages(im, im_legend, font=font)
			im.save(self.fig_fname)
예제 #18
0
    def run(self):
        """
        Build the strain-by-target QC matrix, then write it to a file and/or draw it.

        Row (strain) and column (target) labels are fetched with SQL assembled
        here. The matrix values are loaded through get_data_matrixFromFile()
        when self.input_fname is set, otherwise through get_data_matrix().
        Exits with status 3 if the resulting matrix is empty.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        db = StockDB.StockDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                             hostname=self.hostname, database=self.dbname, schema=self.schema)
        db.setup(create_tables=False)
        session = db.session

        # how to order strains.
        order_by_sentence = " order by c.longitude, c.latitude, e.longitude, e.latitude, e.nativename "

        uses_qc_method_4 = self.QC_method_id == 4
        if not uses_qc_method_4:
            # every other QC method is filtered through the QCCrossMatch table (alias q)
            table_names = (StockDB.QCCrossMatch.table.name, StockDB.Ecotype.table.name, StockDB.Site.table.name,
                           StockDB.Address.table.name, StockDB.Country.table.name)
            sql_table_str = "from %s q, %s e, %s s, %s a, %s c" % table_names
            qc_filter = " and q.qc_method_id=%s and q.no_of_non_NA_pairs>=%s and q.mismatch_rate<=%s " % (
                self.QC_method_id, self.min_no_of_non_NAs, self.max_mismatch_rate)
            # the remaining %s is a slot for the strain- or target-specific join condition
            common_where_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s" + \
                qc_filter + order_by_sentence

            strain_where_condition = common_where_condition % " and e.id=st.ecotypeid and st.id=q.strainid"
            strain_id_info_query = "select distinct q.strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s" % (
                sql_table_str, StockDB.Strain.table.name, strain_where_condition)
        else:
            table_names = (StockDB.Ecotype.table.name, StockDB.Site.table.name,
                           StockDB.Address.table.name, StockDB.Country.table.name)
            sql_table_str = "from %s e, %s s, %s a, %s c" % table_names
            common_where_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s " + order_by_sentence

            strain_where_condition = common_where_condition % " and e.id=st.ecotypeid"
            strain_id_info_query = "select distinct st.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s" % (
                sql_table_str, StockDB.Strain.table.name, strain_where_condition)

        # Row (strain) information: plate-based grouping (2 or 3) ignores any id restriction from the input file.
        if self.how_to_group_strains in (2, 3):
            plate_info = self.alignStrainsAccordingToSeqPlate(db)
            id_set_data = PassingData()
            id_set_data.strain_id_set = None
            id_set_data.target_id_set = None
            strain_id_info = self.getStrainInfoGivenPlateInfo(db, plate_info, strain_id_info_query,
                                                              strain_id_set=None)
        else:
            if self.input_fname:
                id_set_data = self.getStrainidTargetidFromFile(db, self.QC_method_id, self.input_fname,
                                                               self.max_mismatch_rate, self.min_no_of_non_NAs)
            else:
                id_set_data = PassingData()
                id_set_data.strain_id_set = None
                id_set_data.target_id_set = None
            strain_id_info = self.getStrainIDInfo(db, strain_id_info_query, id_set_data.strain_id_set)

        # Column (target) information.
        if not uses_qc_method_4:
            target_where_condition = common_where_condition % " and e.id=q.target_id"
            target_id_info_query = "select distinct e.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s %s" % (
                sql_table_str, target_where_condition)
            target_id_info = self.getStrainIDInfo(db, target_id_info_query)
        elif self.how_to_group_strains == 3:
            # 2008-09-15 column strain id is in country, strain-longitude order
            target_id_info = self.getStrainIDInfo(db, strain_id_info_query, id_set_data.strain_id_set)
        else:
            target_id_info = strain_id_info

        if self.input_fname:
            rdata = self.get_data_matrixFromFile(db, strain_id_info, target_id_info, self.QC_method_id,
                                                 self.input_fname, self.max_mismatch_rate,
                                                 self.min_no_of_non_NAs)
        else:
            rdata = self.get_data_matrix(db, strain_id_info, target_id_info, self.QC_method_id,
                                         self.max_mismatch_rate, self.min_no_of_non_NAs)
        rdata.data_matrix = self.markDataMatrixBoundary(rdata.data_matrix, strain_id_info, target_id_info)

        header = ['strain info', ''] + target_id_info.strain_label_ls
        strain_acc_list = strain_id_info.strain_label_ls
        category_list = [1] * len(strain_acc_list)

        if SNPData.isDataMatrixEmpty(rdata.data_matrix):
            sys.stderr.write("Nothing fetched from database.\n")
            sys.exit(3)

        if self.output_fname:
            write_data_matrix(rdata.data_matrix, self.output_fname, header, strain_acc_list, category_list)

        if not self.fig_fname:
            return

        font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01

        def value2color_func(x):
            # min/max are looked up on rdata at call time
            return Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)

        im_legend = drawContinousLegend(rdata.min_value, rdata.max_value, self.no_of_ticks,
                                        value2color_func, font)
        im = drawMatrix(rdata.data_matrix, value2color_func, strain_id_info.strain_label_ls,
                        target_id_info.strain_label_ls, with_grid=1, font=font)
        im = combineTwoImages(im, im_legend, font=font)
        im.save(self.fig_fname)
    def run(self):
        """
        2008-11-08
            generate combinations of results_id, list_type_id and generate plots one after another
            save the plots into database if commit=1

        For every (results_id, list_type_id) pair returned by
        MpiGeneListRankTest.generate_params(), this fetches the data matrix,
        optionally writes it to self.output_fname, plots it via self.plotCurve()
        and, when self.commit is set, stores the plot as a
        CandidateVsNonRatioPlot row unless a matching row already exists.

        Exits with status 3 when self.results_type yields no result classes or
        when no TopSNPTestType matches the input requirements.
        """
        if self.debug:
            import pdb
            pdb.set_trace()
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
                                       password=self.db_passwd, hostname=self.hostname,
                                       database=self.dbname, schema=self.schema)
        db.setup()
        session = db.session

        param_obj = PassingData(call_method_id=self.call_method_id,
                                analysis_method_id=getattr(self, 'analysis_method_id', None),
                                analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),
                                phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),
                                list_type_id_ls=self.list_type_id_ls,
                                results_type=self.results_type)
        params_ls = MpiGeneListRankTest.generate_params(param_obj)

        ResultsClass, TestResultClass = db.getResultsAndTestResultsClass(results_type=self.results_type)
        if ResultsClass is None or TestResultClass is None:
            # report the value that was actually passed to getResultsAndTestResultsClass()
            sys.stderr.write("Invalid results type : %s.\n" % self.results_type)
            sys.exit(3)

        def plot_already_in_db(title, type_id, results_id, list_type_id):
            """Log to stderr and return True if a plot row for this combination exists."""
            rows = Stock_250kDB.CandidateVsNonRatioPlot.query.filter_by(type_id=type_id).\
                filter_by(results_id=results_id).filter_by(list_type_id=list_type_id)
            no_of_rows = rows.count()
            if no_of_rows > 0:
                row = rows.first()
                sys.stderr.write('%s already in db (%s of them) with first id=%s.\n' %
                                 (title, no_of_rows, row.id))
                return True
            return False

        for results_id, list_type_id in params_ls:
            rm = ResultsClass.get(results_id)
            list_type = Stock_250kDB.GeneListType.get(list_type_id)
            title = 'result(%s) of %s on %s with %s(%s) list' % \
                (results_id, rm.analysis_method.short_name, rm.phenotype_method.short_name,
                 list_type.short_name, list_type.id)

            TopSNPTestType_id_ls = self.getTopSNPTestType_id_ls(self.get_closest, self.min_MAF,
                                                                self.allow_two_sample_overlapping,
                                                                self.results_type, self.test_type_id,
                                                                self.null_distribution_type_id)
            # Must be checked before the first element is used below,
            # otherwise an empty list surfaces as an IndexError.
            if not TopSNPTestType_id_ls:
                sys.stderr.write("No TopSNPTestType matches the input requirements. Exit.\n")
                sys.exit(3)
            plot_type_id = TopSNPTestType_id_ls[0]

            # skip the expensive work early if the plot is already stored
            if self.commit and plot_already_in_db(title, plot_type_id, results_id, list_type_id):
                continue

            from_where_clause = "from %s t, %s y where t.type_id=y.id and t.results_id=%s and t.list_type_id=%s and y.id in (%s)" % \
                (TestResultClass.table.name, Stock_250kDB.CandidateGeneTopSNPTestRMType.table.name,
                 results_id, list_type_id, ','.join(map(str, TopSNPTestType_id_ls)))

            no_of_top_snps_info = self.get_no_of_top_snps_info(db, from_where_clause)
            min_distance_info = self.get_min_distance_info(db, from_where_clause)
            rdata = self.get_data_matrix(db, no_of_top_snps_info, min_distance_info, from_where_clause,
                                         need_other_values=True,
                                         null_distribution_type_id=self.null_distribution_type_id)

            header = ['no_of_top_snps', ''] + min_distance_info.label_ls
            strain_acc_list = no_of_top_snps_info.label_ls
            category_list = no_of_top_snps_info.label_ls

            if SNPData.isDataMatrixEmpty(rdata.data_matrix):
                # an empty matrix only skips this combination; it is not fatal
                sys.stderr.write("Nothing fetched from database.\n")
                continue

            if self.output_fname:
                # NOTE(review): the same self.output_fname is passed on every iteration -- confirm
                # whether write_data_matrix() overwrites or appends.
                write_data_matrix(rdata.data_matrix, self.output_fname, header,
                                  strain_acc_list, category_list)

            if self.commit:
                # plot goes to the database, not to a file
                output_fname_prefix = None
            else:
                title_cp = title.replace('/', '_')  # '/' cannot be part of a file name
                output_fname_prefix = '%s_%s_type_%s.png' % (os.path.splitext(self.fig_fname)[0],
                                                              title_cp, plot_type_id)

            # No x-axis limits are preset for any analysis method; plotCurve() always gets None.
            preset_xlim = None
            return_data = self.plotCurve(rdata, no_of_top_snps_info, min_distance_info,
                                         output_fname_prefix, title=title, commit=self.commit,
                                         preset_xlim=preset_xlim)

            if self.commit and return_data.png_data:
                # re-check right before saving
                if plot_already_in_db(title, plot_type_id, results_id, list_type_id):
                    continue
                plot = Stock_250kDB.CandidateVsNonRatioPlot(type_id=plot_type_id,
                                                            results_id=results_id,
                                                            list_type_id=list_type_id)
                plot.png_thumbnail = return_data.png_thumbnail.getvalue()
                plot.png_data = return_data.png_data.getvalue()
                plot.svg_data = return_data.svg_data.getvalue()
                db.session.save(plot)
                db.session.flush()
예제 #20
0
    def run(self):
        """
        Fetch the call matrix from the database and write it to self.output_fname.

        Rows are strains and columns are SNPs, unless output_matrix_type=1, in
        which case the matrix is transposed before writing.

        2008-05-08
            transpose everything if output_matrix_type=1 (bjarni's SNP matrix format)
        2007-02-19
            --db_connect
            --get_snp_id2index()
            --get_strain_id2index()
            --get_strain_id_info()
            --get_snp_id_info()
            --get_data_matrix()
            if self.toss_out_rows:
                --toss_rows_to_make_distance_matrix_NA_free()
                    --find_smallest_vertex_set_to_remove_all_edges()
            --write_data_matrix()
            #--sort_file()
        2007-09-22
            for mysql_connection
                add get_nativename_snpid2call_m()
                add fill_in_resolved_duplicated_calls()

        Raises ValueError if self.db_connection_type is neither 1 (MySQLdb
        connection) nor 2 (db_connect()).
        """
        if self.debug:
            import pdb
            pdb.set_trace()
        if self.db_connection_type == 1:
            import MySQLdb
            conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
            curs = conn.cursor()
            snp_id2index, snp_id_list, snp_id2info = self.get_snp_id2index_m(
                curs, self.input_table, self.snp_locus_table)
            strain_id2index, strain_id_list, nativename2strain_id, strain_id2acc, strain_id2category = \
                self.get_strain_id2index_m(curs, self.input_table, self.strain_info_table,
                                           self.only_include_strains_with_GPS, self.resolve_duplicated_calls,
                                           toss_contaminants=self.toss_contaminants)

            # only the 'dbsnp.calls' table gets an extra snps id mapping; other tables pass None
            if self.input_table == 'dbsnp.calls':
                from variation.src.FigureOut384IlluminaABMapping import get_snps_id2mapping
                snps_id2mapping = get_snps_id2mapping(self.hostname, dbname='dbsnp',
                                                      user=self.user, passwd=self.passwd)
            else:
                snps_id2mapping = None
            data_matrix = self.get_data_matrix_m(curs, strain_id2index, snp_id2index, nt2number,
                                                 self.input_table, self.need_heterozygous_call,
                                                 snps_id2mapping)
            if self.include_other_strain_info:
                strain_id2other_info = self.get_strain_id2other_info(
                    curs, strain_id_list, self.strain_info_table, self.input_table)
            else:
                strain_id2other_info = {}
        elif self.db_connection_type == 2:
            (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
            snp_id2index, snp_id_list = self.get_snp_id2index(curs, self.input_table, self.snp_locus_table)
            strain_id2index, strain_id_list = self.get_strain_id2index(curs, self.input_table)

            strain_id2acc, strain_id2category = self.get_strain_id_info(
                curs, strain_id_list, self.strain_info_table)
            snp_id2info = self.get_snp_id_info(curs, snp_id_list, self.snp_locus_table)
            data_matrix = self.get_data_matrix(curs, strain_id2index, snp_id2index, nt2number,
                                               self.input_table, self.need_heterozygous_call)
            strain_id2other_info = {}
        else:
            # Fail here with a clear message instead of a NameError on data_matrix further down.
            raise ValueError("Unsupported db_connection_type %r (expected 1 or 2)." %
                             (self.db_connection_type,))

        if self.toss_out_rows:
            rows_to_be_tossed_out = Set(self.toss_rows_to_make_distance_matrix_NA_free(data_matrix))
        else:
            rows_to_be_tossed_out = Set()

        # 05/08/08
        if self.discard_all_NA_strain:
            from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
            remove_rows_data = FilterStrainSNPMatrix.remove_rows_with_too_many_NAs(data_matrix, row_cutoff=1)
            rows_to_be_tossed_out.update(remove_rows_data.rows_with_too_many_NAs_set)

        strain_acc_list = [strain_id2acc[strain_id] for strain_id in strain_id_list]
        category_list = [strain_id2category[strain_id] for strain_id in strain_id_list]

        # re-key the optional per-strain info by strain accession
        strain_acc2other_info = {}
        for strain_id in strain_id2other_info:
            strain_acc2other_info[strain_id2acc[strain_id]] = strain_id2other_info[strain_id]

        if self.output_matrix_type == 1:
            # transpose everything: SNPs become rows, strains become columns
            data_matrix = num.transpose(num.array(data_matrix))

            header = ['Chromosomes', 'Positions'] + strain_acc_list
            chromosome_ls = []
            position_ls = []
            for snp_id in snp_id_list:
                snp_name, chromosome, position = snp_id2info[snp_id]
                chromosome_ls.append(chromosome)
                position_ls.append(position)

            # the two leading label columns now carry chromosome and position
            strain_acc_list = chromosome_ls
            category_list = position_ls
            # the tossed-out indices refer to strains, which are columns after transposing
            cols_to_be_tossed_out = rows_to_be_tossed_out
            rows_to_be_tossed_out = None
            # NOTE(review): strain_acc2other_info is still keyed by strain accession here although
            # the rows are now SNPs -- confirm how write_data_matrix() treats it in this layout.
        else:
            header = ['strain', 'category']
            for snp_id in snp_id_list:
                snp_name, chromosome, position = snp_id2info[snp_id]
                header.append(snp_name)
            cols_to_be_tossed_out = None

        write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list,
                          rows_to_be_tossed_out=rows_to_be_tossed_out,
                          cols_to_be_tossed_out=cols_to_be_tossed_out, nt_alphabet=self.nt_alphabet,
                          strain_acc2other_info=strain_acc2other_info, delimiter=self.delimiter)
예제 #21
0
	def run(self):
		"""
		2008-11-08
			generate combinations of results_id, list_type_id and generate plots one after another
			save the plots into database if commit=1

		For every (results_id, list_type_id) pair returned by
		MpiGeneListRankTest.generate_params(), this fetches the data matrix,
		optionally writes it to self.output_fname, plots it via self.plotCurve()
		and, when self.commit is set, stores the plot as a
		CandidateVsNonRatioPlot row unless a matching row already exists.

		Exits with status 3 when no TopSNPTestType matches the input requirements.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
			password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup()
		session = db.session

		param_obj = PassingData(call_method_id=self.call_method_id,
			analysis_method_id=getattr(self, 'analysis_method_id', None),
			analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),
			phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),
			list_type_id_ls=self.list_type_id_ls,
			results_type=self.results_type)
		params_ls = MpiGeneListRankTest.generate_params(param_obj)

		for results_id, list_type_id in params_ls:
			rm = Stock_250kDB.ResultsMethod.get(results_id)
			list_type = Stock_250kDB.GeneListType.get(list_type_id)
			title = 'result(%s) of %s on %s with %s(%s) list'%\
				(results_id, rm.analysis_method.short_name, rm.phenotype_method.short_name, list_type.short_name, list_type.id)

			TopSNPTestType_id_ls = self.getTopSNPTestType_id_ls(self.get_closest, self.min_MAF,
				self.allow_two_sample_overlapping, self.results_type,
				self.test_type_id, self.null_distribution_type_id)
			# Must be checked before the first element is used below,
			# otherwise an empty list surfaces as an IndexError.
			if not TopSNPTestType_id_ls:
				sys.stderr.write("No TopSNPTestType matches the input requirements. Exit.\n")
				sys.exit(3)
			plot_type_id = TopSNPTestType_id_ls[0]

			if self.commit:
				# skip the expensive work early if the plot is already stored
				rows = Stock_250kDB.CandidateVsNonRatioPlot.query.filter_by(type_id=plot_type_id).\
					filter_by(results_id=results_id).filter_by(list_type_id=list_type_id)
				if rows.count()>0:
					row = rows.first()
					sys.stderr.write('%s already in db (%s of them) with first id=%s.\n'%(title, rows.count(), row.id))
					continue

			TopSNPTestType_id_ls_str = map(str, TopSNPTestType_id_ls)
			from_where_clause = "from %s t, %s y where t.type_id=y.id and t.results_id=%s and t.list_type_id=%s and y.id in (%s)"%\
				(Stock_250kDB.CandidateGeneTopSNPTestRM.table.name, Stock_250kDB.CandidateGeneTopSNPTestRMType.table.name,\
				results_id, list_type_id, ','.join(TopSNPTestType_id_ls_str))

			no_of_top_snps_info = self.get_no_of_top_snps_info(db, from_where_clause)
			min_distance_info = self.get_min_distance_info(db, from_where_clause)
			rdata = self.get_data_matrix(db, no_of_top_snps_info, min_distance_info, from_where_clause,
				need_other_values=True, null_distribution_type_id=self.null_distribution_type_id)

			header = ['no_of_top_snps', ''] + min_distance_info.label_ls
			strain_acc_list = no_of_top_snps_info.label_ls
			category_list = no_of_top_snps_info.label_ls

			if SNPData.isDataMatrixEmpty(rdata.data_matrix):
				# an empty matrix only skips this combination; it is not fatal
				sys.stderr.write("Nothing fetched from database.\n")
				continue

			if self.output_fname:
				write_data_matrix(rdata.data_matrix, self.output_fname, header, strain_acc_list, category_list)

			if self.commit:
				# plot goes to the database, not to a file
				output_fname_prefix = None
			else:
				title_cp = title.replace('/', '_')	# '/' cannot be part of a file name
				output_fname_prefix='%s_%s_type_%s.png'%(os.path.splitext(self.fig_fname)[0], title_cp, plot_type_id)

			# analysis methods 1 and 7 get a fixed x-axis range; every other method passes None
			if rm.analysis_method_id ==1 or rm.analysis_method_id==7:
				preset_xlim = [0,8]
			else:
				preset_xlim = None
			return_data = self.plotCurve(rdata, no_of_top_snps_info, min_distance_info, output_fname_prefix,
				title=title, commit=self.commit, preset_xlim=preset_xlim)

			if self.commit and return_data.png_data:
				# re-check right before saving
				rows = Stock_250kDB.CandidateVsNonRatioPlot.query.filter_by(type_id=plot_type_id).\
					filter_by(results_id=results_id).filter_by(list_type_id=list_type_id)
				if rows.count()>0:
					row = rows.first()
					sys.stderr.write('%s already in db (%s of them) with first id=%s.\n'%(title, rows.count(), row.id))
					continue
				plot = Stock_250kDB.CandidateVsNonRatioPlot(type_id=plot_type_id, results_id=results_id, list_type_id=list_type_id)
				plot.png_thumbnail = return_data.png_thumbnail.getvalue()
				plot.png_data = return_data.png_data.getvalue()
				plot.svg_data = return_data.svg_data.getvalue()
				db.session.save(plot)
				db.session.flush()
예제 #22
0
	def run(self):
		"""
		Build the strain-by-target QC matrix, then write it to a file and/or draw it.

		Row (strain) and column (target) labels are fetched with SQL assembled
		here. The matrix values are loaded through get_data_matrixFromFile()
		when self.input_fname is set, otherwise through get_data_matrix().
		Exits with status 3 if the resulting matrix is empty.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		db = StockDB.StockDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
			hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		session = db.session

		# how to order strains.
		order_by_sentence = " order by c.longitude, c.latitude, e.longitude, e.latitude, e.nativename "

		uses_qc_method_4 = self.QC_method_id == 4
		if not uses_qc_method_4:
			# every other QC method is filtered through the QCCrossMatch table (alias q)
			table_names = (StockDB.QCCrossMatch.table.name, StockDB.Ecotype.table.name, StockDB.Site.table.name,
				StockDB.Address.table.name, StockDB.Country.table.name)
			sql_table_str = "from %s q, %s e, %s s, %s a, %s c" % table_names
			qc_filter = " and q.qc_method_id=%s and q.no_of_non_NA_pairs>=%s and q.mismatch_rate<=%s " % (
				self.QC_method_id, self.min_no_of_non_NAs, self.max_mismatch_rate)
			# the remaining %s is a slot for the strain- or target-specific join condition
			common_where_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s" + \
				qc_filter + order_by_sentence

			strain_where_condition = common_where_condition % " and e.id=st.ecotypeid and st.id=q.strainid"
			strain_id_info_query = "select distinct q.strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s" % (
				sql_table_str, StockDB.Strain.table.name, strain_where_condition)
		else:
			table_names = (StockDB.Ecotype.table.name, StockDB.Site.table.name,
				StockDB.Address.table.name, StockDB.Country.table.name)
			sql_table_str = "from %s e, %s s, %s a, %s c" % table_names
			common_where_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s " + order_by_sentence

			strain_where_condition = common_where_condition % " and e.id=st.ecotypeid"
			strain_id_info_query = "select distinct st.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s" % (
				sql_table_str, StockDB.Strain.table.name, strain_where_condition)

		# Row (strain) information: plate-based grouping (2 or 3) ignores any id restriction from the input file.
		if self.how_to_group_strains in (2, 3):
			plate_info = self.alignStrainsAccordingToSeqPlate(db)
			id_set_data = PassingData()
			id_set_data.strain_id_set = None
			id_set_data.target_id_set = None
			strain_id_info = self.getStrainInfoGivenPlateInfo(db, plate_info, strain_id_info_query, strain_id_set=None)
		else:
			if self.input_fname:
				id_set_data = self.getStrainidTargetidFromFile(db, self.QC_method_id, self.input_fname,
					self.max_mismatch_rate, self.min_no_of_non_NAs)
			else:
				id_set_data = PassingData()
				id_set_data.strain_id_set = None
				id_set_data.target_id_set = None
			strain_id_info = self.getStrainIDInfo(db, strain_id_info_query, id_set_data.strain_id_set)

		# Column (target) information.
		if not uses_qc_method_4:
			target_where_condition = common_where_condition % " and e.id=q.target_id"
			target_id_info_query = "select distinct e.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s %s" % (
				sql_table_str, target_where_condition)
			target_id_info = self.getStrainIDInfo(db, target_id_info_query)
		elif self.how_to_group_strains == 3:
			# 2008-09-15 column strain id is in country, strain-longitude order
			target_id_info = self.getStrainIDInfo(db, strain_id_info_query, id_set_data.strain_id_set)
		else:
			target_id_info = strain_id_info

		if self.input_fname:
			rdata = self.get_data_matrixFromFile(db, strain_id_info, target_id_info, self.QC_method_id,
				self.input_fname, self.max_mismatch_rate, self.min_no_of_non_NAs)
		else:
			rdata = self.get_data_matrix(db, strain_id_info, target_id_info, self.QC_method_id,
				self.max_mismatch_rate, self.min_no_of_non_NAs)
		rdata.data_matrix = self.markDataMatrixBoundary(rdata.data_matrix, strain_id_info, target_id_info)

		header = ['strain info', ''] + target_id_info.strain_label_ls
		strain_acc_list = strain_id_info.strain_label_ls
		category_list = [1] * len(strain_acc_list)

		if SNPData.isDataMatrixEmpty(rdata.data_matrix):
			sys.stderr.write("Nothing fetched from database.\n")
			sys.exit(3)

		if self.output_fname:
			write_data_matrix(rdata.data_matrix, self.output_fname, header, strain_acc_list, category_list)

		if not self.fig_fname:
			return

		font = get_font(self.font_path, font_size=self.font_size)	#2008-08-01

		def value2color_func(x):
			# min/max are looked up on rdata at call time
			return Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)

		im_legend = drawContinousLegend(rdata.min_value, rdata.max_value, self.no_of_ticks, value2color_func, font)
		im = drawMatrix(rdata.data_matrix, value2color_func, strain_id_info.strain_label_ls,
			target_id_info.strain_label_ls, with_grid=1, font=font)
		im = combineTwoImages(im, im_legend, font=font)
		im.save(self.fig_fname)
예제 #23
0
	def run(self):
		"""
		Fetch the call matrix from the database and write it to self.output_fname.

		Rows are strains and columns are SNPs, unless output_matrix_type=1, in
		which case the matrix is transposed before writing.

		2008-05-08
			transpose everything if output_matrix_type=1 (bjarni's SNP matrix format)
		2007-02-19
			--db_connect
			--get_snp_id2index()
			--get_strain_id2index()
			--get_strain_id_info()
			--get_snp_id_info()
			--get_data_matrix()
			if self.toss_out_rows:
				--toss_rows_to_make_distance_matrix_NA_free()
					--find_smallest_vertex_set_to_remove_all_edges()
			--write_data_matrix()
			#--sort_file()
		2007-09-22
			for mysql_connection
				add get_nativename_snpid2call_m()
				add fill_in_resolved_duplicated_calls()

		Raises ValueError if self.db_connection_type is neither 1 (MySQLdb
		connection) nor 2 (db_connect()).
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		if self.db_connection_type==1:
			import MySQLdb
			conn = MySQLdb.connect(db=self.dbname,host=self.hostname, user=self.user, passwd = self.passwd)
			curs = conn.cursor()
			snp_id2index, snp_id_list, snp_id2info = self.get_snp_id2index_m(curs, self.input_table, self.snp_locus_table)
			strain_id2index, strain_id_list, nativename2strain_id, strain_id2acc, strain_id2category = \
				self.get_strain_id2index_m(curs, self.input_table, self.strain_info_table,
					self.only_include_strains_with_GPS, self.resolve_duplicated_calls,
					toss_contaminants=self.toss_contaminants)

			# only the 'dbsnp.calls' table gets an extra snps id mapping; other tables pass None
			if self.input_table == 'dbsnp.calls':
				from variation.src.FigureOut384IlluminaABMapping import get_snps_id2mapping
				snps_id2mapping = get_snps_id2mapping(self.hostname, dbname='dbsnp', user=self.user, passwd=self.passwd)
			else:
				snps_id2mapping = None
			data_matrix = self.get_data_matrix_m(curs, strain_id2index, snp_id2index, nt2number,
				self.input_table, self.need_heterozygous_call, snps_id2mapping)
			if self.include_other_strain_info:
				strain_id2other_info = self.get_strain_id2other_info(curs, strain_id_list, self.strain_info_table, self.input_table)
			else:
				strain_id2other_info = {}
		elif self.db_connection_type==2:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			snp_id2index, snp_id_list = self.get_snp_id2index(curs, self.input_table, self.snp_locus_table)
			strain_id2index, strain_id_list = self.get_strain_id2index(curs, self.input_table)

			strain_id2acc, strain_id2category = self.get_strain_id_info(curs, strain_id_list, self.strain_info_table)
			snp_id2info = self.get_snp_id_info(curs, snp_id_list, self.snp_locus_table)
			data_matrix = self.get_data_matrix(curs, strain_id2index, snp_id2index, nt2number,
				self.input_table, self.need_heterozygous_call)
			strain_id2other_info = {}
		else:
			# Fail here with a clear message instead of a NameError on data_matrix further down.
			raise ValueError("Unsupported db_connection_type %r (expected 1 or 2)." % (self.db_connection_type,))

		if self.toss_out_rows:
			rows_to_be_tossed_out = Set(self.toss_rows_to_make_distance_matrix_NA_free(data_matrix))
		else:
			rows_to_be_tossed_out = Set()

		#05/08/08
		if self.discard_all_NA_strain:
			from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
			remove_rows_data = FilterStrainSNPMatrix.remove_rows_with_too_many_NAs(data_matrix, row_cutoff=1)
			rows_to_be_tossed_out.update(remove_rows_data.rows_with_too_many_NAs_set)

		strain_acc_list = [strain_id2acc[strain_id] for strain_id in strain_id_list]
		category_list = [strain_id2category[strain_id] for strain_id in strain_id_list]

		# re-key the optional per-strain info by strain accession
		strain_acc2other_info = {}
		for strain_id in strain_id2other_info:
			strain_acc2other_info[strain_id2acc[strain_id]] = strain_id2other_info[strain_id]

		if self.output_matrix_type==1:
			# transpose everything: SNPs become rows, strains become columns
			data_matrix = num.transpose(num.array(data_matrix))

			header = ['Chromosomes', 'Positions'] + strain_acc_list
			chromosome_ls = []
			position_ls = []
			for snp_id in snp_id_list:
				snp_name, chromosome, position = snp_id2info[snp_id]
				chromosome_ls.append(chromosome)
				position_ls.append(position)

			# the two leading label columns now carry chromosome and position
			strain_acc_list = chromosome_ls
			category_list = position_ls
			# the tossed-out indices refer to strains, which are columns after transposing
			cols_to_be_tossed_out = rows_to_be_tossed_out
			rows_to_be_tossed_out = None
			# NOTE(review): strain_acc2other_info is still keyed by strain accession here although
			# the rows are now SNPs -- confirm how write_data_matrix() treats it in this layout.
		else:
			header = ['strain', 'category']
			for snp_id in snp_id_list:
				snp_name, chromosome, position = snp_id2info[snp_id]
				header.append(snp_name)
			cols_to_be_tossed_out = None

		write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list,
			rows_to_be_tossed_out=rows_to_be_tossed_out,
			cols_to_be_tossed_out=cols_to_be_tossed_out, nt_alphabet=self.nt_alphabet,
			strain_acc2other_info=strain_acc2other_info, delimiter=self.delimiter)