Example #1
	def registerExecutables(self, workflow=None):
		"""
		2012.2.15
		"""
		AbstractWorkflow.registerExecutables(self)
		
		namespace = self.namespace
		version = self.version
		operatingSystem = self.operatingSystem
		architecture = self.architecture
		clusters_size = self.clusters_size
		site_handler = self.site_handler
		variationSrcPath = self.variationSrcPath
		vervetSrcPath = self.vervetSrcPath
		
		#2012.8.7 each cell is a tuple of (executable, clusterSizeMultiplier); set the multiplier to 0 if you do not need clustering
		executableClusterSizeMultiplierList = []
		
		Stock_250kDB = Executable(namespace=namespace, name="Stock_250kDB", version=version, \
						os=operatingSystem, arch=architecture, installed=True)
		Stock_250kDB.addPFN(PFN("file://" + os.path.join(self.variationSrcPath, "db/Stock_250kDB.py"), site_handler))
		executableClusterSizeMultiplierList.append((Stock_250kDB, 0))
		
		self.addExecutableAndAssignProperClusterSize(executableClusterSizeMultiplierList, defaultClustersSize=self.clusters_size)
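
The example above registers the Stock_250kDB script as a Pegasus executable via the DAX3 API (Executable, PFN); addExecutableAndAssignProperClusterSize is a project-specific helper that adds each (executable, multiplier) tuple to the workflow and sets its clustering profile. For orientation, below is a minimal sketch of how a registered executable is later referenced from a job, assuming the stock Pegasus DAX3 Python API; the namespace, version, path, and arguments are placeholders, not values taken from this codebase.

# Hypothetical sketch (not from this codebase): register an executable, then
# reference it from a job by namespace/name/version, using Pegasus DAX3.
import os
from Pegasus.DAX3 import ADAG, Executable, Job, PFN

dax = ADAG("exampleWorkflow")

stock250kDB = Executable(namespace="workflow", name="Stock_250kDB", version="1.0",
                         os="linux", arch="x86_64", installed=True)
stock250kDB.addPFN(PFN("file://" + os.path.abspath("db/Stock_250kDB.py"), "local"))
dax.addExecutable(stock250kDB)

job = Job(namespace="workflow", name="Stock_250kDB", version="1.0")
job.addArguments("--hostname", "localhost")   # placeholder arguments
dax.addJob(job)

with open("exampleWorkflow.xml", "w") as outf:
    dax.writeXML(outf)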
		
		
Example #2
	def getTopSNPTestType(self, get_closest, min_MAF, allow_two_sample_overlapping, results_type,\
				test_type_id, null_distribution_type_id):
		"""
		2008-10-30
			null_distribution_type_id in CandidateGeneTopSNPTestRMType doesn't matter anymore.
			set it to 1
		2008-10-26
			min_distance is removed from CandidateGeneTopSNPTestRMType.
		2008-10-16
			check which TopSNPTest type this is; create one if it doesn't exist in db
		"""
		if self.debug:
			sys.stderr.write("Getting  CandidateGeneTopSNPTestRMType ...")
		rows = Stock_250kDB.CandidateGeneTopSNPTestRMType.query.\
				filter_by(get_closest =get_closest).\
				filter(Stock_250kDB.CandidateGeneTopSNPTestRMType.min_MAF>=min_MAF-0.0001).filter(Stock_250kDB.CandidateGeneTopSNPTestRMType.min_MAF<=min_MAF+0.0001).\
				filter_by(allow_two_sample_overlapping = allow_two_sample_overlapping).filter_by(results_type=results_type).\
				filter_by(test_type_id=test_type_id).\
				filter_by(null_distribution_type_id=null_distribution_type_id)
		if rows.count()>0:
			_type = rows.first()
		else:
			_type = Stock_250kDB.CandidateGeneTopSNPTestRMType(get_closest =get_closest,\
										min_MAF = min_MAF,\
										allow_two_sample_overlapping = allow_two_sample_overlapping, \
										results_type=results_type,\
										test_type_id=test_type_id,\
										null_distribution_type_id=null_distribution_type_id)
		if self.debug:
			sys.stderr.write("Done.\n")
		return _type
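
getTopSNPTestType is a get-or-create lookup: it queries CandidateGeneTopSNPTestRMType for a row matching the given parameters and instantiates a new one only if nothing is found. Note the ±0.0001 window around min_MAF, which sidesteps exact equality comparisons on a floating-point column. A stripped-down, hypothetical version of the same pattern in plain SQLAlchemy (TestType and its columns are invented stand-ins, not the real schema):

# Hypothetical, self-contained sketch of get-or-create with a float tolerance.
from sqlalchemy import Column, Float, Integer, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class TestType(Base):
    __tablename__ = "candidate_gene_top_snp_test_rm_type"   # invented stand-in
    id = Column(Integer, primary_key=True)
    get_closest = Column(Integer)
    min_MAF = Column(Float)

def get_or_create_test_type(session, get_closest, min_MAF, tol=1e-4):
    # match min_MAF within +/- tol instead of exact float equality
    row = (session.query(TestType)
           .filter_by(get_closest=get_closest)
           .filter(TestType.min_MAF >= min_MAF - tol, TestType.min_MAF <= min_MAF + tol)
           .first())
    if row is None:
        row = TestType(get_closest=get_closest, min_MAF=min_MAF)
        session.add(row)        # caller decides when to flush/commit
    return row

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    t = get_or_create_test_type(session, get_closest=1, min_MAF=0.1)
    session.commit()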
Example #3
    def submit_to_call_QC(cls, session, row_id2NA_mismatch_rate, QC_method_id,
                          user, min_probability, row_id12row_id2,
                          call_method_id, readme):
        """
		2008-05-21
			ecotype_id, call_info_id = row_id	#bug here, order changed.
		2008-05-19
			NA_mismatch_ls was expanded
		2008-05-06
			add readme
		2008-05-05
			add ecotype_id, min_probability, tg_ecotype_id
		"""
        sys.stderr.write("Submitting row_id2NA_mismatch_rate to database ...")
        row_id_ls = row_id2NA_mismatch_rate.keys()
        row_id_ls.sort()  #try to keep them in call_info_id order
        for row_id in row_id_ls:
            NA_mismatch_ls = row_id2NA_mismatch_rate[row_id]
            ecotype_id, call_info_id = row_id  #bug here, order changed.
            tg_ecotype_id = row_id12row_id2[row_id]
            na_rate, mismatch_rate, no_of_nas, no_of_totals, no_of_mismatches, no_of_non_na_pairs, relative_NA_rate, relative_no_of_NAs, relative_no_of_totals = NA_mismatch_ls
            #call_QC stores the relative NA rate. call_info already stores the independent NA rate
            na_rate, no_of_nas, no_of_totals = relative_NA_rate, relative_no_of_NAs, relative_no_of_totals
            callqc = Stock_250kDB.CallQC(call_info_id=call_info_id, min_probability=min_probability, ecotype_id=ecotype_id, tg_ecotype_id=tg_ecotype_id,\
               qc_method_id=QC_method_id, call_method_id=call_method_id, na_rate=na_rate, mismatch_rate=mismatch_rate,\
               no_of_nas=no_of_nas, no_of_totals=no_of_totals, no_of_mismatches=no_of_mismatches, no_of_non_na_pairs=no_of_non_na_pairs,\
               created_by=user)
            callqc.readme = readme
            session.add(callqc)
            """
			data_insert_ls = [row_id[0]] + NA_mismatch_ls + [QC_method_id, user]	#row_id is (call_info_id, ecotypeid)
			curs.execute("insert into " + call_QC_table + " (call_info_id, na_rate, mismatch_rate, no_of_nas, no_of_totals, no_of_mismatches, no_of_non_NA_pairs, QC_method_id, created_by)\
				values(%s, %s, %s, %s, %s, %s, %s, %s, %s)", data_insert_ls)
			"""
        sys.stderr.write("Done.\n")
    def connectDB(self):
        """
		2012.11.18
		"""
        db_250k = Stock_250kDB.Stock_250kDB(drivername=self.drivername, db_user=self.db_user, db_passwd=self.db_passwd, \
               hostname=self.hostname, dbname=self.dbname, schema=self.schema, port=self.port)
        db_250k.setup(create_tables=False)
        self.db_250k = db_250k
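
connectDB builds the Stock_250kDB wrapper from the usual connection pieces (drivername, user, password, host, database, schema, port) and calls setup(create_tables=False) so no tables are created. A rough modern-SQLAlchemy equivalent of the URL/engine part, purely for illustration (this codebase's own API is the Stock_250kDB class, not shown here):

# Hypothetical sketch of the connection-URL construction in current SQLAlchemy.
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

url = URL.create(
    drivername="mysql+pymysql",   # placeholder driver name
    username="db_user",
    password="db_passwd",
    host="localhost",
    database="stock_250k",
    port=3306,
)
engine = create_engine(url)   # the wrapper's setup() would map/reflect tables on top of this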
Example #5
	def connectDB(self):
		"""
		2012.6.5
			overwrite the parent class
		"""
		self.db_250k = Stock_250kDB.Stock_250kDB(drivername=self.drivername, db_user=self.db_user,
				db_passwd=self.db_passwd, hostname=self.hostname, dbname=self.dbname, schema=self.schema,\
				port=self.port)
		self.db_250k.setup(create_tables=False)
    def getThresholdType(self, r1_pvalue_cutoff, r2_pvalue_cutoff):
        """
		2009-5-2
			
		"""
        threshold_type = Stock_250kDB.CmpEnrichmentOfTwoAnalysisMethodsType.query.filter_by(r1_threshold=r1_pvalue_cutoff).\
         filter_by(r2_threshold=r2_pvalue_cutoff).first()
        if not threshold_type:
            threshold_type = Stock_250kDB.CmpEnrichmentOfTwoAnalysisMethodsType(
                r1_threshold=r1_pvalue_cutoff, r2_threshold=r2_pvalue_cutoff)
        return threshold_type
Example #7
    def run(self):
        """
		2012.3.2
		"""

        if self.debug:
            import pdb
            pdb.set_trace()

        db_250k = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd, \
               hostname=self.hostname, database=self.dbname)
        db_250k.setup(create_tables=False)

        # Create an abstract DAG
        workflowName = os.path.splitext(os.path.basename(self.outputFname))[0]
        workflow = self.initiateWorkflow(workflowName)

        self.registerExecutables(workflow)
        self.registerCustomExecutables(workflow)

        #find all hdf5 correlation files
        inputFnameLs = self.getFilesWithProperSuffixFromFolder(
            self.inputFolder, suffix='.h5')
        inputData = self.registerAllInputFiles(workflow, inputFnameLs=inputFnameLs, input_site_handler=self.input_site_handler, \
              pegasusFolderName=self.pegasusFolderName)
        #organize final output plots by biology_category, biology_category_id2outputfolder
        sameCategoryPhenotypeMethodLs = db_250k.getPhenotypeMethodLsGivenBiologyCategoryID(
            self.biology_category_id, access=self.access)
        sameCategoryPhenotypeMethodIDLs = [
            pm.id for pm in sameCategoryPhenotypeMethodLs
        ]
        phenotype_method_id_ls = self.phenotype_method_id_ls + sameCategoryPhenotypeMethodIDLs
        result_list = db_250k.getResultLs(call_method_id=self.call_method_id, analysis_method_id_ls=self.analysis_method_id_ls, \
                phenotype_method_id_ls=phenotype_method_id_ls, cnv_method_id=self.cnv_method_id)
        result_id_ls = [result.id for result in result_list]
        sys.stderr.write("%s results.\n" % (len(result_id_ls)))
        result_peak_ls = db_250k.getResultPeakList(result_id_ls=result_id_ls, \
                  result_peak_type_id=self.result_peak_type_id)

        self.addJobs(workflow, result_peak_ls=result_peak_ls, inputData=inputData, datasetName=self.datasetName, chunkSize=self.chunkSize, \
           pegasusFolderName=self.pegasusFolderName)

        # Write the DAX to stdout
        outf = open(self.outputFname, 'w')
        workflow.writeXML(outf)
Example #8
    def run(self):
        """
		2008-05-17
			-check_method_id_exists()
			if input_dir is dir:
				-submit_call_dir2db()
					-get_new_call_id()
						-get_cur_max_call_id()
			elif input_dir is file:
				-submit_call_file2db()
		"""
        if self.debug:
            import pdb
            pdb.set_trace()

        #database connection, etc.
        db = self.db_250k
        session = db.session
        session.begin()

        chr_pos2db_id = db.getSNPChrPos2ID()

        #import MySQLdb
        #conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user = self.user, passwd = self.passwd)
        #curs = conn.cursor()
        curs = None
        if not self.check_method_id_exists(db, self.method_id):
            sys.stderr.write("Warning: method_id=%s not in %s. A new entry to be created.\n"%\
                (self.method_id, self.call_method_table))
            cm = Stock_250kDB.CallMethod(
                short_name=self.call_method_short_name, id=self.method_id)
            session.add(cm)
            session.flush()

            self.method_id = cm.id

        if self.commit:
            self.submit_call2db(curs, self.input_dir, self.call_info_table, self.output_dir, self.method_id, self.db_user, \
                chr_pos2db_id=chr_pos2db_id, db=db)
            #curs.execute("commit")
            session.flush()
            session.commit()
    def getHistType(cls, call_method_id, min_distance, get_closest, min_MAF, allow_two_sample_overlapping, results_type,\
       null_distribution_type_id):
        """
		2008-11-08
			become a classmethod
		2008-10-16
		"""
        sys.stderr.write("Getting ScoreRankHistogramType ...")
        rows = Stock_250kDB.ScoreRankHistogramType.query.filter_by(call_method_id=call_method_id).\
          filter_by(min_distance=min_distance).filter_by(get_closest =get_closest).\
          filter(Stock_250kDB.ScoreRankHistogramType.min_MAF>=min_MAF-0.0001).filter(Stock_250kDB.ScoreRankHistogramType.min_MAF<=min_MAF+0.0001).\
          filter_by(allow_two_sample_overlapping = allow_two_sample_overlapping).filter_by(results_type=results_type).\
          filter_by(null_distribution_type_id=null_distribution_type_id)
        if rows.count() > 0:
            hist_type = rows.first()
        else:
            hist_type = Stock_250kDB.ScoreRankHistogramType(call_method_id=call_method_id, min_distance=min_distance,\
                   get_closest =get_closest,
                   min_MAF = min_MAF, results_type=results_type,
                   allow_two_sample_overlapping = allow_two_sample_overlapping, \
                   null_distribution_type_id=null_distribution_type_id)
        sys.stderr.write("Done.\n")
        return hist_type
Example #10
	def runEnrichmentTestToGetNullData(self, session, pd):
		"""
		2008-11-04	stop checking TopSNPTestRMNullData for an existing run_no; the chance of a collision is small, and the check incurs a huge load on the db server.
		2008-11-05
			return result and null_data to be sent over to output node to save them in batch in MpiTopSNPTest.py (output node connects to the master db.)
		2008-10-30
			run enrichment test, also to get NULL data based on either null distribution
		"""
		if self.debug:
			sys.stderr.write("Running Enrichment test on results_id=%s, list_type_id=%s, no_of_top_snps=%s, no_of_top_snps_ls=%s, \
							type_id=%s, min_score=%s, ... "%\
							(getattr(pd, 'results_id',-1), getattr(pd, 'list_type_id', -1), getattr(pd, 'no_of_top_snps', -1),\
							repr(getattr(pd, 'no_of_top_snps_ls', -1)), getattr(pd, 'type_id', -1), getattr(pd, 'min_score', -1)))
		
		ResultsClass = Stock_250kDB.ResultsMethod
		TestResultClass = Stock_250kDB.CandidateGeneTopSNPTestRM
		rm = ResultsClass.get(pd.results_id)
		min_distance = pd.min_distance
		min_MAF = pd.min_MAF
		get_closest = pd.get_closest
		
		no_of_top_snps_ls = getattr(pd, 'no_of_top_snps_ls', [])
		min_score_ls = getattr(pd, 'min_score_ls', [])
		if no_of_top_snps_ls:
			cutoff_ls = no_of_top_snps_ls
			cutoff_type = 1
		else:
			cutoff_ls = min_score_ls
			cutoff_type = 2
		commit = getattr(pd, 'commit', 0)	#2008-10-30 save objects right away
		
		if not rm:
			sys.stderr.write("No results available for results_id=%s.\n"%pd.results_id)
			return None
		
		
		
		candidate_gene_set = self.dealWithCandidateGeneList(pd.list_type_id, return_set=True)	#internal cache
		no_of_candidate_genes = len(candidate_gene_set)
		no_of_total_snps = None	#same for all tests from the same rm
		null_data_ls = []
		result_ls = []
		if pd.null_distribution_type_id==2 or pd.null_distribution_type_id==3:
			pd.need_permData = 1	#need permData in getTestResult() even when the result is found directly in the database
			for i in range(pd.no_of_permutations):
				if pd.null_distribution_type_id==2:
					if no_of_total_snps is None:	#need to get it from the file
						if cutoff_type==1:
							pd.no_of_top_snps = cutoff_ls[0]
						elif cutoff_type==2:
							pd.min_score = cutoff_ls[0]
						return_data = self.getTestResult(session, rm, TestResultClass, pd)
						if return_data.permData:	#permData could be None
							no_of_total_snps = return_data.permData.no_of_total_snps
						else:
							if self.debug:
								sys.stderr.write("Warning: No permData from getTestResult(). aborted.\n")
							break
					shift = random.randint(1, no_of_total_snps)
					run_no = shift	#use this to link all NULL data under different no_of_top_snps together
				else:
					if no_of_candidate_genes>len(pd.total_gene_id_ls):
						if self.debug:
							sys.stderr.write("no_of_candidate_genes %s is bigger than no_of_total_genes, %s.\n"%\
											(no_of_candidate_genes, len(pd.total_gene_id_ls)))
						break
					random_candidate_gene_ls = random.sample(pd.total_gene_id_ls, no_of_candidate_genes)
					run_no = sum(random_candidate_gene_ls)%1000000	#sum of all gene ids modulo 1 million; very little chance that any two random gene lists share this number.
					random_candidate_gene_set = Set(random_candidate_gene_ls)
					random_candidate_gene_snp_gw_index_ls = None	#set it to None before every permutation
				for cutoff in cutoff_ls:
					if cutoff_type==1:
						pd.no_of_top_snps = cutoff
					elif cutoff_type==2:
						pd.min_score = cutoff
					return_data = self.getTestResult(session, rm, TestResultClass, pd)
					result = return_data.result
					permData = return_data.permData
					if result:
						if result.id is None:	#need to return this to save later
							result_ls.append(result)
						"""
						#2008-11-04	doesn't care repeating run_no. the chance is small, however, this incurs a huge load on db server.
						rows = Stock_250kDB.TopSNPTestRMNullData.query.\
									filter_by(observed_id=result.id).\
									filter_by(run_no=run_no).\
									filter_by(null_distribution_type_id=pd.null_distribution_type_id)
						if rows.count()>0:
							if self.debug:
								sys.stderr.write("null data for observed_id=%s, run_no=%s, null_distribution_type_id=%s already in db.\n"%\
												(result.id, run_no, pd.null_distribution_type_id))
							continue
						"""
						if len(result.null_data_ls)>pd.no_of_permutations:	#skip if it's too many already
							continue
						
						if pd.null_distribution_type_id==2:
							#indices of the top SNPs above a certain cutoff 
							top_snp_index_ls = numpy.hstack((permData.candidate_gene_snp_index_ls, permData.non_candidate_gene_snp_index_ls))
							#get corresponding (chr,pos)s of the top SNPs after they are shifted. 
							looped_chr_pos_ls = self.get_looped_chr_pos_ls(top_snp_index_ls, permData.no_of_total_snps, permData.total_chr_pos_ar, \
																	shift=shift)
							#after shifting (permutation), how many are close to candidate genes
							looped_candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(candidate_gene_set, \
																						looped_chr_pos_ls, \
																						pd.snps_context_wrapper)
							new_candidate_sample_size = len(looped_candidate_gene_snp_index_ls)
							new_candidate_gw_size = result.candidate_gw_size	#same as observed
						else:
							top_snp_chr_pos_ls = permData.candidate_gene_snp_chr_pos_ls + permData.non_candidate_gene_snp_chr_pos_ls
							
							if random_candidate_gene_snp_gw_index_ls is None:	#2008-10-31 if it's None, generate it. same for every simulation
								random_candidate_gene_snp_gw_index_ls = self.get_candidate_gene_snp_index_ls(random_candidate_gene_set,\
																				permData.total_chr_pos_ar, pd.snps_context_wrapper)
							random_candidate_gene_snp_sample_index_ls = self.get_candidate_gene_snp_index_ls(random_candidate_gene_set, \
																				top_snp_chr_pos_ls, pd.snps_context_wrapper)
							new_candidate_sample_size = len(random_candidate_gene_snp_sample_index_ls)
							new_candidate_gw_size = len(random_candidate_gene_snp_gw_index_ls)
						null_data = Stock_250kDB.TopSNPTestRMNullData(observed=result,\
																	candidate_sample_size=new_candidate_sample_size,\
																	candidate_gw_size=new_candidate_gw_size,\
																	run_no=run_no,\
																	null_distribution_type_id=pd.null_distribution_type_id)
						null_data_ls.append(null_data)
						session.save(null_data)	#put in the session cache
						if commit:
							session.flush()
		elif pd.null_distribution_type_id==1:
			for cutoff in cutoff_ls:
				if cutoff_type==1:
					pd.no_of_top_snps = cutoff
				elif cutoff_type==2:
					pd.min_score = cutoff
				return_data = self.getTestResult(session, rm, TestResultClass, pd)
				if return_data.result and return_data.result.id is None:
					result_ls.append(return_data.result)
		
		else:
			sys.stderr.write("null_distribution_type %s not supported.\n"%(pd.null_distribution_type_id))
			return None
		if self.debug:
			sys.stderr.write("Done.\n")
		return_data = PassingData(result_ls=result_ls, null_data_ls=null_data_ls)
		return return_data
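
For null_distribution_type_id==2 ("gw-looping"), the method shifts every top-SNP index by one random offset, wrapping around the genome, and re-counts how many of the shifted positions land near candidate genes; get_looped_chr_pos_ls performs the shift-and-lookup step. A hypothetical sketch of what that step amounts to (the real helper lives elsewhere in this codebase):

# Hypothetical sketch of the circular-shift step behind the gw-looping null.
import random
import numpy

def looped_chr_pos_ls(top_snp_index_ls, no_of_total_snps, total_chr_pos_ar, shift=None):
    # shift every top-SNP index by the same random offset, wrapping around,
    # and return the (chromosome, position) tuples at the shifted indices
    if shift is None:
        shift = random.randint(1, no_of_total_snps)
    shifted_index_ls = (numpy.asarray(top_snp_index_ls) + shift) % no_of_total_snps
    return [total_chr_pos_ar[i] for i in shifted_index_ls]

# example: 10 SNPs genome-wide, top SNPs at indices 0, 3 and 9, shift of 4
total_chr_pos_ar = [(1, pos) for pos in range(100, 200, 10)]
print(looped_chr_pos_ls([0, 3, 9], 10, total_chr_pos_ar, shift=4))   # [(1, 140), (1, 170), (1, 130)]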
Example #11
	def add2DB(self, db=None, short_name=None, phenotype_method_id=None, call_method_id=None, data_description=None, \
				method_description=None, comment=None, inputFname=None, user=None, results_method_type_id=None, \
				analysis_method_id=None, results_method_type_short_name=None, data_dir=None, commit=0,\
				cnv_method_id=None):
		"""
		2012.12.28 overhaul
		2012.6.6
			pass db to getOneResultJsonData()
		2012.3.9
			add locus_type_id to ResultsMethod
		2011-2-22
			add argument cnv_method_id
			deal with the association file format change. locus is now identified by Snps.id or CNV.id
		2010-5-3
			becomes classmethod
			store the json structure of top 10000 SNPs from the db_entry into db
		2008-09-30
			don't save results_method into database if something bad happened when getting data out of the file.
		2008-09-09
			directly copy the result file if analysis_method_id==13
		2008-08-19
			automatically generate short_name if it's NULL
		2008-07-16
			adjust to new Elixir-based db api.
			new analysis_method_id is added to results_method.
		2008-05-30
			go to output_dir
			drop copyAndReformatResultFile()
			use store_file()
		2008-05-26
			add results_method_type_id and results_method_type_short_name
		2008-05-24
			to conveniently wrap up all code so that both this program and Plone can call it
		"""
		session = db.session
		session.begin()
		
		
		rmt = Stock_250kDB.ResultsMethodType.get(results_method_type_id)
		if not rmt and results_method_type_short_name is not None:	#create a new results method type
			rmt = Stock_250kDB.ResultsMethodType(short_name=results_method_type_short_name)
			session.add(rmt)
		
		if not rmt:
			sys.stderr.write("No results method type available for results_method_type_id=%s.\n"%results_method_type_id)
			sys.exit(3)
		
		if call_method_id:	#2012.6.6
			cm = Stock_250kDB.CallMethod.query.get(call_method_id)
			locus_type_id = cm.locus_type_id
		else:
			cm = None
			locus_type_id = None
		
		db_entry = db.checkResultsMethod(call_method_id=call_method_id, phenotype_method_id=phenotype_method_id, \
										analysis_method_id=analysis_method_id, \
			cnv_method_id=cnv_method_id, accession_set_id=None, results_method_type_id=results_method_type_id)
		if db_entry:
			sys.stderr.write("There is already an entry in results_method (id=%s) with same (call_method_id, phenotype_method_id, analysis_method_id, results_method_type_id)=(%s, %s, %s, %s).\n"\
							%(db_entry.id, call_method_id, phenotype_method_id, analysis_method_id, results_method_type_id))
			sys.exit(2)
		
		db_entry = db.getResultsMethod(data_dir=data_dir, call_method_id=call_method_id, phenotype_method_id=phenotype_method_id, \
									analysis_method_id=analysis_method_id, \
			cnv_method_id=cnv_method_id, accession_set_id=None, results_method_type_id=results_method_type_id,\
			method_description=method_description, no_of_accessions=None, \
			no_of_loci=None, filename=None, original_filename=inputFname, \
			data_description=data_description, comment=comment, created_by=user, locus_type_id=locus_type_id)	#2012.3.9
		
		if commit:
			db_entry.filename = os.path.join(db.data_dir, db_entry.constructRelativePath(data_dir=data_dir))
			localAbsPath = os.path.join(data_dir, db_entry.constructRelativePath(data_dir=data_dir))
			
			if db_entry.analysis_method_id==13:
				self.srcFilenameLs.append(inputFname)
				self.dstFilenameLs.append(localAbsPath)
				exit_code = self.copyResultsFile(db, inputFname, db_entry, user=user, output_fname=localAbsPath)
			else:
				#2013.1.10 add some db_entry attributes to the hdf5 file
				db.addAttributesToResultFile(db_entry=db_entry, inputFname=inputFname)
				inputFileBasename = os.path.basename(inputFname)
				#moveFileIntoDBAffiliatedStorage() will also set db_entry.path
				exit_code = db.moveFileIntoDBAffiliatedStorage(db_entry=db_entry, filename=inputFileBasename, \
									inputDir=os.path.split(inputFname)[0], \
									outputDir=data_dir,\
									relativeOutputDir=None, shellCommand='cp -rL', \
									srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
									constructRelativePathFunction=db_entry.constructRelativePath, data_dir=data_dir)
				
				#exit_code = self.copyAndReformatResultFile(db, inputFname, db_entry, user=user, output_fname=localAbsPath)
			if exit_code==0:
				session.add(db_entry)
				if db_entry.file_size is None:
					db.updateDBEntryPathFileSize(db_entry=db_entry, data_dir=data_dir)
				if db_entry.md5sum is None:
					db.updateDBEntryMD5SUM(db_entry=db_entry, data_dir=data_dir)
				
				# 2010-5-3 store the json structure of top 10000 SNPs from the db_entry into db
				no_of_top_snps = 10000
				if db_entry.analysis_method.min_maf is not None:
					min_MAF = db_entry.analysis_method.min_maf
				else:
					min_MAF = 0
				try:
					#2011-2-24
					if call_method_id:	#call method, snp dataset
						db_id2chr_pos = db.snp_id2chr_pos
					elif cnv_method_id:
						if db._cnv_method_id!=cnv_method_id:
							db.cnv_id2chr_pos = cnv_method_id
						db_id2chr_pos = db.cnv_id2chr_pos
					pdata = PassingData(db_id2chr_pos=db_id2chr_pos)
					json_data = db.getOneResultJsonData(result_id=db_entry.id, min_MAF=min_MAF, no_of_top_snps=no_of_top_snps, \
													pdata=pdata, data_dir=data_dir)	#2011-2-24 pass pdata to getOneResultJsonData()
					rm_json = Stock_250kDB.ResultsMethodJson(min_maf=min_MAF, no_of_top_snps=no_of_top_snps)
					rm_json.result = db_entry
					rm_json.json_data = json_data
					session.add(rm_json)
				except:
					sys.stderr.write('Except in saving results_method_json (aborted): %s\n'%repr(sys.exc_info()))
					import traceback
					traceback.print_exc()
					session.rollback()
					self.cleanUpAndExitOnFailure(exitCode=3)
			else:	#something bad happened when getting data out of the file. don't save this results_method.
				session.delete(db_entry)
				sys.stderr.write("Error: copy file from %s to %s failed.\n"%(inputFname, localAbsPath ))
				session.rollback()
				self.cleanUpAndExitOnFailure(exitCode=3)
			session.flush()
			session.commit()
		else:	#default is also rollback(). to demonstrate good programming
			session.rollback()
		self.reset_marker_pos2snp_id()
Example #12
    def run(self):
        """
		
		"""
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.user,
                                       password=self.passwd,
                                       hostname=self.hostname,
                                       database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        if self.debug:
            import pdb
            pdb.set_trace()
        chr_pos2ancestral_allele = self.get_chr_pos2ancestral_allele(
            self.ancestral_allele_fname)
        pheno_data = SNPData(input_fname=self.phenotype_fname,
                             turn_into_integer=0,
                             ignore_2nd_column=1)
        pheno_data = self.process_phenotype_data(pheno_data)

        geno_data = SNPData(input_fname=self.genotype_fname,
                            turn_into_array=1,
                            matrix_data_type=int,
                            ignore_2nd_column=1)

        query = Stock_250kDB.ResultsMethod.query.filter_by(
            call_method_id=self.call_method_id).filter_by(
                analysis_method_id=self.analysis_method_id).filter_by(
                    phenotype_method_id=self.phenotype_method_id)
        if query.count() == 1:
            rm = query.first()
        elif query.count() > 1:
            sys.stderr.write(
                "Warning: more than 1 results_method for call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n"
                % (self.call_method_id, self.analysis_method_id,
                   self.phenotype_method_id))
            rm = query.first()
        else:
            sys.stderr.write(
                "Error: no results_method for call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n"
                % (self.call_method_id, self.analysis_method_id,
                   self.phenotype_method_id))
            sys.exit(3)

        phenotype_ls_data = self.get_phenotype_ls(rm, self.no_of_top_snps, chr_pos2ancestral_allele, pheno_data, geno_data, \
                  self.min_MAF, results_directory=self.input_dir)

        import pylab
        pylab.clf()
        hist_patch_ls = []
        legend_ls = []
        if len(phenotype_ls_data.ancestral_allele_phenotype_ls) > 2:
            n1 = pylab.hist(phenotype_ls_data.ancestral_allele_phenotype_ls,
                            100,
                            alpha=0.4,
                            normed=1)
            hist_patch_ls.append(
                n1[2][0])  #first patch in all patches of a histogram
            legend_ls.append('ancestral allele')
        if len(phenotype_ls_data.derived_allele_phenotype_ls) > 2:
            n2 = pylab.hist(phenotype_ls_data.derived_allele_phenotype_ls,
                            100,
                            alpha=0.4,
                            normed=1,
                            facecolor='r')
            hist_patch_ls.append(n2[2][0])
            legend_ls.append('derived allele')
        pylab.legend(hist_patch_ls, legend_ls)
        if self.output_fname_prefix:
            pylab.savefig('%s.svg' % self.output_fname_prefix, dpi=300)
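
The plotting code above uses the old pylab interface; in current matplotlib the normed=1 keyword of hist() has been replaced by density=True, and n1[2][0] picks the first patch of each histogram for the legend. A rough modern equivalent with invented data, for comparison only:

# Hypothetical modern-matplotlib version of the overlaid histograms above.
import numpy as np
import matplotlib
matplotlib.use("Agg")   # render to file without a display
import matplotlib.pyplot as plt

ancestral = np.random.normal(0.0, 1.0, 500)   # placeholder phenotype values
derived = np.random.normal(0.5, 1.0, 500)

fig, ax = plt.subplots()
ax.hist(ancestral, bins=100, alpha=0.4, density=True, label="ancestral allele")
ax.hist(derived, bins=100, alpha=0.4, density=True, color="r", label="derived allele")
ax.legend()
fig.savefig("phenotype_hist.svg", dpi=300)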
Example #13
	def run(self):
		"""
		2008-08-19
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		self.db_250k = db
		session = db.session
		
		total_gene_id_ls = get_total_gene_ls(db.metadata.bind)
		no_of_total_genes = len(total_gene_id_ls)
		#no_of_total_genes = self.getNoOfTotalGenes(db, self.gene_table, self.tax_id)
		
		#if self.commit:
		#	session.begin()
		_type = self.getTopSNPTestType(self.get_closest, self.min_MAF, \
									self.allow_two_sample_overlapping, self.results_type,\
									self.test_type_id, self.null_distribution_type_id)
		
		snps_context_wrapper = self.dealWithSnpsContextWrapper(self.snps_context_picklef, self.min_distance, self.get_closest)
		pd = PassingData(list_type_id=self.list_type_id,
						snps_context_wrapper=snps_context_wrapper, \
						no_of_total_genes=no_of_total_genes,
						results_directory=self.results_directory, \
						min_MAF=self.min_MAF,
						get_closest=self.get_closest,
						min_distance=self.min_distance,\
						no_of_top_snps=self.no_of_top_snps,
						min_sample_size=self.min_sample_size,
						test_type_id=self.test_type_id, \
						results_type=self.results_type,
						no_of_permutations=self.no_of_permutations,\
						no_of_min_breaks=self.no_of_min_breaks,
						type_id=_type.id,\
						null_distribution_type_id=self.null_distribution_type_id,\
						allow_two_sample_overlapping=self.allow_two_sample_overlapping,
						total_gene_id_ls=total_gene_id_ls,\
						min_score=self.min_score,
						commit=self.commit)
		if getattr(self, 'output_fname', None):
			writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t')
			header_row = []
			for column in Stock_250kDB.CandidateGeneTopSNPTest.c.keys():
				header_row.append(column)
			writer.writerow(header_row)
		else:
			writer = None
		
		#2008-10-31 setting up list accordingly
		if self.min_score:
			pd.min_score_ls = [self.min_score]
		else:
			pd.no_of_top_snps_ls = [self.no_of_top_snps]
		for results_id in self.results_id_ls:
			pd.results_id = results_id
			#self.runEnrichmentTestToGetNullData(session, pd)
			
			return_data = self.runHGTest(pd)
			result = return_data.result_ls[0]
			if result is not None:
				result.type = _type	#assign the type here
				row = []
				for column in result.c.keys():
					row.append(getattr(result, column))
					print '%s: %s'%(column, row[-1])
				if writer:
					writer.writerow(row)
				session.save(result)
				if self.commit:
					session.flush()
    def saveDataIntoDB(self, session, genome_wide_result_ls, hist_type, threshold_type, pvalue_matching_data, list_type_id, \
        r1_pvalue_cutoff=3, r2_pvalue_cutoff=5, null_distribution_type_id=1, candidate_gene_set=set(), snps_context_wrapper=None):
        """
		2009-10-3
			If null_distribution_type_id=2, calculate the permutation pvalue before saving into db.
			If null_distribution_type_id=1, pvalue = None. maybe 2X2 table test (Fisher test) 
		2009-4-16
		"""
        sys.stderr.write("Saving enrichment data into db ...\n")
        results_id1 = genome_wide_result_ls[0].results_id
        results_id2 = genome_wide_result_ls[1].results_id

        pvalue_int_pair_set = set(
            pvalue_matching_data.pvalue_int_pair2count_non_candidate.keys())
        pvalue_int_pair_set.update(
            set(pvalue_matching_data.pvalue_int_pair2count_candidate.keys()))
        for pvalue_int_pair in pvalue_int_pair_set:
            if pvalue_int_pair[0] == 0:
                r1_min_score = 0
                r1_max_score = r1_pvalue_cutoff
            else:
                r1_min_score = r1_pvalue_cutoff
                r1_max_score = None
            if pvalue_int_pair[1] == 0:
                r2_min_score = 0
                r2_max_score = r2_pvalue_cutoff
            else:
                r2_min_score = r2_pvalue_cutoff
                r2_max_score = None
            candidate_sample_size = pvalue_matching_data.pvalue_int_pair2count_candidate.get(
                pvalue_int_pair)
            non_candidate_sample_size = pvalue_matching_data.pvalue_int_pair2count_non_candidate.get(
                pvalue_int_pair)
            candidate_gw_size = len(
                pvalue_matching_data.pvalue_ls1_in_candidate)
            non_candidate_gw_size = len(
                pvalue_matching_data.pvalue_ls1_in_non_candidate)
            if candidate_sample_size is not None and non_candidate_sample_size is not None and candidate_gw_size > 0 and non_candidate_sample_size > 0:
                enrichment_ratio = (
                    candidate_sample_size * non_candidate_gw_size) / float(
                        non_candidate_sample_size * candidate_gw_size)
            else:
                enrichment_ratio = None

            ### 2009-10-2
            if null_distribution_type_id == 1:
                pvalue = None  # need to figure out a way to calculate the pvalue , maybe 2X2 table test (Fisher test)
            elif null_distribution_type_id == 2:
                cand_snp_index_ls = pvalue_matching_data.pvalue_int_pair2cand_snp_index_ls.get(
                    pvalue_int_pair, numpy.array([], numpy.int)
                )  # without numpy.array around [], [] would be treated as numpy.float64 by default.
                non_cand_snp_index_ls = pvalue_matching_data.pvalue_int_pair2non_cand_snp_index_ls.get(
                    pvalue_int_pair, numpy.array([], numpy.int))
                top_snp_index_ls = numpy.hstack(
                    (cand_snp_index_ls, non_cand_snp_index_ls))
                if len(top_snp_index_ls) == 0:
                    pvalue = None
                else:
                    return_data = self.get_enrichment_pvalue_by_gw_looping(candidate_sample_size, top_snp_index_ls, candidate_gene_set, \
                          snps_context_wrapper, \
                          pvalue_matching_data.no_of_total_snps, total_chr_pos_ar=pvalue_matching_data.total_chr_pos_ar, \
                          no_of_permutations=20000, no_of_min_breaks=30)
                    pvalue = return_data.pvalue
                    no_of_tests = return_data.no_of_tests
                    no_of_tests_passed = return_data.no_of_tests_passed

            entry = Stock_250kDB.CmpEnrichmentOfTwoAnalysisMethods(results_id1=results_id1, results_id2=results_id2, list_type_id=list_type_id,\
                      type=hist_type, r1_min_score=r1_min_score, r1_max_score=r1_max_score,\
                      r2_min_score=r2_min_score, r2_max_score=r2_max_score,\
                      candidate_sample_size=candidate_sample_size, non_candidate_sample_size=non_candidate_sample_size,\
                      candidate_gw_size=candidate_gw_size, non_candidate_gw_size=non_candidate_gw_size,\
                      enrichment_ratio=enrichment_ratio, pvalue=pvalue)
            entry.threshold_type = threshold_type
            session.save(entry)
            #session.flush()
        sys.stderr.write("Done.\n")
Example #15
    def findCNVcontext(self,
                       db_250k,
                       genomeRBDict,
                       cnv_method_id=None,
                       compareIns=None,
                       max_distance=50000,
                       debug=0,
                       param_obj=None):
        """
		2011-3-25
			cast row.chromosome (from db) into str type.
		2010-10-3
			bug fixed: (chr, start, stop) is not unique. There are genes with the same coordinates.
		2010-8-18
		"""
        sys.stderr.write("Finding CNV context ... \n")
        session = db_250k.session
        TableClass = Stock_250kDB.CNV
        query = TableClass.query.filter_by(cnv_method_id=cnv_method_id)
        for row in query:
            segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=str(row.chromosome), \
                span_ls=[row.start, row.stop], \
                min_reciprocal_overlap=0.0000001, ) #min_reciprocal_overlap doesn't matter here.
            # it's decided by compareIns.
            node_ls = []
            genomeRBDict.findNodes(segmentKey,
                                   node_ls=node_ls,
                                   compareIns=compareIns)
            for node in node_ls:
                geneSegKey = node.key
                for oneGeneData in node.value:
                    # geneSegKey.span_ls expands 20kb upstream or downstream of the gene.
                    overlapData = get_overlap_ratio(segmentKey.span_ls, \
                              [oneGeneData.gene_start, oneGeneData.gene_stop])
                    overlapFraction1 = overlapData.overlapFraction1
                    overlapFraction2 = overlapData.overlapFraction2
                    overlap_length = overlapData.overlap_length
                    overlap_start_pos = overlapData.overlap_start_pos
                    overlap_stop_pos = overlapData.overlap_stop_pos
                    if overlap_length > 0:  #use fraction of length as coordinates.
                        gene_length = oneGeneData.gene_stop - oneGeneData.gene_start + 1
                        try:
                            if oneGeneData.strand == '+1':
                                term5_disp_pos = abs(overlap_start_pos -
                                                     oneGeneData.gene_start
                                                     ) / float(gene_length)
                                term3_disp_pos = abs(overlap_stop_pos -
                                                     oneGeneData.gene_start +
                                                     1) / float(gene_length)
                            else:
                                term5_disp_pos = abs(oneGeneData.gene_stop -
                                                     overlap_stop_pos) / float(
                                                         gene_length)
                                term3_disp_pos = abs(oneGeneData.gene_stop -
                                                     overlap_start_pos +
                                                     1) / float(gene_length)
                        except:
                            import pdb
                            pdb.set_trace()
                    else:  #no overlap at all
                        term3_disp_pos = None
                        if oneGeneData.strand == '+1':
                            if row.stop <= oneGeneData.gene_start:  #upstream
                                term5_disp_pos = row.stop - oneGeneData.gene_start
                            elif row.start >= oneGeneData.gene_stop:  # downstream
                                term5_disp_pos = row.start - oneGeneData.gene_stop
                        else:
                            if row.stop <= oneGeneData.gene_start:  #downstream
                                term5_disp_pos = oneGeneData.gene_start - row.stop
                            elif row.start >= oneGeneData.gene_stop:  # upstream
                                term5_disp_pos = oneGeneData.gene_stop - row.start
                    cnv_context = Stock_250kDB.CNVContext.query.filter_by(
                        cnv_id=row.id).filter_by(
                            gene_id=oneGeneData.gene_id).first()

                    if cnv_context:
                        param_obj.no_of_cnv_contexts_already_in_db += 1
                    else:
                        cnv_context = Stock_250kDB.CNVContext(cnv_id=row.id, gene_id = oneGeneData.gene_id, \
                                gene_strand=oneGeneData.strand, term5_disp_pos=term5_disp_pos, \
                                term3_disp_pos=term3_disp_pos,\
                                overlap_length=overlap_length, \
                                overlap_fraction_in_cnv=overlapFraction1, overlap_fraction_in_gene=overlapFraction2)
                        session.add(cnv_context)
                        param_obj.no_of_into_db += 1
                    param_obj.no_of_total_contexts += 1

                    for geneCommentaryRBDict in oneGeneData.geneCommentaryRBDictLs:
                        gene_box_node_ls = []
                        geneCommentaryRBDict.findNodes(
                            segmentKey,
                            node_ls=gene_box_node_ls,
                            compareIns=compareIns)
                        for gene_box_node in gene_box_node_ls:
                            gene_box_key = gene_box_node.key
                            overlapData = get_overlap_ratio(
                                segmentKey.span_ls, gene_box_key.span_ls)
                            overlapFraction1 = overlapData.overlapFraction1
                            overlapFraction2 = overlapData.overlapFraction2
                            overlap_length = overlapData.overlap_length
                            overlap_start_pos = overlapData.overlap_start_pos
                            overlap_stop_pos = overlapData.overlap_stop_pos
                            cnv_annotation = Stock_250kDB.CNVAnnotation.query.filter_by(cnv_id=row.id).filter_by(cnv_context_id=cnv_context.id).\
                             filter_by(gene_commentary_id= geneCommentaryRBDict.gene_commentary_id).\
                             filter_by(gene_segment_id= gene_box_key.gene_segment_id).first()
                            if cnv_annotation:
                                param_obj.no_of_cnv_annotations_already_in_db += 1
                            else:
                                cnv_annotation = Stock_250kDB.CNVAnnotation(cnv_id=row.id, \
                                    gene_commentary_id = geneCommentaryRBDict.gene_commentary_id, \
                                    gene_segment_id=gene_box_key.gene_segment_id, label=gene_box_key.label, \
                                    utr_number = gene_box_key.utr_number, cds_number = gene_box_key.cds_number, \
                                    intron_number = gene_box_key.intron_number, exon_number = gene_box_key.exon_number,\
                                    overlap_length=overlap_length, \
                                    overlap_fraction_in_cnv=overlapFraction1, overlap_fraction_in_gene=overlapFraction2)
                                cnv_annotation.cnv_context = cnv_context
                                session.add(cnv_annotation)
                                param_obj.no_of_into_db += 1
                            param_obj.no_of_total_annotations += 1

                    if param_obj.no_of_into_db > 2000:
                        session.flush()
                        param_obj.no_of_into_db = 0
                        sys.stderr.write("\t %s/%s CNVContext(s) & %s/%s CNVAnnotation(s) already in db.\n"%(\
                           param_obj.no_of_cnv_contexts_already_in_db, param_obj.no_of_total_contexts, \
                           param_obj.no_of_cnv_annotations_already_in_db, param_obj.no_of_total_annotations))
        session.flush()
        session.expunge_all()
        sys.stderr.write("\t %s/%s CNVContext(s) & %s/%s CNVAnnotation(s) already in db.\n"%(\
              param_obj.no_of_cnv_contexts_already_in_db, param_obj.no_of_total_contexts, \
              param_obj.no_of_cnv_annotations_already_in_db, param_obj.no_of_total_annotations))
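
findCNVcontext leans on get_overlap_ratio to measure how much a CNV segment overlaps a gene span (overlap length plus the fraction of each span covered), then converts the overlap into strand-aware positions along the gene. A hypothetical sketch of the overlap arithmetic such a helper is assumed to perform (the real function lives elsewhere in this codebase):

# Hypothetical sketch of an overlap computation on inclusive (start, stop) spans.
def overlap_ratio(span1, span2):
    start1, stop1 = span1
    start2, stop2 = span2
    overlap_start = max(start1, start2)
    overlap_stop = min(stop1, stop2)
    overlap_length = max(0, overlap_stop - overlap_start + 1)
    fraction1 = overlap_length / float(stop1 - start1 + 1)   # fraction of span1 covered
    fraction2 = overlap_length / float(stop2 - start2 + 1)   # fraction of span2 covered
    return overlap_length, fraction1, fraction2, overlap_start, overlap_stop

# example: a CNV at 1000-1500 overlapping a gene at 1400-2400
print(overlap_ratio((1000, 1500), (1400, 2400)))   # (101, ~0.202, ~0.101, 1400, 1500)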
Example #16
    def submit_StrainxSNP_file2db(self,
                                  curs,
                                  input_fname,
                                  call_info_table,
                                  output_dir,
                                  method_id,
                                  user,
                                  chr_pos2db_id=None,
                                  **keywords):
        """
		2011-5-2
			curs is completely useless. keywords must contain the "db" key now.
		2011-2-27
			input file could use either db_id or chr_pos to identify locus.
		2010-10-13
			add argument chr_pos2db_id and **keywords
			it replaces the old snp ID (chr_pos_...) with id from table Stock_250kDB.Snps
		2008-1-5
			if the output_fname already exists, exit the program.
			if db insertion fails, delete the file written out and exit the program.
		2008-05-19
			submit the calls from a matrix file (Strain X SNP format, tsv, nucleotides in numbers) to db
		"""
        sys.stderr.write("Submitting %s to db ...\n" % (input_fname))
        db = keywords.get("db")  #2011-5-2
        output_dir = os.path.join(output_dir, 'method_%s' % method_id)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        #2013.1.18 set priorTAIRVersion =True, locus_type_id=1, because the input is in TAIR8
        db.chr_pos2snp_id = (True, 1)

        reader = csv.reader(openGzipFile(input_fname), delimiter='\t')
        header = reader.next()
        counter = 0
        for row in reader:
            ecotype_id, array_id = row[:2]
            sys.stderr.write("%s\tAssign new call info id to array id=%s ." %
                             ('\x08' * 80, array_id))

            call_info = Stock_250kDB.CallInfo(array_id=array_id,
                                              method_id=method_id,
                                              created_by=user)
            db.session.add(call_info)
            db.session.flush()
            output_fname = os.path.join(output_dir,
                                        '%s_call.tsv' % call_info.id)
            call_info.filename = output_fname
            db.session.add(call_info)
            db.session.flush()

            #new_call_id = self.get_new_call_id(curs, call_info_table, array_id, method_id)
            #if new_call_id!=-1:

            if os.path.isfile(output_fname):
                sys.stderr.write(
                    "Error: %s already exists. Check why the file exists while db has no record.\n"
                    % output_fname)
                sys.exit(2)
            writer = csv.writer(openGzipFile(output_fname, 'w'),
                                delimiter='\t')
            writer.writerow(['SNP_ID', array_id])
            for i in range(2, len(row)):
                snp_id = header[i]
                if chr_pos2db_id:  #2010-10-13
                    db_id = db.get_db_id_given_chr_pos2db_id(snp_id)
                else:
                    db_id = snp_id
                if db_id is not None:
                    writer.writerow([db_id,
                                     number2nt[int(row[i])]])  #translate
            del writer
            """
			try:
				
				#curs.execute("insert into %s(id, filename, array_id, method_id, created_by) values (%s, '%s', %s, %s, '%s')"%\
				#			(call_info_table, new_call_id, output_fname, array_id, method_id, user))
			except:
				traceback.print_exc()
				sys.stderr.write('%s.\n'%repr(sys.exc_info()))
				sys.stderr.write('Error encountered while inserting record into %s. Delete the file written.\n'%call_info_table)
				commandline = 'rm %s'%(output_fname)
				command_out = runLocalCommand(commandline)
				sys.exit(3)
			"""
            counter += 1
        del reader
        sys.stderr.write(" %s arrays. Done.\n" % counter)
    def run(self):
        """
		2008-12-08 if the plot under this configuration is already in db, abort only if the program is going to commit the database transaction.
		2008-10-19
			save figures in database if commit
		"""
        if self.debug:
            import pdb
            pdb.set_trace()
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.db_user,
                                       password=self.db_passwd,
                                       hostname=self.hostname,
                                       database=self.dbname,
                                       schema=self.schema)
        db.setup(create_tables=False)
        session = db.session
        #session.begin()

        if self.results_type == 1:
            ResultsClass = Stock_250kDB.ResultsMethod
            snps_context_wrapper = self.dealWithSnpsContextWrapper(
                self.snps_context_picklef, self.min_distance, self.get_closest)
        elif self.results_type == 2:
            ResultsClass = Stock_250kDB.ResultsByGene
        else:
            sys.stderr.write("Invalid results type : %s.\n" %
                             self.results_type)
            return None

        hist_type = self.getHistType(self.call_method_id, self.min_distance, self.get_closest, self.min_MAF, \
               self.allow_two_sample_overlapping, self.results_type, self.null_distribution_type_id)

        candidate_gene_list = self.getGeneList(self.list_type_id)
        if len(candidate_gene_list) < self.min_sample_size:
            sys.stderr.write("Candidate gene list of %s too small: %s.\n" %
                             (self.list_type_id, len(candidate_gene_list)))
            sys.exit(4)
        #candidate_gene_list = []		#2009-01-12 just to plot the histogram of pvalue

        candidate_gene_set = Set(candidate_gene_list)
        list_type = Stock_250kDB.GeneListType.get(self.list_type_id)
        if list_type is None:
            sys.exit(3)

        phenotype_id2results_id_ls = self.getResultsIDLs(db, ResultsClass, self.results_type, self.phenotype_id_ls, \
                    self.min_distance, self.get_closest, self.min_MAF, self.call_method_id)


        param_data = PassingData(results_directory=self.data_dir, candidate_gene_list=candidate_gene_list, \
         min_MAF=self.min_MAF, allow_two_sample_overlapping=self.allow_two_sample_overlapping, need_the_value=1, \
         do_log10_transformation=False, data_dir=self.data_dir)
        #need_the_value means to get the pvalue/score
        #force no log10 transformation. otherwise, transformation based on analysis_method
        if self.null_distribution_type_id == 2 or self.null_distribution_type_id == 3:  #gw-looping or random gene list
            snp_info = DrawSNPRegion.getSNPInfo(db)
            chr_pos_ls = [(row.chromosome, row.position)
                          for row in snp_info.data_ls]
            candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(
                candidate_gene_set, chr_pos_ls, snps_context_wrapper)
            no_of_snps = len(snp_info.data_ls)
            no_of_permutations = no_of_snps / len(
                candidate_gene_snp_index_ls) + 1
            param_data.chr_pos2index = snp_info.chr_pos2index  #pass to getGenomeWideResultFromFile
            if self.null_distribution_type_id == 2:
                non_candidate_gene_snp_index_ls = self.get_non_candidate_gene_snp_index_ls_by_permutation(
                    candidate_gene_snp_index_ls, no_of_snps,
                    no_of_permutations)

            elif self.null_distribution_type_id == 3:
                gene_id_ls = get_total_gene_ls(db.metadata.bind)
                no_of_candidate_genes = len(candidate_gene_set)
                non_candidate_gene_snp_index_ls = numpy.zeros(0, numpy.int)
                while len(non_candidate_gene_snp_index_ls) < no_of_snps:
                    non_candidate_gene_set = Set(
                        random.sample(gene_id_ls, no_of_candidate_genes))
                    _non_candidate_gene_snp_index_ls = self.get_candidate_gene_snp_index_ls(
                        non_candidate_gene_set, chr_pos_ls,
                        snps_context_wrapper)
                    non_candidate_gene_snp_index_ls = numpy.hstack(
                        (non_candidate_gene_snp_index_ls,
                         _non_candidate_gene_snp_index_ls))

        for phenotype_id, results_id_ls in phenotype_id2results_id_ls.iteritems(
        ):
            if hist_type.id:  #hist_type already in database
                rows = Stock_250kDB.ScoreRankHistogram.query.filter_by(phenotype_method_id=phenotype_id).\
                 filter_by(list_type_id=self.list_type_id).filter_by(hist_type_id=hist_type.id)
                if rows.count(
                ) > 0 and self.commit:  #2008-12-08 only skip if the database transaction is gonna commit.
                    row = rows.first()
                    sys.stderr.write("Histogram already in database. id=%s, phenotype_id=%s, list_type_id=%s, hist_type_id=%s.\n"%\
                        (row.id, row.phenotype_method_id, row.list_type_id, row.hist_type_id))
                    continue
            phenotype_method = Stock_250kDB.PhenotypeMethod.get(phenotype_id)
            if not phenotype_method:
                continue
            score_rank_data_ls = []
            sys.stderr.write("Checking phenotype %s (%s) on list_type %s (%s) ...\n"%\
                (phenotype_method.id, phenotype_method.short_name, list_type.id, list_type.short_name))

            for results_id in results_id_ls:
                try:
                    rm = ResultsClass.get(results_id)
                    score_rank_data = None
                    if self.null_distribution_type_id == 1:
                        if self.results_type == 1:
                            permData = self.prepareDataForPermutationRankTest(
                                rm, snps_context_wrapper, param_data)
                            if not permData:
                                continue
                            score_rank_data = PassingData(candidate_score_ls=permData.candidate_gene_snp_value_ls, \
                                  candidate_rank_ls=permData.candidate_gene_snp_rank_ls,\
                              non_candidate_score_ls=permData.non_candidate_gene_snp_value_ls, non_candidate_rank_ls=permData.non_candidate_gene_snp_rank_ls,\
                              analysis_method=rm.analysis_method)
                            del permData
                        elif self.results_type == 2:
                            score_rank_data = self.getScoreRankFromRBG(
                                rm, candidate_gene_set, self.data_dir)
                    elif self.null_distribution_type_id == 2 or self.null_distribution_type_id == 3:
                        genome_wide_result = db.getResultMethodContent(
                            rm.id,
                            param_data.results_directory,
                            param_data.min_MAF,
                            pdata=param_data)
                        if not genome_wide_result:
                            continue
                        score_rank_data = self.getScoreRankFromPermIndexLs(
                            genome_wide_result, candidate_gene_snp_index_ls,
                            non_candidate_gene_snp_index_ls)
                        if score_rank_data:
                            score_rank_data.analysis_method = rm.analysis_method

                    if score_rank_data:
                        score_rank_data_ls.append(score_rank_data)
                except:
                    sys.stderr.write(
                        "Exception happened for results_id=%s, phenotype_id=%s.\n"
                        % (results_id, phenotype_id))
                    traceback.print_exc()
                    sys.stderr.write('%s.\n' % repr(sys.exc_info()))
                    continue
            if score_rank_data_ls:

                score_png_data, score_svg_data = self.plotHistForOnePhenotype(
                    phenotype_method,
                    list_type,
                    score_rank_data_ls,
                    self.output_dir,
                    data_type='score',
                    commit=self.commit)
                rank_png_data, rank_svg_data = self.plotHistForOnePhenotype(
                    phenotype_method,
                    list_type,
                    score_rank_data_ls,
                    self.output_dir,
                    data_type='rank',
                    commit=self.commit)
                if self.commit:
                    score_rank_hist = Stock_250kDB.ScoreRankHistogram(
                        phenotype_method_id=phenotype_id,
                        list_type_id=list_type.id)
                    score_rank_hist.hist_type = hist_type
                    score_rank_hist.score_hist = score_png_data.getvalue()
                    score_rank_hist.score_hist_svg = score_svg_data.getvalue()
                    score_rank_hist.rank_hist = rank_png_data.getvalue()
                    score_rank_hist.rank_hist_svg = rank_svg_data.getvalue()
                    session.save(score_rank_hist)
                    session.flush()
                    del score_png_data, score_svg_data, rank_png_data, rank_svg_data
        """
Example #18
    def run(self):
        """
		2008-04-25
			return None if QC_method_id==0
		2008-04-20
			for plone to call it just to get row_id2NA_mismatch_rate
		"""
        #database connection, etc.
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.user,
                                       password=self.passwd,
                                       hostname=self.hostname,
                                       database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        session.begin()
        #transaction = session.create_transaction()

        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, self.QCMethod_class)
        qm = self.QCMethod_class.query.get(self.QC_method_id)  #2009-5-20

        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        curs = conn.cursor()
        self.curs = curs
        if self.debug:
            import pdb
            pdb.set_trace()

        readme = formReadmeObj(sys.argv, self.ad, Stock_250kDB.README)
        session.add(readme)

        QC_method_id2snps_table = self.QC_method_id2snps_table

        if self.QC_method_id == 0:
            self.cal_independent_NA_rate(db, self.min_probability, readme)
            row_id2NA_mismatch_rate = None
        else:
            #from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.cmp_data_filename, ignore_het=qm.ignore_het)
            strain_acc_list = map(
                int, strain_acc_list
            )  #it's ecotype_id; cast to integer to be compatible with the ecotype_id_ls later fetched from db
            snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
                data_matrix=data_matrix, snps_table=QC_method_id2snps_table.get(self.QC_method_id),\
                ignore_het=qm.ignore_het) #category_list is not used. 05/20/09 ignore_het is useless because data_matrix is provided.
            """
			if self.input_dir and os.path.isdir(self.input_dir):
				#04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
				#no submission to db
				call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
			"""
            if self.input_dir and os.path.isfile(self.input_dir):  #it's file
                call_info_id2fname = None
            else:
                if self.run_type == 2:  #no filtering on call_info entries that have been QCed.
                    filter_calls_QCed = 0
                elif self.run_type == 1:
                    filter_calls_QCed = 1
                    self.max_call_info_mismatch_rate = 1  #don't use this when doing accession-wise QC
                else:
                    sys.stderr.write("run_type=%s is not supported.\n" %
                                     self.run_type)
                    sys.exit(5)
                call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
                          filter_calls_QCed, self.max_call_info_mismatch_rate, self.debug,\
                          min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs, input_dir=self.input_dir)
                call_info_id2fname = call_data.call_info_id2fname
                call_info_ls_to_return = call_data.call_info_ls_to_return
            if self.run_type == 2:
                snps_name2snps_id = self.get_snps_name2snps_id(db)
            else:
                snps_name2snps_id = None

            if call_info_id2fname:
                db_id2chr_pos = db.getSNPID2ChrPos()  #2011-22
                from DB_250k2data import DB_250k2Data
                db_id2index = DB_250k2Data.getSNPID2index(
                    call_info_id2fname.values()[0][1], db_id2chr_pos)
                if self.one_by_one and self.run_type == 1:  #one_by_one only for QC by accession
                    row_id2NA_mismatch_rate = {}
                    row_id12row_id2 = {}
                    counter = 0
                    for call_info_id, value in call_info_id2fname.iteritems():
                        counter += 1
                        print "No", counter
                        tmp_dict = {}
                        tmp_dict[call_info_id] = value
                        pdata = self.read_call_matrix(
                            tmp_dict,
                            self.min_probability,
                            db_id2chr_pos=db_id2chr_pos,
                            db_id2index=db_id2index)
                        #05/20/09 no need for qm.ignore_het because 250k is all homozygous
                        passingdata = self.qcDataMatrixVSsnpData(
                            pdata, snps_name2snps_id, snpData2, curs, session,
                            readme)
                        row_id2NA_mismatch_rate.update(
                            passingdata.row_id2NA_mismatch_rate)
                        row_id12row_id2.update(passingdata.row_id12row_id2)
                        del pdata

                        if self.debug and counter == 10:
                            break
                else:
                    pdata = self.read_call_matrix(call_info_id2fname,
                                                  self.min_probability,
                                                  db_id2chr_pos=db_id2chr_pos,
                                                  db_id2index=db_id2index)
                    #05/20/09 no need for qm.ignore_het because 250k is all homozygous
                    passingdata = self.qcDataMatrixVSsnpData(
                        pdata, snps_name2snps_id, snpData2, curs, session,
                        readme)
                    row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                    row_id12row_id2 = passingdata.row_id12row_id2
                    del pdata
            else:
                #input file is SNP by strain format. double header (1st two lines)
                header, snps_name_ls, category_list, data_matrix = read_data(
                    self.input_dir, double_header=1, ignore_het=qm.ignore_het)
                pdata = PassingData()
                pdata.ecotype_id_ls = header[0][2:]
                pdata.call_info_id_ls = header[1][2:]
                data_matrix = numpy.array(data_matrix)
                pdata.data_matrix = data_matrix.transpose()
                pdata.header = ['', ''
                                ] + snps_name_ls  #fake a header for SNPData
                passingdata = self.qcDataMatrixVSsnpData(
                    pdata, snps_name2snps_id, snpData2, curs, session, readme)
                row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                row_id12row_id2 = passingdata.row_id12row_id2
                del pdata

        if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate,
                                                self.output_fname)

        if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #row_id2NA_mismatch_rate might be None if it's method 0.
            self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.user, self.min_probability, \
                 row_id12row_id2, self.call_method_id, readme)
        if self.commit:
            curs.execute("commit")
            session.commit()
        else:
            session.rollback()

        self.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate  #for plone to get the data structure
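
row_id2NA_mismatch_rate, finally handed to submit_to_call_QC (Example #3), holds per-accession QC statistics: an NA rate over one call file and a mismatch rate over the pairs where both the 250k call and the reference dataset are non-NA. A hypothetical illustration of how the first six fields of NA_mismatch_ls could be computed (the relative_* fields from Example #3 are omitted):

# Hypothetical sketch of per-accession NA/mismatch statistics.
def qc_rates(calls, reference, na='NA'):
    no_of_totals = len(calls)
    no_of_nas = sum(1 for c in calls if c == na)
    pairs = [(c, r) for c, r in zip(calls, reference) if c != na and r != na]
    no_of_non_na_pairs = len(pairs)
    no_of_mismatches = sum(1 for c, r in pairs if c != r)
    na_rate = no_of_nas / float(no_of_totals) if no_of_totals else None
    mismatch_rate = no_of_mismatches / float(no_of_non_na_pairs) if no_of_non_na_pairs else None
    return na_rate, mismatch_rate, no_of_nas, no_of_totals, no_of_mismatches, no_of_non_na_pairs

print(qc_rates(['A', 'NA', 'C', 'T'], ['A', 'G', 'C', 'C']))
# (0.25, 0.333..., 1, 4, 1, 3)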