Пример #1
0
	def orderListTypeAnalysisMethodID(self, list_type_id_ls, analysis_method_id_ls):
		"""
		Lay out every (list_type_id, analysis_method_id) combination in a fixed order.

		list_type_id_ls is walked in order. A regular list type contributes one entry
		per id in analysis_method_id_ls. A list_type_id of -1 is a separator: the k-th
		separator contributes the single entry (-k, -1) with an empty label.

		Returns a PassingData object carrying three attributes:
			list_type_id_analysis_method_id_ls: the ordered list of tuples
			list_type_id_analysis_method_id2index: tuple -> its position in that list
			list_type_analysis_method_label_ls: one label per tuple, formatted as
				<analysis method short_name>_<list type short_name>_<list_type_id>
				('' for separators)

		2008-08-29
			deal with separator (list_type_id=-1) in list_type_id_ls
		"""
		sys.stderr.write("Orderinig list type id and analysis_method id ... ")
		ordered_pairs = []
		pair2position = {}
		labels = []
		separators_seen = 0
		for list_type_id in list_type_id_ls:
			if list_type_id!=-1:
				# regular list type: pair it with every analysis method
				type_short_name = GeneListType.get(list_type_id).short_name
				for analysis_method_id in analysis_method_id_ls:
					method_short_name = AnalysisMethod.get(analysis_method_id).short_name
					pair = (list_type_id, analysis_method_id)
					pair2position[pair] = len(ordered_pairs)
					ordered_pairs.append(pair)
					labels.append('%s_%s_%s'%(method_short_name, type_short_name, list_type_id))
			else:
				# separator: negative running number keeps each separator's key unique
				separators_seen += 1
				pair = (-separators_seen, -1)
				pair2position[pair] = len(ordered_pairs)
				ordered_pairs.append(pair)
				labels.append('')
		result = PassingData()
		result.list_type_id_analysis_method_id_ls = ordered_pairs
		result.list_type_id_analysis_method_id2index = pair2position
		result.list_type_analysis_method_label_ls = labels
		sys.stderr.write("Done.\n")
		return result
    def orderListTypeAnalysisMethodID(self, list_type_id_ls,
                                      analysis_method_id_ls):
        """
        Assign a position and a label to each (list type, analysis method) combination.

        Iterates list_type_id_ls in the given order and, for every list type that is
        not a separator, iterates analysis_method_id_ls. Each list_type_id equal to -1
        is a separator and is recorded as (-k, -1) for the k-th separator, with ''
        as its label.

        The returned PassingData object has:
            list_type_id_analysis_method_id_ls: tuples in visiting order
            list_type_id_analysis_method_id2index: tuple -> index in that list
            list_type_analysis_method_label_ls: labels built from the analysis method
                short_name, the list type short_name and the list_type_id

        2008-08-29
            deal with separator (list_type_id=-1) in list_type_id_ls
        """
        sys.stderr.write("Orderinig list type id and analysis_method id ... ")
        combos = []
        combo2index = {}
        combo_labels = []

        def register(combo, label):
            # the index of a combo is the list length just before it is appended
            combo2index[combo] = len(combos)
            combos.append(combo)
            combo_labels.append(label)

        separator_rank = 0
        for list_type_id in list_type_id_ls:
            if list_type_id == -1:  #separator
                separator_rank += 1
                register((-separator_rank, -1), '')
                continue
            list_type_label = GeneListType.get(list_type_id).short_name
            for analysis_method_id in analysis_method_id_ls:
                method_label = AnalysisMethod.get(analysis_method_id).short_name
                register((list_type_id, analysis_method_id),
                         '%s_%s_%s' % (method_label, list_type_label,
                                       list_type_id))

        packed = PassingData()
        packed.list_type_id_analysis_method_id_ls = combos
        packed.list_type_id_analysis_method_id2index = combo2index
        packed.list_type_analysis_method_label_ls = combo_labels
        sys.stderr.write("Done.\n")
        return packed
Пример #3
0
	def putGeneListIntoDb(self, input_fname, list_type_id, list_type_name, gene_symbol2gene_id_set, db, skip_1st_line=False):
		"""
		Read gene names from the first column of input_fname, resolve each to a gene id
		and save them as GeneList rows attached to one GeneListType.

		Arguments:
			input_fname: path of the text file to read; the delimiter is detected by figureOutDelimiter().
			list_type_id: id of an existing GeneListType; only used when list_type_name is empty.
			list_type_name: short_name of the GeneListType. If given, it is looked up in db and
				created when missing; list_type_id is then ignored.
			gene_symbol2gene_id_set: lookup structure handed to getGeneIDSetGivenAccVer().
			db: database object; only db.session is used here.
			skip_1st_line: if true, the first line of the file (a header) is discarded.

		Returns None. Progress and failures are reported on stderr. A name is accepted only
		when it resolves to exactly one gene id; each gene id is stored once, under the
		first name that resolved to it.

		Behaviour notes:
			- blank lines (including whitespace-only and '\\r\\n' lines) are skipped, so they can
				no longer yield an empty split result when the delimiter is None.
			- an empty file combined with skip_1st_line no longer raises StopIteration.
			- the input file is closed even if processing fails.

		2009-10-18
			If first column (gene symbol) is full of numbers, it's taken as a legitimate Gene ID.
		2009-2-4
			use getGeneIDSetGivenAccVer() from pymodule.utils to find gene_id_set
		2008-01-08
			add option skip_1st_line
			stop using csv.reader, use raw file handler instead
			figureOutDelimiter() is modified not to use csv.Sniffer() by default. it'll return delimiter None if the file is single-column.
		2008-12-11
			more filtering:
				1. strip the original_name
				2. pick alphanumeric characters out of original_name
			if GeneListType is already in db. check if GeneList has this gene already or not.
		2008-11-20
			use figureOutDelimiter() to get delimiter automatically
		2008-07-15
			if the list_type_name is given, forget about list_type_id. program will first search db for the given list_type_name, if search failed, create a new entry.
		2008-07-15
			use gene_id2original_name to avoid redundancy in gene list
		"""
		import sys
		session = db.session
		delimiter = figureOutDelimiter(input_fname)
		counter = 0
		success_counter = 0
		gene_id2original_name = {}	#to avoid redundancy in gene list
		inf = open(input_fname)	#2008-11-20
		try:
			if skip_1st_line:
				# readline() returns '' on an empty file instead of raising StopIteration
				inf.readline()
			for line in inf:
				if not line.strip():	#skip empty and whitespace-only lines
					continue
				row = line.split(delimiter)
				original_name = row[0].strip()	#2008-12-11 remove spaces/tabs in the beginning/end
				number_match = self.all_number_p.search(original_name)
				if number_match:	# 2009-10-18 original_name is full of numbers. a legitimate Gene ID.
					gene_id_set = set([int(number_match.group(0))])
				else:
					gene_id_set = getGeneIDSetGivenAccVer(original_name, gene_symbol2gene_id_set)

				if gene_id_set is None:
					sys.stderr.write("Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n"%(original_name))
				elif len(gene_id_set)==1:
					gene_id = list(gene_id_set)[0]
					if gene_id not in gene_id2original_name:
						gene_id2original_name[gene_id] = original_name
					success_counter += 1
				elif len(gene_id_set)>1:
					sys.stderr.write("Too many gene_ids for %s: %s.\n"%(original_name, gene_id_set))
				else:	# empty set
					sys.stderr.write("Linking to gene id failed for %s. gene_id_set is empty.\n"%(original_name))
				counter += 1
		finally:
			inf.close()

		if list_type_name:	#if the short name is given, forget about list_type_id
			glt = GeneListType.query.filter_by(short_name=list_type_name).first()	#try search the db first.
			if not glt:
				glt = GeneListType(short_name=list_type_name)
				session.save(glt)
				session.flush()
		else:	#use the list_type_id to get it
			glt = GeneListType.get(list_type_id)
		glt.original_filename = input_fname	#save the filename
		session.save_or_update(glt)

		for gene_id, original_name in gene_id2original_name.items():
			if glt.id:	#2008-12-11 GeneListType is already in db. check if GeneList has this gene already or not.
				rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(list_type_id=glt.id)
				if rows.count()>0:
					sys.stderr.write("Gene: %s (%s) already with list type %s.\n"%(gene_id, original_name, glt.short_name))
					continue
			gl = GeneList(gene_id=gene_id, list_type=glt, original_name=original_name)
			session.save(gl)
		sys.stderr.write("%s/%s linked successfully.\n"%(success_counter, counter))
Пример #4
0
	def generate_params(cls, param_obj, min_no_of_genes=10):
		"""
		Build the (results_method_id, list_type_id) combinations to be processed.

		Arguments:
			param_obj: option holder. Attributes read here:
				results_type: 1 or 3 query ResultsMethod, 2 queries ResultsByGene.
				call_method_id: read for supported results types; 0 means no restriction.
				analysis_method_id (results_type 2 only), analysis_method_id_ls,
				phenotype_method_id_ls: optional filters, ignored when missing or empty.
				list_type_id_ls: optional; candidate gene list types to use.
				no_check_gene_list: optional; if true and list_type_id_ls is given, the list
					is used as is, without the db check on gene list size.
			min_no_of_genes: a gene list type is kept only if its gene_list has at least this many entries.

		Returns a list of (results_method_id, list_type_id) tuples: every selected result id
		paired with every selected list type id. Returns [] when results_type is not supported.

		Change history:
		2009-1-11
			handle param_obj.no_check_gene_list
				if it's there and true, list_type_id_ls = param_obj.list_type_id_ls. no db check to make sure each gene list has enough genes in it.
				added to let PickCandidateGenesIntoResultsGene.py deal with list_type_id=0 (all genes)
		2008-11-13
			add "results_type==3" same as "results_type==1"

			if results_type is not supported, report and return []
		2008-11-08
			become a classmethod
		2008-10-26
			restrict results via param_obj.analysis_method_id_ls  and param_obj.phenotype_method_id_ls
		2008-10-10
			depends on results_type, decide which ( ResultsMethod or ResultsByGene) to get data
			also which (CandidateGeneRankSumTestResult or CandidateGeneRankSumTestResultMethod) to store results
		2008-09-26
			deal with the situation that list_type_id_ls is given.
		2008-09-16
			modify it to get result ids from ResultsByGene
		2008-09-10
			add results_method filtering by analysis_method_id
		2008-08-19
			add call_method_id
		2008-08-15
			stop filtering if CandidateGeneRankSumTestResult has (results_method_id, list_type_id) combo
		2008-07-24
			only association results (results_method_type_id=1)
			only candidate gene lists with >min_no_of_genes genes
			skip ones that been done
		"""
		sys.stderr.write("Generating parameters ...")
		block_size = 5000	# number of rows fetched per offset/limit page
		# bound locally so that the "not supported" message below can refer to it
		results_type = param_obj.results_type
		# optional filters: a missing attribute is treated like an empty one in every branch
		analysis_method_id_ls = getattr(param_obj, 'analysis_method_id_ls', None)
		phenotype_method_id_ls = getattr(param_obj, 'phenotype_method_id_ls', None)
		if results_type==1 or results_type==3:	#1 and 3 are same ResultsMethod class
			query = ResultsMethod.query
			if param_obj.call_method_id!=0:
				query = query.filter_by(call_method_id=param_obj.call_method_id)
			if analysis_method_id_ls:
				query = query.filter(ResultsMethod.analysis_method_id.in_(analysis_method_id_ls))
			if phenotype_method_id_ls:
				query = query.filter(ResultsMethod.phenotype_method_id.in_(phenotype_method_id_ls))
		elif results_type==2:
			query = ResultsByGene.query
			if param_obj.call_method_id!=0:
				query = query.filter(ResultsByGene.results_method.has(call_method_id=param_obj.call_method_id))
			analysis_method_id = getattr(param_obj, 'analysis_method_id', None)
			if analysis_method_id is not None and analysis_method_id!=0:
				query = query.filter(ResultsByGene.results_method.has(analysis_method_id=analysis_method_id))
			if analysis_method_id_ls:
				query = query.filter(ResultsByGene.results_method.has(ResultsMethod.analysis_method_id.in_(analysis_method_id_ls)))
			if phenotype_method_id_ls:
				query = query.filter(ResultsByGene.results_method.has(ResultsMethod.phenotype_method_id.in_(phenotype_method_id_ls)))
		else:
			sys.stderr.write("results_type %s not supported.\n"%results_type)
			return []

		# page through the matching results, block_size rows at a time
		offset = 0
		results_method_id_ls = []
		rows = query.offset(offset).limit(block_size)
		while rows.count()!=0:
			for row in rows:
				results_method_id_ls.append(row.id)
				offset += 1
			rows = query.offset(offset).limit(block_size)

		sys.stderr.write("%s results. "%(len(results_method_id_ls)))

		if getattr(param_obj, 'no_check_gene_list', None) and getattr(param_obj, 'list_type_id_ls', None):
			list_type_id_ls = param_obj.list_type_id_ls
		else:
			list_type_id_ls = []
			if getattr(param_obj, 'list_type_id_ls', None):	#if list_type_id_ls is given, check whether each one exists in db and has minimum number of genes.
				for list_type_id in param_obj.list_type_id_ls:
					glt = GeneListType.get(list_type_id)
					if glt and len(glt.gene_list)>=min_no_of_genes:
						list_type_id_ls.append(list_type_id)
			else:
				# no list given: scan all gene list types, page by page
				offset = 0
				rows = GeneListType.query.offset(offset).limit(block_size)
				while rows.count()!=0:
					for row in rows:
						if len(row.gene_list)>=min_no_of_genes:
							list_type_id_ls.append(row.id)
						offset += 1
					rows = GeneListType.query.offset(offset).limit(block_size)
		sys.stderr.write("%s candidate gene lists. "%(len(list_type_id_ls)))

		# Combinations are no longer checked against already-stored test results
		# (see the 2008-08-15 note), so every pair is emitted.
		params_ls = []
		for results_method_id in results_method_id_ls:
			for list_type_id in list_type_id_ls:
				params_ls.append((results_method_id, list_type_id))
		sys.stderr.write(" %s params generated.\n"%(len(params_ls)))
		return params_ls
Пример #5
0
	def generate_params(cls, param_obj, min_no_of_genes=10):
		"""
		Enumerate the (results_method_id, list_type_id) pairs that should be run.

		Arguments:
			param_obj: option holder. Attributes read here:
				results_type: 1 or 3 select from ResultsMethod, 2 selects from ResultsByGene.
				call_method_id: read for supported results types; 0 disables the filter.
				analysis_method_id (results_type 2 only), analysis_method_id_ls,
				phenotype_method_id_ls: optional filters, skipped when missing or empty.
				list_type_id_ls: optional; gene list types to consider.
				no_check_gene_list: optional; if true and list_type_id_ls is given, that list
					is taken unchanged, without checking gene list sizes in db.
			min_no_of_genes: minimum length of a gene list type's gene_list for it to be kept.

		Returns a list of (results_method_id, list_type_id) tuples, the cross product of the
		selected result ids and list type ids. Returns [] for an unsupported results_type.

		Change history:
		2009-1-11
			handle param_obj.no_check_gene_list
				if it's there and true, list_type_id_ls = param_obj.list_type_id_ls. no db check to make sure each gene list has enough genes in it.
				added to let PickCandidateGenesIntoResultsGene.py deal with list_type_id=0 (all genes)
		2008-11-13
			add "results_type==3" same as "results_type==1"

			if results_type is not supported, report and return []
		2008-11-08
			become a classmethod
		2008-10-26
			restrict results via param_obj.analysis_method_id_ls  and param_obj.phenotype_method_id_ls
		2008-10-10
			depends on results_type, decide which ( ResultsMethod or ResultsByGene) to get data
			also which (CandidateGeneRankSumTestResult or CandidateGeneRankSumTestResultMethod) to store results
		2008-09-26
			deal with the situation that list_type_id_ls is given.
		2008-09-16
			modify it to get result ids from ResultsByGene
		2008-09-10
			add results_method filtering by analysis_method_id
		2008-08-19
			add call_method_id
		2008-08-15
			stop filtering if CandidateGeneRankSumTestResult has (results_method_id, list_type_id) combo
		2008-07-24
			only association results (results_method_type_id=1)
			only candidate gene lists with >min_no_of_genes genes
			skip ones that been done
		"""
		sys.stderr.write("Generating parameters ...")
		block_size = 5000	# page size for the offset/limit scans below
		# optional filters are read once; an absent attribute behaves like an empty list
		analysis_method_id_ls = getattr(param_obj, 'analysis_method_id_ls', None)
		phenotype_method_id_ls = getattr(param_obj, 'phenotype_method_id_ls', None)
		if param_obj.results_type==1 or param_obj.results_type==3:	#1 and 3 are same ResultsMethod class
			query = ResultsMethod.query
			if param_obj.call_method_id!=0:
				query = query.filter_by(call_method_id=param_obj.call_method_id)
			if analysis_method_id_ls:
				query = query.filter(ResultsMethod.analysis_method_id.in_(analysis_method_id_ls))
			if phenotype_method_id_ls:
				query = query.filter(ResultsMethod.phenotype_method_id.in_(phenotype_method_id_ls))
		elif param_obj.results_type==2:
			query = ResultsByGene.query
			if param_obj.call_method_id!=0:
				query = query.filter(ResultsByGene.results_method.has(call_method_id=param_obj.call_method_id))
			analysis_method_id = getattr(param_obj, 'analysis_method_id', None)
			if analysis_method_id is not None and analysis_method_id!=0:
				query = query.filter(ResultsByGene.results_method.has(analysis_method_id=analysis_method_id))
			if analysis_method_id_ls:
				query = query.filter(ResultsByGene.results_method.has(ResultsMethod.analysis_method_id.in_(analysis_method_id_ls)))
			if phenotype_method_id_ls:
				query = query.filter(ResultsByGene.results_method.has(ResultsMethod.phenotype_method_id.in_(phenotype_method_id_ls)))
		else:
			# the value lives on param_obj; there is no bare results_type name in this scope
			sys.stderr.write("results_type %s not supported.\n"%param_obj.results_type)
			return []

		# collect the ids of all matching results, one page at a time
		no_of_rows_seen = 0
		results_method_id_ls = []
		rows = query.offset(no_of_rows_seen).limit(block_size)
		while rows.count()!=0:
			for row in rows:
				results_method_id_ls.append(row.id)
				no_of_rows_seen += 1
			rows = query.offset(no_of_rows_seen).limit(block_size)

		sys.stderr.write("%s results. "%(len(results_method_id_ls)))

		given_list_type_id_ls = getattr(param_obj, 'list_type_id_ls', None)
		if getattr(param_obj, 'no_check_gene_list', None) and given_list_type_id_ls:
			list_type_id_ls = given_list_type_id_ls
		else:
			list_type_id_ls = []
			if given_list_type_id_ls:	#if list_type_id_ls is given, check whether each one exists in db and has minimum number of genes.
				for list_type_id in given_list_type_id_ls:
					glt = GeneListType.get(list_type_id)
					if glt and len(glt.gene_list)>=min_no_of_genes:
						list_type_id_ls.append(list_type_id)
			else:
				# nothing given: take every gene list type that is large enough
				no_of_rows_seen = 0
				rows = GeneListType.query.offset(no_of_rows_seen).limit(block_size)
				while rows.count()!=0:
					for row in rows:
						if len(row.gene_list)>=min_no_of_genes:
							list_type_id_ls.append(row.id)
						no_of_rows_seen += 1
					rows = GeneListType.query.offset(no_of_rows_seen).limit(block_size)
		sys.stderr.write("%s candidate gene lists. "%(len(list_type_id_ls)))

		# Filtering out combinations that already have stored results was switched off
		# (2008-08-15 note), so the full cross product is returned.
		params_ls = [(results_method_id, list_type_id) for results_method_id in results_method_id_ls for list_type_id in list_type_id_ls]
		sys.stderr.write(" %s params generated.\n"%(len(params_ls)))
		return params_ls
Пример #6
0
    def putGeneListIntoDb(self,
                          input_fname,
                          list_type_id,
                          list_type_name,
                          gene_symbol2gene_id_set,
                          db,
                          skip_1st_line=False):
        """
        Load a gene list file into the database as GeneList rows of one GeneListType.

        The first column of every non-blank line of input_fname is taken as a gene
        name. A name that matches self.all_number_p is used directly as a gene id;
        any other name is resolved through getGeneIDSetGivenAccVer(). Only names that
        resolve to exactly one gene id are kept, and each gene id is stored once,
        under the first name that resolved to it.

        Arguments:
            input_fname: path of the input file; its delimiter comes from figureOutDelimiter().
            list_type_id: id of an existing GeneListType; used only if list_type_name is empty.
            list_type_name: short_name of the GeneListType; looked up in db, created if absent.
            gene_symbol2gene_id_set: lookup structure passed to getGeneIDSetGivenAccVer().
            db: database object; only db.session is used here.
            skip_1st_line: if true, discard the first line of the file.

        Returns None; progress and failures are written to stderr.

        Behaviour notes:
            - whitespace-only and '\\r\\n' lines are skipped as blank, so they can no longer
              produce an empty split result when the delimiter is None.
            - skip_1st_line on an empty file no longer raises StopIteration.
            - the input file is closed even when an error interrupts processing.

        2009-10-18
            If first column (gene symbol) is full of numbers, it's taken as a legitimate Gene ID.
        2009-2-4
            use getGeneIDSetGivenAccVer() from pymodule.utils to find gene_id_set
        2008-01-08
            add option skip_1st_line
            stop using csv.reader, use raw file handler instead
            figureOutDelimiter() is modified not to use csv.Sniffer() by default. it'll return delimiter None if the file is single-column.
        2008-12-11
            more filtering:
                1. strip the original_name
                2. pick alphanumeric characters out of original_name
            if GeneListType is already in db. check if GeneList has this gene already or not.
        2008-11-20
            use figureOutDelimiter() to get delimiter automatically
        2008-07-15
            if the list_type_name is given, forget about list_type_id. program will first search db for the given list_type_name, if search failed, create a new entry.
        2008-07-15
            use gene_id2original_name to avoid redundancy in gene list
        """
        import sys
        session = db.session
        delimiter = figureOutDelimiter(input_fname)
        counter = 0
        success_counter = 0
        gene_id2original_name = {}  #to avoid redundancy in gene list
        inf = open(input_fname)  #2008-11-20
        try:
            if skip_1st_line:
                # readline() yields '' at end of file rather than raising
                inf.readline()
            for line in inf:
                if not line.strip():  #skip empty and whitespace-only lines
                    continue
                row = line.split(delimiter)
                #2008-12-11 remove spaces/tabs in the beginning/end
                original_name = row[0].strip()
                number_match = self.all_number_p.search(original_name)
                if number_match:
                    # 2009-10-18 original_name is full of numbers. a legitimate Gene ID.
                    gene_id_set = set([int(number_match.group(0))])
                else:
                    gene_id_set = getGeneIDSetGivenAccVer(
                        original_name, gene_symbol2gene_id_set)

                if gene_id_set is None:
                    sys.stderr.write(
                        "Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n"
                        % (original_name))
                elif len(gene_id_set) == 1:
                    gene_id = list(gene_id_set)[0]
                    if gene_id not in gene_id2original_name:
                        gene_id2original_name[gene_id] = original_name
                    success_counter += 1
                elif len(gene_id_set) > 1:
                    sys.stderr.write("Too many gene_ids for %s: %s.\n" %
                                     (original_name, gene_id_set))
                else:  # empty set
                    sys.stderr.write(
                        "Linking to gene id failed for %s. gene_id_set is empty.\n"
                        % (original_name))
                counter += 1
        finally:
            inf.close()

        if list_type_name:  #if the short name is given, forget about list_type_id
            glt = GeneListType.query.filter_by(
                short_name=list_type_name).first()  #try search the db first.
            if not glt:
                glt = GeneListType(short_name=list_type_name)
                session.save(glt)
                session.flush()
        else:  #use the list_type_id to get it
            glt = GeneListType.get(list_type_id)
        glt.original_filename = input_fname  #save the filename
        session.save_or_update(glt)

        for gene_id, original_name in gene_id2original_name.items():
            if glt.id:
                #2008-12-11 GeneListType is already in db. check if GeneList has this gene already or not.
                rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(
                    list_type_id=glt.id)
                if rows.count() > 0:
                    sys.stderr.write(
                        "Gene: %s (%s) already with list type %s.\n" %
                        (gene_id, original_name, glt.short_name))
                    continue
            gl = GeneList(gene_id=gene_id,
                          list_type=glt,
                          original_name=original_name)
            session.save(gl)
        sys.stderr.write("%s/%s linked successfully.\n" %
                         (success_counter, counter))