def orderListTypeAnalysisMethodID(self, list_type_id_ls, analysis_method_id_ls):
    """
    Enumerate (list_type_id, analysis_method_id) pairs in a fixed order.

    list_type_id_ls is walked in order; every real list type is paired with
    every id of analysis_method_id_ls (also in order). A list_type_id of -1
    is a separator: it contributes a single placeholder entry with an empty
    label, keyed (-k, -1) where k is the running number of separators, so
    each separator gets its own distinct key.

    Arguments:
        list_type_id_ls: GeneListType ids; -1 marks a separator.
        analysis_method_id_ls: AnalysisMethod ids.

    Returns a PassingData object carrying:
        list_type_id_analysis_method_id_ls: the ordered list of tuples.
        list_type_id_analysis_method_id2index: tuple -> its position in the list.
        list_type_analysis_method_label_ls: one label per position, formatted
            as analysis-method short_name, list-type short_name and
            list_type_id joined by '_' ('' for a separator).

    2008-08-29
        deal with separator (list_type_id=-1) in list_type_id_ls
    """
    sys.stderr.write("Ordering list type id and analysis_method id ... ")
    list_type_id_analysis_method_id_ls = []
    list_type_id_analysis_method_id2index = {}
    list_type_analysis_method_label_ls = []
    # The short name depends only on the analysis method id, so look it up
    # once (lazily, on first use) instead of once per list type.
    analysis_method_id2short_name = {}
    no_of_separators = 0
    for list_type_id in list_type_id_ls:
        if list_type_id == -1:  # separator
            no_of_separators += 1
            tup = (-no_of_separators, -1)
            list_type_id_analysis_method_id2index[tup] = len(list_type_id_analysis_method_id_ls)
            list_type_id_analysis_method_id_ls.append(tup)
            list_type_analysis_method_label_ls.append('')
            continue
        list_type_short_name = GeneListType.get(list_type_id).short_name
        for analysis_method_id in analysis_method_id_ls:
            if analysis_method_id not in analysis_method_id2short_name:
                analysis_method_id2short_name[analysis_method_id] = \
                    AnalysisMethod.get(analysis_method_id).short_name
            analysis_method_short_name = analysis_method_id2short_name[analysis_method_id]
            tup = (list_type_id, analysis_method_id)
            list_type_id_analysis_method_id2index[tup] = len(list_type_id_analysis_method_id_ls)
            list_type_id_analysis_method_id_ls.append(tup)
            list_type_analysis_method_label_ls.append(
                '%s_%s_%s' % (analysis_method_short_name, list_type_short_name, list_type_id))
    return_data = PassingData()
    return_data.list_type_id_analysis_method_id_ls = list_type_id_analysis_method_id_ls
    return_data.list_type_id_analysis_method_id2index = list_type_id_analysis_method_id2index
    return_data.list_type_analysis_method_label_ls = list_type_analysis_method_label_ls
    sys.stderr.write("Done.\n")
    return return_data
def orderListTypeAnalysisMethodID(self, list_type_id_ls, analysis_method_id_ls):
    """
    Build the ordered layout of list-type / analysis-method combinations.

    For each entry of list_type_id_ls, in order:
      - -1 is a separator and yields one placeholder tuple (-k, -1) with an
        empty label, k being the count of separators seen so far (this keeps
        the dictionary keys of different separators apart);
      - any other id is combined with each id of analysis_method_id_ls and
        labelled with the analysis-method short_name, the list-type
        short_name and the list_type_id, joined by '_'.

    Arguments:
        list_type_id_ls: GeneListType ids, possibly interleaved with -1.
        analysis_method_id_ls: AnalysisMethod ids.

    Returns a PassingData object with attributes
    list_type_id_analysis_method_id_ls (ordered tuples),
    list_type_id_analysis_method_id2index (tuple -> position) and
    list_type_analysis_method_label_ls (label per position).

    2008-08-29
        deal with separator (list_type_id=-1) in list_type_id_ls
    """
    sys.stderr.write("Ordering list type id and analysis_method id ... ")
    ordered_tuples = []
    tuple2index = {}
    labels = []

    def _register(key, label):
        # Record key at the next free position together with its label.
        tuple2index[key] = len(ordered_tuples)
        ordered_tuples.append(key)
        labels.append(label)

    # Short names of analysis methods, fetched on first use only; the
    # original code fetched them again for every list type.
    method_short_names = {}
    separator_count = 0
    for list_type_id in list_type_id_ls:
        if list_type_id == -1:
            separator_count += 1
            _register((-separator_count, -1), '')
            continue
        list_type_short_name = GeneListType.get(list_type_id).short_name
        for analysis_method_id in analysis_method_id_ls:
            try:
                method_short_name = method_short_names[analysis_method_id]
            except KeyError:
                method_short_name = AnalysisMethod.get(analysis_method_id).short_name
                method_short_names[analysis_method_id] = method_short_name
            _register(
                (list_type_id, analysis_method_id),
                '%s_%s_%s' % (method_short_name, list_type_short_name, list_type_id))

    return_data = PassingData()
    return_data.list_type_id_analysis_method_id_ls = ordered_tuples
    return_data.list_type_id_analysis_method_id2index = tuple2index
    return_data.list_type_analysis_method_label_ls = labels
    sys.stderr.write("Done.\n")
    return return_data
def putGeneListIntoDb(self, input_fname, list_type_id, list_type_name, gene_symbol2gene_id_set, db, skip_1st_line=False):
    """
    Read a gene list file and attach its genes to one GeneListType.

    The first column of every non-blank line is the gene name. A name that
    matches self.all_number_p is converted with int() and used directly as
    the gene id (per the 2009-10-18 note: an all-digit name); any other name
    is resolved through getGeneIDSetGivenAccVer(). Only names that resolve
    to exactly one gene id are kept, and each gene id is stored once.

    Arguments:
        input_fname: path of the gene list file.
        list_type_id: id of the GeneListType to use when list_type_name is empty.
        list_type_name: short_name of the GeneListType; when given it takes
            precedence over list_type_id and the type is created if missing.
        gene_symbol2gene_id_set: lookup passed to getGeneIDSetGivenAccVer().
        db: database object; only db.session is used.
        skip_1st_line: discard the first line of the file (e.g. a header).

    Progress and failures are reported on stderr; nothing is returned.

    2009-10-18
        If first column (gene symbol) is full of numbers, it's taken as a legitimate Gene ID.
    2009-2-4
        use getGeneIDSetGivenAccVer() from pymodule.utils to find gene_id_set
    2008-01-08
        add option skip_1st_line
        stop using csv.reader, use raw file handler instead
        figureOutDelimiter() is modified not to use csv.Sniffer() by default.
        it'll return delimiter None if the file is single-column.
    2008-12-11
        more filtering:
            1. strip the original_name
            2. pick alphanumeric characters out of original_name
        if GeneListType is already in db. check if GeneList has this gene already or not.
    2008-11-20
        use figureOutDelimiter() to get delimiter automatically
    2008-07-15
        if the list_type_name is given, forget about list_type_id.
        program will first search db for the given list_type_name, if search failed, create a new entry.
    2008-07-15
        use gene_id2original_name to avoid redundancy in gene list
    """
    import sys

    session = db.session
    delimiter = figureOutDelimiter(input_fname)
    counter = 0
    success_counter = 0
    gene_id2original_name = {}  # to avoid redundancy in gene list
    inf = open(input_fname)
    try:
        if skip_1st_line:
            # readline() returns '' on an empty file, whereas next() would
            # raise StopIteration.
            inf.readline()
        for line in inf:
            # Skip blank lines, including '\r\n' and whitespace-only ones;
            # with delimiter None those would split into an empty list and
            # row[0] would fail.
            if not line.strip():
                continue
            row = line.split(delimiter)
            original_name = row[0].strip()  # 2008-12-11 remove spaces/tabs in the beginning/end
            all_number_p_search_result = self.all_number_p.search(original_name)
            if all_number_p_search_result:
                # 2009-10-18 original_name is full of numbers. a legitimate Gene ID.
                gene_id_set = set([int(all_number_p_search_result.group(0))])
            else:
                gene_id_set = getGeneIDSetGivenAccVer(original_name, gene_symbol2gene_id_set)

            if gene_id_set is None:
                sys.stderr.write("Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n" % (original_name))
            elif len(gene_id_set) == 1:
                gene_id = list(gene_id_set)[0]
                if gene_id not in gene_id2original_name:
                    gene_id2original_name[gene_id] = original_name
                # NOTE(review): counted per successfully linked line (a
                # repeated gene still counts); the flattened source does not
                # show whether this sat inside the check above - confirm.
                success_counter += 1
            elif len(gene_id_set) > 1:
                sys.stderr.write("Too many gene_ids for %s: %s.\n" % (original_name, gene_id_set))
            else:
                sys.stderr.write("Linking to gene id failed for %s. gene_id_set is empty.\n" % (original_name))
            counter += 1
    finally:
        inf.close()

    if list_type_name:  # if the short name is given, forget about list_type_id
        glt = GeneListType.query.filter_by(short_name=list_type_name).first()  # try search the db first.
        if not glt:
            glt = GeneListType(short_name=list_type_name)
            session.save(glt)
            session.flush()
    else:  # use the list_type_id to get it
        # NOTE(review): presumably get() yields None for an unknown id, in
        # which case the assignment below fails - confirm against callers.
        glt = GeneListType.get(list_type_id)
    glt.original_filename = input_fname  # save the filename
    session.save_or_update(glt)

    for gene_id, original_name in gene_id2original_name.items():
        if glt.id:
            # 2008-12-11 GeneListType is already in db. check if GeneList has this gene already or not.
            rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(list_type_id=glt.id)
            if rows.count() > 0:
                sys.stderr.write("Gene: %s (%s) already with list type %s.\n" % (gene_id, original_name, glt.short_name))
                continue
        gl = GeneList(gene_id=gene_id, list_type=glt, original_name=original_name)
        session.save(gl)
    sys.stderr.write("%s/%s linked successfully.\n" % (success_counter, counter))
def generate_params(cls, param_obj, min_no_of_genes=10):
    """
    Build the list of (result id, list_type_id) combinations to process.

    Result ids come from ResultsMethod (results_type 1 or 3) or from
    ResultsByGene (results_type 2), optionally restricted by
    param_obj.call_method_id (0 means no restriction),
    param_obj.analysis_method_id (results_type 2 only),
    param_obj.analysis_method_id_ls and param_obj.phenotype_method_id_ls.
    The three optional attributes may be absent from param_obj.

    Gene list types are chosen as follows:
      - param_obj.no_check_gene_list true and param_obj.list_type_id_ls
        given: the given ids are used as they are, without any db check;
      - param_obj.list_type_id_ls given: only ids that exist in db and have
        at least min_no_of_genes genes are kept;
      - otherwise: every GeneListType with at least min_no_of_genes genes.

    Arguments:
        cls: unused (the 2008-11-08 note says this became a classmethod).
        param_obj: object carrying results_type, call_method_id and the
            optional attributes named above.
        min_no_of_genes: minimum size of a gene list for it to qualify.

    Returns a list of (result id, list_type_id) tuples, or [] when
    param_obj.results_type is not supported.

    2009-1-11
        handle param_obj.no_check_gene_list
            if it's there and true, list_type_id_ls = param_obj.list_type_id_ls. no db check to make sure each gene list has enough genes in it.
            added to let PickCandidateGenesIntoResultsGene.py deal with list_type_id=0 (all genes)
    2008-11-13
        add "results_type==3" same as "results_type==1"
        if results_type is not supported, report and return []
    2008-11-08
        become a classmethod
    2008-10-26
        restrict results via param_obj.analysis_method_id_ls and param_obj.phenotype_method_id_ls
    2008-10-10
        depends on results_type, decide which ( ResultsMethod or ResultsByGene) to get data
        also which (CandidateGeneRankSumTestResult or CandidateGeneRankSumTestResultMethod) to store results
    2008-09-26
        deal with the situation that list_type_id_ls is given.
    2008-09-16
        modify it to get result ids from ResultsByGene
    2008-09-10
        add results_method filtering by analysis_method_id
    2008-08-19
        add call_method_id
    2008-08-15
        stop filtering if CandidateGeneRankSumTestResult has (results_method_id, list_type_id) combo
    2008-07-24
        only association results (results_method_type_id=1)
        only candidate gene lists with >min_no_of_genes genes
        skip ones that been done
    """
    sys.stderr.write("Generating parameters ...")
    block_size = 5000

    def _iter_in_blocks(query):
        # Yield the rows of query, fetching at most block_size rows per
        # round trip and stopping at the first empty block.
        offset = 0
        rows = query.offset(offset).limit(block_size)
        while rows.count() != 0:
            for row in rows:
                yield row
                offset += 1
            rows = query.offset(offset).limit(block_size)

    results_type = param_obj.results_type
    # These restrictions are optional attributes of param_obj.
    analysis_method_id_ls = getattr(param_obj, 'analysis_method_id_ls', None)
    phenotype_method_id_ls = getattr(param_obj, 'phenotype_method_id_ls', None)

    if results_type == 1 or results_type == 3:  # 1 and 3 are same ResultsMethod class
        query = ResultsMethod.query
        if param_obj.call_method_id != 0:
            query = query.filter_by(call_method_id=param_obj.call_method_id)
        if analysis_method_id_ls:
            query = query.filter(ResultsMethod.analysis_method_id.in_(analysis_method_id_ls))
        if phenotype_method_id_ls:
            query = query.filter(ResultsMethod.phenotype_method_id.in_(phenotype_method_id_ls))
    elif results_type == 2:
        query = ResultsByGene.query
        if param_obj.call_method_id != 0:
            query = query.filter(ResultsByGene.results_method.has(call_method_id=param_obj.call_method_id))
        analysis_method_id = getattr(param_obj, 'analysis_method_id', None)
        if analysis_method_id is not None and analysis_method_id != 0:
            query = query.filter(ResultsByGene.results_method.has(analysis_method_id=analysis_method_id))
        if analysis_method_id_ls:
            query = query.filter(ResultsByGene.results_method.has(
                ResultsMethod.analysis_method_id.in_(analysis_method_id_ls)))
        if phenotype_method_id_ls:
            query = query.filter(ResultsByGene.results_method.has(
                ResultsMethod.phenotype_method_id.in_(phenotype_method_id_ls)))
    else:
        sys.stderr.write("results_type %s not supported.\n" % (results_type,))
        return []

    results_method_id_ls = [row.id for row in _iter_in_blocks(query)]
    sys.stderr.write("%s results. " % (len(results_method_id_ls)))

    given_list_type_id_ls = getattr(param_obj, 'list_type_id_ls', None)
    if getattr(param_obj, 'no_check_gene_list', None) and given_list_type_id_ls:
        list_type_id_ls = given_list_type_id_ls
    elif given_list_type_id_ls:
        # if list_type_id_ls is given, check whether each one exists in db
        # and has minimum number of genes.
        list_type_id_ls = []
        for list_type_id in given_list_type_id_ls:
            glt = GeneListType.get(list_type_id)
            if glt and len(glt.gene_list) >= min_no_of_genes:
                list_type_id_ls.append(list_type_id)
    else:
        list_type_id_ls = [
            row.id for row in _iter_in_blocks(GeneListType.query)
            if len(row.gene_list) >= min_no_of_genes]
    sys.stderr.write("%s candidate gene lists. " % (len(list_type_id_ls)))

    # Combinations already present in CandidateGeneRankSumTestResult are no
    # longer filtered out (see the 2008-08-15 note), so every combination
    # is emitted.
    params_ls = []
    for results_method_id in results_method_id_ls:
        for list_type_id in list_type_id_ls:
            params_ls.append((results_method_id, list_type_id))
    sys.stderr.write(" %s params generated.\n" % (len(params_ls)))
    return params_ls
def putGeneListIntoDb(self, input_fname, list_type_id, list_type_name, gene_symbol2gene_id_set, db, skip_1st_line=False):
    """
    Load the genes named in input_fname into the db under one GeneListType.

    Parsing: the first column of each non-blank line is stripped and used as
    the gene name. If self.all_number_p matches it (an all-digit name
    according to the 2009-10-18 note) the matched text is converted with
    int() and used as the gene id; otherwise getGeneIDSetGivenAccVer()
    resolves it. Names resolving to no gene id or to several are reported
    on stderr and dropped. Each gene id is kept once, with the first name
    that produced it.

    Storing: the GeneListType is looked up by list_type_name (and created
    if it does not exist) or, when list_type_name is empty, fetched by
    list_type_id. Its original_filename is set to input_fname. A GeneList
    row is added for every gene not already linked to that list type.

    Arguments:
        input_fname: path of the gene list file.
        list_type_id: GeneListType id, used only when list_type_name is empty.
        list_type_name: GeneListType short_name; overrides list_type_id.
        gene_symbol2gene_id_set: lookup handed to getGeneIDSetGivenAccVer().
        db: database object; only db.session is used.
        skip_1st_line: drop the first line of the file before parsing.

    Returns nothing; a summary line goes to stderr.

    2009-10-18
        If first column (gene symbol) is full of numbers, it's taken as a legitimate Gene ID.
    2009-2-4
        use getGeneIDSetGivenAccVer() from pymodule.utils to find gene_id_set
    2008-01-08
        add option skip_1st_line
        stop using csv.reader, use raw file handler instead
        figureOutDelimiter() is modified not to use csv.Sniffer() by default.
        it'll return delimiter None if the file is single-column.
    2008-12-11
        more filtering:
            1. strip the original_name
            2. pick alphanumeric characters out of original_name
        if GeneListType is already in db. check if GeneList has this gene already or not.
    2008-11-20
        use figureOutDelimiter() to get delimiter automatically
    2008-07-15
        if the list_type_name is given, forget about list_type_id.
        program will first search db for the given list_type_name, if search failed, create a new entry.
    2008-07-15
        use gene_id2original_name to avoid redundancy in gene list
    """
    import sys

    write_err = sys.stderr.write
    session = db.session
    delimiter = figureOutDelimiter(input_fname)

    counter = 0
    success_counter = 0
    gene_id2original_name = {}  # to avoid redundancy in gene list

    inf = open(input_fname)
    try:
        if skip_1st_line:
            # Unlike next(), readline() does not raise on an empty file.
            inf.readline()
        for line in inf:
            if not line.strip():
                # Blank line ('\n', '\r\n' or whitespace only). Letting it
                # through would fail on row[0] when delimiter is None.
                continue
            counter += 1
            original_name = line.split(delimiter)[0].strip()  # 2008-12-11 strip spaces/tabs

            number_match = self.all_number_p.search(original_name)
            if number_match:
                # 2009-10-18 original_name is full of numbers. a legitimate Gene ID.
                gene_id_set = set([int(number_match.group(0))])
            else:
                gene_id_set = getGeneIDSetGivenAccVer(original_name, gene_symbol2gene_id_set)

            if gene_id_set is None:
                write_err("Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n" % (original_name))
                continue
            if len(gene_id_set) == 0:
                write_err("Linking to gene id failed for %s. gene_id_set is empty.\n" % (original_name))
                continue
            if len(gene_id_set) > 1:
                write_err("Too many gene_ids for %s: %s.\n" % (original_name, gene_id_set))
                continue

            # Exactly one gene id from here on.
            gene_id = list(gene_id_set)[0]
            if gene_id not in gene_id2original_name:
                gene_id2original_name[gene_id] = original_name
            # NOTE(review): every linked line is counted, repeats included;
            # the flattened source leaves it open whether the increment was
            # meant to sit inside the check above - confirm.
            success_counter += 1
    finally:
        inf.close()

    if list_type_name:  # if the short name is given, forget about list_type_id
        glt = GeneListType.query.filter_by(short_name=list_type_name).first()  # try search the db first.
        if not glt:
            glt = GeneListType(short_name=list_type_name)
            session.save(glt)
            session.flush()
    else:  # use the list_type_id to get it
        # NOTE(review): presumably None comes back for an unknown id and the
        # next statement then fails - confirm against callers.
        glt = GeneListType.get(list_type_id)
    glt.original_filename = input_fname  # save the filename
    session.save_or_update(glt)

    for gene_id in gene_id2original_name:
        original_name = gene_id2original_name[gene_id]
        # 2008-12-11 GeneListType is already in db. check if GeneList has this gene already or not.
        if glt.id and GeneList.query.filter_by(gene_id=gene_id).filter_by(list_type_id=glt.id).count() > 0:
            write_err("Gene: %s (%s) already with list type %s.\n" % (gene_id, original_name, glt.short_name))
            continue
        session.save(GeneList(gene_id=gene_id, list_type=glt, original_name=original_name))
    write_err("%s/%s linked successfully.\n" % (success_counter, counter))