def run(self):
    """
    Load two SNP matrices, hand them to TwoSNPData and write out the result
    of order2ndSNPDataRowsSameAs1stSNPData() to self.output_fname.

    self.row_matching_by_which_value controls the setup:
        0: the 2nd column of the 1st input file is ignored and no matching
           index is passed on (None).
        1 or 2: passed on to TwoSNPData shifted down by one (0 or 1).
        anything else: None is passed on.

    2008-06-02
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    match_mode = self.row_matching_by_which_value

    # only the 1st dataset optionally drops its 2nd column
    first_kwargs = dict(input_fname=self.input_fname1, turn_into_array=1)
    if match_mode == 0:
        first_kwargs['ignore_2nd_column'] = 1
    first_data = SNPData(**first_kwargs)
    second_data = SNPData(input_fname=self.input_fname2, turn_into_array=1)

    if match_mode in (1, 2):
        matching_index = match_mode - 1
    else:
        matching_index = None

    pair = TwoSNPData(SNPData1=first_data, SNPData2=second_data,
                      debug=self.debug,
                      row_matching_by_which_value=matching_index)
    reordered = pair.order2ndSNPDataRowsSameAs1stSNPData()
    reordered.tofile(self.output_fname)
def run(self):
    """
    Convert a Strain X SNP matrix file (self.inputFname) into a VCF file
    (self.outputFname).

    Steps:
        1. Load the matrix as an SNPData array, ignoring the 2nd column.
        2. Apply SNPData.removeMonomorphicCols() and, when self.min_MAF is
           positive, SNPData.removeColsByMAF().
        3. Write one VCF record per remaining column through VCFFile.

    Alleles are numbered per site in order of first appearance: the first
    allele seen becomes REF (0), every further allele becomes an ALT entry.
    Genotypes equal to 'NA' or not exactly two characters long are written
    as "./.".

    Fixes over the previous version:
        - ALT now lists every non-reference allele (comma-separated), so
          genotype indices >= 2 always refer to an existing ALT entry.
        - A site without a single called genotype is skipped and counted
          instead of raising KeyError on the missing reference allele.
        - The VCF writer is closed even if an error interrupts the loop.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    snpData = SNPData(input_fname=self.inputFname, turn_into_array=1,
                      ignore_2nd_column=1)
    snpData = SNPData.removeMonomorphicCols(snpData, NA_set=set([]))
    if self.min_MAF and self.min_MAF > 0:
        snpData = SNPData.removeColsByMAF(snpData, min_MAF=self.min_MAF,
                                          NA_set=set([]))

    self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
    try:
        self.writer.makeupHeaderFromSampleIDList(
            sampleIDList=snpData.row_id_ls)
        self.writer.writeMetaAndHeader()

        counter = 0
        no_of_skipped = 0
        for j, snp_id in enumerate(snpData.col_id_ls):
            # column IDs are expected to look like "chromosome_position[_...]"
            chromosome, start = snp_id.split('_')[:2]
            genotype_ls = utils.dict_map(number2di_nt,
                                         snpData.data_matrix[:, j])

            allele_ls = []  # index in this list == VCF allele number
            allele2number = {}
            genotype_ls_vcf = []
            for genotype in genotype_ls:
                if genotype == 'NA' or len(genotype) != 2:
                    genotype_ls_vcf.append("./.")
                    continue
                for allele in genotype:
                    if allele not in allele2number:
                        allele2number[allele] = len(allele_ls)
                        allele_ls.append(allele)
                genotype_ls_vcf.append(
                    "%s/%s" % (allele2number[genotype[0]],
                               allele2number[genotype[1]]))

            if not allele_ls:
                # nothing was called at this site, so there is no REF allele
                no_of_skipped += 1
                continue

            refAllele = allele_ls[0]
            if len(allele_ls) > 1:
                altAllele = ",".join(allele_ls[1:])
            else:
                # kept from the original behaviour: ALT repeats REF when
                # only one allele was observed
                altAllele = refAllele

            row = [chromosome, start, ".", refAllele, altAllele, 999,
                   'PASS', "DP=100", "GT"] + genotype_ls_vcf
            self.writer.writerow(row)
            counter += 1

        sys.stderr.write(" %s records.\n" % (counter))
        if no_of_skipped:
            sys.stderr.write(
                " %s sites skipped (no called genotype).\n" % (no_of_skipped))
    finally:
        self.writer.close()
def readInData(
    cls, phenotype_fname, input_fname, eigen_vector_fname,
    phenotype_method_id_ls, test_type=1, report=0
):
    """
    Read the SNP matrix, the phenotype matrix and (optionally) principal
    components, and bundle them into one PassingData object.

    Returns a PassingData with attributes snpData (alleles converted to
    indices), phenData, PC_matrix, which_phenotype_ls and
    phenotype_method_id_ls.

    PC_matrix comes from eigen_vector_fname when given; otherwise it is
    computed with pca_module.PCA_svd() if test_type == 4, else it is None.

    2010-2-25 call removeUnPhenotypedSNPData() to shrink the snp dataset by
        removing un-phenotyped ecotypes
    2009-3-20 refactored out of run(), easy for MpiAssociation.py to call
    """
    header, strain_acc_list, category_list, data_matrix = read_data(input_fname)
    rawSnpData = SNPData(
        header=header,
        strain_acc_list=strain_acc_list,
        category_list=category_list,
        data_matrix=data_matrix,
    )

    phen_fields = read_data(phenotype_fname, turn_into_integer=0)
    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = phen_fields

    rawSnpData = cls.removeUnPhenotypedSNPData(
        rawSnpData, header_phen, strain_acc_list_phen, data_matrix_phen,
        phenotype_method_id_ls
    )

    # 0 (NA) or -2 (untouched) is all converted to -2 as 0 is used to denote allele
    indexedSnpData, allele2index_ls = rawSnpData.convertSNPAllele2Index(report)
    indexedSnpData.header = rawSnpData.header

    # NOTE(review): the ordering below uses the strain_acc_list read from the
    # file, i.e. before removeUnPhenotypedSNPData(), while phenData takes
    # rawSnpData.strain_acc_list -- confirm the two agree after filtering.
    data_matrix_phen = cls.get_phenotype_matrix_in_data_matrix_order(
        strain_acc_list, strain_acc_list_phen, data_matrix_phen
    )
    phenData = SNPData(
        header=header_phen,
        strain_acc_list=rawSnpData.strain_acc_list,
        data_matrix=data_matrix_phen,
    )

    if eigen_vector_fname:
        PC_matrix = cls.getPCFromFile(eigen_vector_fname).PC_matrix
    elif test_type == 4:
        # eigen_vector_fname not given for this test_type. calculate PCs.
        import pca_module
        scores, loadings, explained_var = pca_module.PCA_svd(
            indexedSnpData.data_matrix, standardize=False
        )
        PC_matrix = scores
    else:
        PC_matrix = None
    del rawSnpData

    if not phenotype_method_id_ls:
        # if not available, take all phenotypes
        which_phenotype_ls = range(len(phenData.col_id_ls))
    else:
        which_phenotype_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set(phenotype_method_id_ls)
        )

    return PassingData(
        snpData=indexedSnpData,
        phenData=phenData,
        PC_matrix=PC_matrix,
        which_phenotype_ls=which_phenotype_ls,
        phenotype_method_id_ls=phenotype_method_id_ls,
    )
def inputNodePrepare(self, snp_info=None):
    """
    Load all input data on the input node and pickle it for distribution.

    Reads the SNP matrix (self.input_fname), the pickled SNP context
    (self.snps_context_fname) and the phenotype matrix
    (self.phenotype_fname), then returns a PassingData holding:
        snpData_pickle: pickled SNPData of the SNP matrix
        other_data_pickle: pickled PassingData (gene_id2snps_id_ls,
            gene_id_ls, phenData, phenotype_index_ls, snp_info)
        output_node_data_pickle: pickled PassingData (phenotype_label_ls,
            phenotype_index_ls)
        params_ls: result of self.generate_params()

    Side effect: sets self.phenotype_index_ls. It falls back to all
    phenotype columns when findOutWhichPhenotypeColumn() returns nothing.

    The pickle file handle is now closed explicitly instead of relying on
    `del` and garbage collection.

    2009-2-16 get phenData.phenotype_method_id_ls in the same order as
        phenData.col_id_ls
    2009-2-11 refactored out of run()
    """
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname)
    # category_list is not used to facilitate row-id matching
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      data_matrix=data_matrix, turn_into_array=1)

    # NOTE(review): cPickle.load() executes arbitrary code from the file;
    # self.snps_context_fname must come from a trusted source.
    # The open mode is left at its default because the way this file was
    # written is not visible here.
    picklef = open(self.snps_context_fname)
    try:
        snps_context_wrapper = cPickle.load(picklef)
    finally:
        picklef.close()
    gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
    del snps_context_wrapper
    gene_id_ls = sorted(gene_id2snps_id_ls.keys())

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen,
                       strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    # re-order the phenotype rows to follow the SNP matrix rows
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
    phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(
        phenData)  # 2009-2-16

    self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
        phenData, Set(self.phenotype_method_id_ls))
    if not self.phenotype_index_ls:
        # nothing selected: take every phenotype column
        self.phenotype_index_ls = range(len(phenData.col_id_ls))

    pdata = PassingData(gene_id_ls=gene_id_ls,
                        gene_id2snps_id_ls=gene_id2snps_id_ls,
                        phenotype_index_ls=self.phenotype_index_ls,
                        snp_info=snp_info)
    params_ls = self.generate_params(self.gene_id_fname, pdata,
                                     self.block_size)

    other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls,
                             gene_id_ls=pdata.gene_id_ls,
                             phenData=phenData,
                             phenotype_index_ls=self.phenotype_index_ls,
                             snp_info=snp_info)
    other_data_pickle = cPickle.dumps(other_data, -1)
    del other_data

    output_node_data = PassingData(
        phenotype_label_ls=phenData.col_id_ls,
        phenotype_index_ls=self.phenotype_index_ls)
    output_node_data_pickle = cPickle.dumps(output_node_data, -1)

    snpData_pickle = cPickle.dumps(snpData, -1)
    # drop the large in-memory objects once they are pickled
    del snpData, data_matrix

    return PassingData(snpData_pickle=snpData_pickle,
                       other_data_pickle=other_data_pickle,
                       output_node_data_pickle=output_node_data_pickle,
                       params_ls=params_ls)
def run(self):
    """
    Load two SNP matrices, record a README entry and let TwoSNPData384
    figure out the A/B mapping; commit or roll back depending on
    self.commit.

    2008-08-11 the database interface changed in variation.src.dbsnp
    2008-05-06
    """
    import MySQLdb
    connection = MySQLdb.connect(db=self.dbname, host=self.hostname,
                                 user=self.user, passwd=self.passwd)
    curs = connection.cursor()
    if self.debug:
        import pdb
        pdb.set_trace()

    db = DBSNP(username=self.user, password=self.passwd,
               hostname=self.hostname, database=self.dbname)
    session = db.session
    session.begin()

    mappings = self.get_snps_name2possible_mappings(db)
    snps_name2possible_mappings, snps_name2snps_id = mappings

    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix

    def load_snp_data(fname, **extra_kwargs):
        # read one matrix file and wrap it, keeping the category column
        header, strain_acc_list, category_list, data_matrix = \
            FilterStrainSNPMatrix.read_data(fname)
        return SNPData(header=header, strain_acc_list=strain_acc_list,
                       category_list=category_list, data_matrix=data_matrix,
                       **extra_kwargs)

    snpData1 = load_snp_data(self.input_fname1,
                             col_id2id=snps_name2snps_id,
                             snps_table='dbsnp.snps')
    snpData2 = load_snp_data(self.input_fname2,
                             snps_table='stock_250k.snps')

    comparison = TwoSNPData384(SNPData1=snpData1, SNPData2=snpData2,
                               curs=curs, user=self.user)

    readme = formReadmeObj(sys.argv, self.ad, README)
    session.save(readme)
    session.flush()
    comparison.figureOutABMapping(session, readme,
                                  snps_name2possible_mappings)

    if not self.commit:
        session.rollback()
        return
    # both the raw cursor and the ORM session are committed
    curs.execute("commit")
    session.commit()
def getPhenotypeData(cls, curs, phenotype_avg_table=None, phenotype_method_table=None, ecotype_table='stock.ecotype', get_raw_data=1):
    """
    Build one SNPData object holding the phenotype matrix.

    Rows are ecotype ids (labels: ecotype names) and columns are phenotype
    ids (labels: method_id_name_ls), as returned by
    get_phenotype_method_id_info(), get_ecotype_id2info() and get_matrix().

    2009-2-2 wrap up all other 3 methods
    """
    method_info = cls.get_phenotype_method_id_info(
        curs, phenotype_avg_table, phenotype_method_table)

    ecotype_info = cls.get_ecotype_id2info(
        curs, phenotype_avg_table, ecotype_table)
    ecotype_id2index, ecotype_id_ls, ecotype_name_ls = ecotype_info

    matrix = cls.get_matrix(curs, phenotype_avg_table, ecotype_id2index,
                            method_info, get_raw_data)

    result = SNPData(col_id_ls=method_info.phenotype_id_ls,
                     row_id_ls=ecotype_id_ls, data_matrix=matrix)
    result.row_label_ls = ecotype_name_ls
    result.col_label_ls = method_info.method_id_name_ls
    return result
def run(self):
    """
    Read a Strain X SNP matrix from self.inputFname and write it to
    self.outputFname through snpsdata.writeRawSnpsDatasToFile().

    When self.snp_id_type == 1, every column ID is looked up with
    db.get_chr_pos_given_db_id2chr_pos(); columns for which the lookup
    returns None are dropped, the others are renamed to the returned value.

    2008-5-12
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # database connection and etc
    db = self.db_250k
    session = db.session
    session.begin()

    delimiter = figureOutDelimiter(self.inputFname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.inputFname, delimiter=delimiter)

    if self.snp_id_type == 1:
        # 2011-2-27 translate the db_id into chr_pos because the new
        # StrainXSNP dataset uses db_id to identify SNPs.
        # but if col-id is already chr_pos, it's fine.
        translated_header = header[:2]
        kept_col_index_ls = []
        for col_index, snp_id in enumerate(header[2:]):
            chr_pos = db.get_chr_pos_given_db_id2chr_pos(snp_id)
            if chr_pos is None:
                continue
            kept_col_index_ls.append(col_index)
            translated_header.append(chr_pos)
        # to remove no-db_id columns from data matrix
        data_matrix = numpy.array(data_matrix)[:, kept_col_index_ls]
        header = translated_header

    snp_kwargs = dict(header=header, strain_acc_list=strain_acc_list,
                      data_matrix=data_matrix)
    if self.array_id_2nd_column:
        # otherwise category_list is ignored
        snp_kwargs['category_list'] = category_list
    snpData = SNPData(**snp_kwargs)

    rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1,
                                            report=self.report)
    chromosomes = []
    for rawSnpsData in rawSnpsData_ls:
        chromosomes.append(rawSnpsData.chromosome)
    snpsdata.writeRawSnpsDatasToFile(self.outputFname, rawSnpsData_ls,
                                     chromosomes=chromosomes,
                                     deliminator=',',
                                     withArrayIds=self.array_id_2nd_column)
def run(self):
    """
    Compare SNP calls against CNV probe intensities and write the QC
    matrices.

    Steps:
        1. Get chr2CNV_probe_ls from the database, or from the pickle cache
           /tmp/chr2CNV_probe_ls.pickle if that file exists.
        2. Load the SNP matrix (self.input_fname) and the CNV intensity
           matrix (self.cnv_input_fname).
        3. Run self.getCNVQCMatrix() and write
           <self.output_fname_prefix>_{mismatch,insertion,deletion,qc}.tsv;
           the plot data is pickled to /tmp/CNV_plot_data.pickle.

    Fix: the pickles are written with protocol -1 (binary), so the files
    are now opened in binary mode ('wb'/'rb') and closed explicitly instead
    of being left to garbage collection.

    2009-2-12
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                           user=self.db_user, passwd=self.db_passwd)
    curs = conn.cursor()

    # NOTE(review): this cache sits at a fixed, world-writable location and
    # is loaded with cPickle, which executes arbitrary code from the file.
    # Anyone able to create this path beforehand controls what gets loaded.
    chr2CNV_probe_ls_pickle_fname = '/tmp/chr2CNV_probe_ls.pickle'
    if not os.path.isfile(chr2CNV_probe_ls_pickle_fname):
        chr2CNV_probe_ls = self.get_chr2CNV_probe_ls(curs, self.probes_table)
        picklef = open(chr2CNV_probe_ls_pickle_fname, 'wb')
        try:
            cPickle.dump(chr2CNV_probe_ls, picklef, -1)
        finally:
            picklef.close()
    else:
        picklef = open(chr2CNV_probe_ls_pickle_fname, 'rb')
        try:
            chr2CNV_probe_ls = cPickle.load(picklef)
        finally:
            picklef.close()

    snpData = SNPData(input_fname=self.input_fname, turn_into_array=1,
                      ignore_2nd_column=1)
    probeData = self.get_probe_id2snp_id_ls(chr2CNV_probe_ls,
                                            snpData.col_id_ls)
    SNP2Col_allele = self.get_SNP2Col_allele(snpData)
    cnvIntensityData = SNPData(input_fname=self.cnv_input_fname,
                               turn_into_array=1, ignore_2nd_column=1,
                               matrix_data_type=float)
    cnvQCData = self.getCNVQCMatrix(probeData.probe_id2snp_id_ls,
                                    probeData.snp_id2tup, snpData,
                                    SNP2Col_allele, cnvIntensityData)

    plotdata_pickle_fname = '/tmp/CNV_plot_data.pickle'
    picklef = open(plotdata_pickle_fname, 'wb')
    try:
        cPickle.dump(cnvQCData.plotData, picklef, -1)
    finally:
        picklef.close()

    cnvQCData.mismatchData.tofile('%s_mismatch.tsv' % self.output_fname_prefix)
    cnvQCData.insertionData.tofile('%s_insertion.tsv' % self.output_fname_prefix)
    cnvQCData.deletionData.tofile('%s_deletion.tsv' % self.output_fname_prefix)
    cnvQCData.qcData.tofile('%s_qc.tsv' % self.output_fname_prefix)
def run(self):
    """
    Process a CNV intensity matrix (self.input_fname) in one of two ways.

    self.run_type == 1:
        write a matrix of the same shape to self.output_fname that holds
        -1 where the intensity is <= self.max_del_intensity and 1 elsewhere.
    self.run_type == 2:
        plot the per-probe intensity sum against probe position, one PNG
        per block of 1000 probes, named
        <self.output_fname>_<start position>_<end position>.png.

    Column IDs must consist of integers joined by '_'; the second one is
    used as the probe position.

    Fixes over the previous version:
        - The end position no longer indexes one past the last probe
          (IndexError when the probe count is a multiple of the block size).
        - The trailing partial block is plotted instead of being dropped by
          integer division.
        - numpy.int (removed from recent NumPy) is replaced by int, and the
          result of map() is no longer subscripted.
        - The per-cell thresholding loop is replaced by one boolean mask.
    """
    cnvIntensityData = SNPData(input_fname=self.input_fname,
                               turn_into_array=1, ignore_2nd_column=1,
                               matrix_data_type=float)
    intensity_matrix = cnvIntensityData.data_matrix

    probe_pos_ls = []
    avg_intensity_ls = []  # holds the column sum, despite the name
    for j in range(intensity_matrix.shape[1]):
        id_fields = [int(field)
                     for field in cnvIntensityData.col_id_ls[j].split('_')]
        probe_pos_ls.append(id_fields[1])
        avg_intensity_ls.append(numpy.sum(intensity_matrix[:, j]))

    if self.run_type == 1:
        newDataMatrix = numpy.ones(intensity_matrix.shape, int)
        newDataMatrix[intensity_matrix <= self.max_del_intensity] = -1
        newData = SNPData(row_id_ls=cnvIntensityData.row_id_ls,
                          col_id_ls=cnvIntensityData.col_id_ls,
                          data_matrix=newDataMatrix)
        newData.tofile(self.output_fname)
    elif self.run_type == 2:
        block_size = 1000
        no_of_probes = len(probe_pos_ls)
        for start_index in range(0, no_of_probes, block_size):
            end_index = min(start_index + block_size, no_of_probes)
            # Label with the first probe of the next block as before; the
            # last block is clamped to the final probe.
            end_pos = probe_pos_ls[min(end_index, no_of_probes - 1)]
            fname = '%s_%s_%s.png' % (self.output_fname,
                                      probe_pos_ls[start_index], end_pos)
            pylab.clf()
            pylab.plot(probe_pos_ls[start_index:end_index],
                       avg_intensity_ls[start_index:end_index],
                       '.', markersize=4, alpha=0.4)
            pylab.xlabel('chromosome position')
            pylab.ylabel('sum intensity')
            pylab.savefig(fname, dpi=300)
def main(self):
    """
    Entry point: build the SNPData input(s) and impute them.

    For self.input_file_format == 1 the matrix is read with read_data().
    If more than one sampling is needed (coverage * rows / accessions per
    sampling, rounded up), self.samplingImpute() is used and the result is
    written via YuSNPData.tofile(); otherwise the header is written and
    self.run() is called once per chromosome, in sorted order.
    For any other input format a single SNPData is built from the file and
    passed to self.run().
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if self.input_file_format != 1:
        snpData = SNPData(
            inFile=self.input_fname,
            input_file_format=self.input_file_format,
            lower_case_for_imputation=self.lower_case_for_imputation,
        )
        self.run(snpData)
        return

    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname, turn_into_integer=0)
    snps_name_ls = header[2:]
    no_of_rows = len(strain_acc_list)
    sampling_ratio = self.coverage * no_of_rows / float(self.no_of_accessions_per_sampling)
    no_of_samplings = int(math.ceil(sampling_ratio))

    if no_of_samplings > 1:
        # NOTE(review): new_snps_name_ls is returned but not used below;
        # confirm the imputed matrix keeps the column order of snps_name_ls.
        imputed_matrix, new_snps_name_ls = self.samplingImpute(
            snps_name_ls,
            data_matrix,
            input_file_format=1,
            input_NA_char="0",
            lower_case_for_imputation=self.lower_case_for_imputation,
            npute_window_size=self.single_window_size,
            no_of_accessions_per_sampling=self.no_of_accessions_per_sampling,
            coverage=self.coverage,
        )
        YuSNPData(
            strain_acc_list=strain_acc_list,
            category_list=category_list,
            col_id_ls=snps_name_ls,
            data_matrix=imputed_matrix,
        ).tofile(self.output_fname)
        return

    self.outputHeader(self.output_fname, strain_acc_list, category_list)
    chr2no_of_snps = self.get_chr2no_of_snps(snps_name_ls)
    for chromosome in sorted(chr2no_of_snps.keys()):
        per_chr_data = SNPData(
            inFile=self.input_fname,
            snps_name_ls=snps_name_ls,
            data_matrix=data_matrix,
            chromosome=chromosome,
            input_file_format=self.input_file_format,
            lower_case_for_imputation=self.lower_case_for_imputation,
        )
        self.run(per_chr_data)
def loadDataStructure(self, gene_annotation_picklef, min_MAF=0, min_distance=20000, \
    list_type_id=None, snp_matrix_fname=None, snp_matrix_data_type=1):
    """
    Set up the database connection (stored in self.db) and load the shared
    data structures.

    Returns a PassingData with gene_annotation, snp_info,
    candidate_gene_set (empty when list_type_id is not given) and snpData
    (None when snp_matrix_fname is not given).

    min_MAF and min_distance are accepted but not used in this method.

    2009-5-30 add argument snp_matrix_fname
    2008-11-25
    2008-10-01 wrap a few functions up, convenient for both run() and
        drawSNPRegion()
    """
    self.db = db = Stock_250kDB.Stock_250kDB(
        drivername=self.drivername, username=self.db_user,
        password=self.db_passwd, hostname=self.hostname,
        database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)

    snp_info = self.getSNPInfo(db)
    gene_annotation = self.dealWithGeneAnnotation(gene_annotation_picklef)

    if not list_type_id:
        candidate_gene_set = Set()
    else:
        candidate_gene_set = Set(self.getGeneList(list_type_id))

    snpData = None
    if snp_matrix_fname:
        # 2009-3-23 data type 3 is the CNV amplitude file, which is float
        matrix_data_type = float if snp_matrix_data_type == 3 else int
        snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1,
                          turn_into_array=1, ignore_2nd_column=1,
                          matrix_data_type=matrix_data_type)
        # 2008-12-05 fake a snp_info for findSNPsInRegion
        self.construct_chr_pos2index_forSNPData(snpData)

    return PassingData(gene_annotation=gene_annotation, snp_info=snp_info,
                       candidate_gene_set=candidate_gene_set,
                       snpData=snpData)
def run(self):
    """
    Build the results-method data matrix, then optionally write it to
    self.output_fname and/or draw it, with a colour legend, to
    self.fig_fname.

    Exits with status 3 when SNPData.isDataMatrixEmpty() reports an empty
    matrix.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB(drivername=self.drivername, username=self.db_user,
                      password=self.db_passwd, hostname=self.hostname,
                      database=self.dbname, schema=self.schema)
    db.setup()
    session = db.session

    info = self.getResultsMethodIDInfo(db, self.call_method_id_ls,
                                       self.min_distance, self.get_closest,
                                       self.min_MAF)
    id2gene_set = self.getResultsMethodID2GeneSet(db, info,
                                                  self.results_directory,
                                                  self.max_rank)
    rdata = self.getDataMatrix(id2gene_set, info)

    # the same labels serve as rows and columns
    label_ls = info.results_method_id_label_ls
    header = ['', ''] + label_ls
    category_list = info.results_method_id_ls

    if SNPData.isDataMatrixEmpty(rdata.data_matrix):
        sys.stderr.write("Nothing fetched from database.\n")
        sys.exit(3)

    if self.output_fname:
        write_data_matrix(rdata.data_matrix, self.output_fname, header,
                          label_ls, category_list)

    if not self.fig_fname:
        return

    font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01

    def value2color_func(x):
        # map a value onto the colour scale spanned by the matrix extremes
        return Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)

    legend_image = drawContinousLegend(rdata.min_value, rdata.max_value,
                                       self.no_of_ticks, value2color_func,
                                       font)
    matrix_image = drawMatrix(rdata.data_matrix, value2color_func, label_ls,
                              label_ls, with_grid=1, font=font)
    combined_image = combineTwoImages(matrix_image, legend_image, font=font)
    combined_image.save(self.fig_fname)
def removeUnPhenotypedSNPData(
    clf, snpData, header_phen, strain_acc_list_phen, data_matrix_phen,
    phenotype_method_id_ls
):
    """
    Return snpData restricted to rows whose ecotype has at least one
    non-"NA" value among the selected phenotype columns.

    The selected columns are those found for phenotype_method_id_ls, or all
    phenotype columns when that list is empty. The ecotype id of an SNP row
    is taken from the first element of its row id.

    2010-2-25 remove un-phenotyped ecotypes from the SNP data in order to
        keep the snp dataset small
    """
    sys.stderr.write("Removing un-phenotyped ecotypes from the SNP data ...")
    phenData = SNPData(header=header_phen,
                       strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)

    if not phenotype_method_id_ls:
        # if not available, take all phenotypes
        which_phenotype_ls = range(len(phenData.col_id_ls))
    else:
        which_phenotype_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set(phenotype_method_id_ls))

    phenotyped_ecotype_id_set = set()
    for row_index, ecotype_id in enumerate(phenData.row_id_ls):
        phenotype_row = phenData.data_matrix[row_index]
        # 2010-2-25 phenotype values are in raw string.
        if any(phenotype_row[col_index] != "NA"
               for col_index in which_phenotype_ls):
            phenotyped_ecotype_id_set.add(ecotype_id)

    # 2010-2-21
    no_of_ecotypes_in_total = len(snpData.row_id_ls)
    # 1st column of a row id is ecotype_id, 2nd is array id
    row_ids_to_be_kept = set(
        row_id for row_id in snpData.row_id_ls
        if row_id[0] in phenotyped_ecotype_id_set
    )

    snpData = SNPData.keepRowsByRowID(snpData, row_ids_to_be_kept)
    no_of_removed = no_of_ecotypes_in_total - len(row_ids_to_be_kept)
    sys.stderr.write("%s removed. Done.\n" % (no_of_removed))
    return snpData
def qcDataMatrixVSsnpData(self, pdata, snps_name2snps_id, snpData2, curs, session, readme):
    """
    Compare the call data in pdata against snpData2 through TwoSNPData.

    self.run_type == 1: row-wise comparison (cmp_row_wise()).
    self.run_type == 2: column-wise results are saved via
        save_col_wise(session, readme); the mismatch-rate dict stays empty.
    Any other run_type writes a message to stderr and exits with status 5.

    Returns a PassingData with row_id2NA_mismatch_rate and row_id12row_id2.

    2008-08-16 split from run() to enable one_by_one option
    """
    # swap the ecotype_id_ls and call_info_id_ls when passing them to
    # SNPData. now strain_acc_list=ecotype_id_ls
    # snps_table is set to the stock_250k snps_table
    snpData1 = SNPData(
        header=pdata.header,
        strain_acc_list=pdata.ecotype_id_ls,
        category_list=pdata.call_info_id_ls,
        data_matrix=pdata.data_matrix,
        min_probability=self.min_probability,
        call_method_id=self.call_method_id,
        col_id2id=snps_name2snps_id,
        max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
        snps_table='stock_250k.snps',
    )
    comparison = TwoSNPData(
        SNPData1=snpData1,
        SNPData2=snpData2,
        curs=curs,
        QC_method_id=self.QC_method_id,
        user=self.user,
        row_matching_by_which_value=0,
        debug=self.debug,
    )

    if self.run_type == 1:
        mismatch_rates = comparison.cmp_row_wise()
    elif self.run_type == 2:
        comparison.save_col_wise(session, readme)
        mismatch_rates = {}
    else:
        sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
        sys.exit(5)

    result = PassingData()
    result.row_id2NA_mismatch_rate = mismatch_rates
    result.row_id12row_id2 = comparison.row_id12row_id2
    return result
def inputNodePrepare(self, snp_info=None):
    """
    Load all input data on the input node and pickle it for distribution.

    Reads the SNP matrix (self.input_fname), the pickled SNP context
    (self.snps_context_fname) and the phenotype matrix
    (self.phenotype_fname), then returns a PassingData holding:
        snpData_pickle: pickled SNPData of the SNP matrix
        other_data_pickle: pickled PassingData (gene_id2snps_id_ls,
            gene_id_ls, phenData, phenotype_index_ls, snp_info)
        output_node_data_pickle: pickled PassingData (phenotype_label_ls,
            phenotype_index_ls)
        params_ls: result of self.generate_params()

    Side effect: sets self.phenotype_index_ls. It falls back to all
    phenotype columns when findOutWhichPhenotypeColumn() returns nothing.

    The pickle file handle is now closed explicitly instead of relying on
    `del` and garbage collection.

    2009-2-16 get phenData.phenotype_method_id_ls in the same order as
        phenData.col_id_ls
    2009-2-11 refactored out of run()
    """
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    # category_list is not used to facilitate row-id matching
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      data_matrix=data_matrix, turn_into_array=1)

    # NOTE(review): cPickle.load() executes arbitrary code from the file;
    # self.snps_context_fname must come from a trusted source.
    # The open mode is left at its default because the way this file was
    # written is not visible here.
    picklef = open(self.snps_context_fname)
    try:
        snps_context_wrapper = cPickle.load(picklef)
    finally:
        picklef.close()
    gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
    del snps_context_wrapper
    gene_id_ls = sorted(gene_id2snps_id_ls.keys())

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen,
                       strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    # re-order the phenotype rows to follow the SNP matrix rows
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
    phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)  # 2009-2-16

    self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
        phenData, Set(self.phenotype_method_id_ls))
    if not self.phenotype_index_ls:
        # nothing selected: take every phenotype column
        self.phenotype_index_ls = range(len(phenData.col_id_ls))

    pdata = PassingData(gene_id_ls=gene_id_ls,
                        gene_id2snps_id_ls=gene_id2snps_id_ls,
                        phenotype_index_ls=self.phenotype_index_ls,
                        snp_info=snp_info)
    params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)

    other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls,
                             gene_id_ls=pdata.gene_id_ls,
                             phenData=phenData,
                             phenotype_index_ls=self.phenotype_index_ls,
                             snp_info=snp_info)
    other_data_pickle = cPickle.dumps(other_data, -1)
    del other_data

    output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
                                   phenotype_index_ls=self.phenotype_index_ls)
    output_node_data_pickle = cPickle.dumps(output_node_data, -1)

    snpData_pickle = cPickle.dumps(snpData, -1)
    # drop the large in-memory objects once they are pickled
    del snpData, data_matrix

    return PassingData(snpData_pickle=snpData_pickle,
                       other_data_pickle=other_data_pickle,
                       output_node_data_pickle=output_node_data_pickle,
                       params_ls=params_ls)
def run(self):
    """
    Read the SNP matrix in self.input_fname, convert it with
    SNPData.convert2Binary() and write the converted matrix to
    self.output_fname.

    When self.mapping_fname is set, the allele-index-to-allele mapping
    returned by convert2Binary() is written there as well.

    2008-9-7
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    input_delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    matrix_fields = read_data(self.input_fname, delimiter=input_delimiter)
    header, strain_acc_list, category_list, data_matrix = matrix_fields

    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      category_list=category_list, data_matrix=data_matrix)
    binarySnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

    if self.mapping_fname:
        # output allele_index2allele_ls
        self.output_allele2index_ls(snpData, allele_index2allele_ls,
                                    self.mapping_fname)
    binarySnpData.tofile(self.output_fname)
def getPhenotypeDataInSNPDataOrder(cls, snpData):
    """
    Return the phenotype matrix (accession by phenotype) with its rows
    arranged in the row order of snpData.

    The result is cached on `model.phenoData_inSNPDataOrder`; when that
    attribute is already set it is returned as is, whatever snpData is
    passed in.

    2009-4-30 get data from all the phenotypes into one matrix
        (accession by phenotype)
    """
    cached = getattr(model, "phenoData_inSNPDataOrder", None)
    if cached is not None:
        return cached

    phenoData = cls.getPhenotypeData()
    # row label is that of the SNP matrix
    ordered = SNPData(
        col_id_ls=phenoData.col_id_ls,
        strain_acc_list=snpData.row_id_ls,
        data_matrix=phenoData.data_matrix,
    )
    ordered.col_label_ls = phenoData.col_label_ls

    # phenoData.row_id_ls is a list of integer ecotype ids, need to convert
    phenotype_row_id_ls = [str(row_id) for row_id in phenoData.row_id_ls]
    ordered.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        snpData.row_id_ls, phenotype_row_id_ls, ordered.data_matrix
    )

    model.phenoData_inSNPDataOrder = ordered
    return ordered
def getPhenotypeData(cls, curs, phenotype_avg_table=None, phenotype_method_table=None, ecotype_table='stock.ecotype', get_raw_data=1,\
    getPublicPhenotype=False):
    """
    Build one SNPData object holding the phenotype matrix.

    Rows are ecotype ids (labels: ecotype names) and columns are phenotype
    ids (labels: method_id_name_ls). getPublicPhenotype is forwarded
    unchanged to get_phenotype_method_id_info(), get_ecotype_id2info() and
    get_matrix().

    2012.9.28 add argument getPublicPhenotype
    2009-2-2 wrap up all other 3 methods
    """
    method_info = cls.get_phenotype_method_id_info(
        curs,
        phenotype_avg_table=phenotype_avg_table,
        phenotype_method_table=phenotype_method_table,
        getPublicPhenotype=getPublicPhenotype,
    )

    ecotype_info = cls.get_ecotype_id2info(
        curs,
        phenotype_avg_table=phenotype_avg_table,
        ecotype_table=ecotype_table,
        getPublicPhenotype=getPublicPhenotype,
    )
    ecotype_id2index, ecotype_id_ls, ecotype_name_ls = ecotype_info

    matrix = cls.get_matrix(
        curs,
        phenotype_avg_table,
        ecotype_id2index=ecotype_id2index,
        phenotype_info=method_info,
        get_raw_data=get_raw_data,
        phenotype_method_table=phenotype_method_table,
        getPublicPhenotype=getPublicPhenotype,
    )

    result = SNPData(col_id_ls=method_info.phenotype_id_ls,
                     row_id_ls=ecotype_id_ls, data_matrix=matrix)
    result.row_label_ls = ecotype_name_ls
    result.col_label_ls = method_info.method_id_name_ls
    return result
def run(self):
    """
    Scatter-plot one phenotype against another and save the figure as
    <self.output_fname_prefix>.png (400 dpi) and .svg.

    The phenotypes are self.phenotype_method_id1 (x) and
    self.phenotype_method_id2 (y), read from self.phenotype_fname. Only
    rows where both values are not NaN are plotted. The axis labels are
    the short names of the two PhenotypeMethod database entries.

    A green diagonal is drawn between max(min(x), min(y)) and
    min(max(x), max(y)) to show perfect correlation.

    Fix: when no row has both phenotypes, the diagonal is skipped with a
    warning instead of crashing with ValueError on min()/max() of an empty
    list.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                   username=self.db_user,
                                   password=self.db_passwd,
                                   hostname=self.hostname,
                                   database=self.dbname,
                                   schema=self.schema)
    db.setup(create_tables=False)
    session = db.session

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    # row label is that of the SNP matrix, because the phenotype matrix is
    # gonna be re-ordered in that way
    phenData = SNPData(header=header_phen,
                       strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    # tricky, using strain_acc_list_phen
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        phenData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

    phenotype_col_index1 = self.findOutWhichPhenotypeColumn(
        phenData, Set([self.phenotype_method_id1]))[0]
    phenotype_col_index2 = self.findOutWhichPhenotypeColumn(
        phenData, Set([self.phenotype_method_id2]))[0]

    x_ls = []
    y_ls = []
    for i in range(phenData.data_matrix.shape[0]):
        x_value = phenData.data_matrix[i][phenotype_col_index1]
        y_value = phenData.data_matrix[i][phenotype_col_index2]
        if not numpy.isnan(x_value) and not numpy.isnan(y_value):
            x_ls.append(x_value)
            y_ls.append(y_value)

    pylab.clf()
    pylab.title('Phenotype Contrast')
    pylab.plot(x_ls, y_ls, '.', alpha=0.6)
    pylab.grid(alpha=0.3)
    phenotype_method1 = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id1)
    phenotype_method2 = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id2)
    pylab.xlabel(phenotype_method1.short_name)
    pylab.ylabel(phenotype_method2.short_name)

    if x_ls:
        # draw diagonal line to show perfect correlation
        max_min_value = max(min(x_ls), min(y_ls))
        min_max_value = min(max(x_ls), max(y_ls))
        pylab.plot([max_min_value, min_max_value],
                   [max_min_value, min_max_value], c='g', alpha=0.7)
    else:
        import sys
        sys.stderr.write(
            "Warning: no row has values for both phenotype %s and %s; "
            "the plot is empty.\n"
            % (self.phenotype_method_id1, self.phenotype_method_id2))

    png_output_fname = '%s.png' % self.output_fname_prefix
    pylab.savefig(png_output_fname, dpi=400)
    pylab.savefig('%s.svg' % self.output_fname_prefix)
def run(self):
    """
    Convert the SNP matrix in self.input_fname with
    SNPData.convert2Binary() and save the outcome in self.output_fname.

    The allele-index-to-allele mapping is additionally written to
    self.mapping_fname when that attribute is set.

    2008-9-7
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    (header, strain_acc_list, category_list,
     data_matrix) = read_data(self.input_fname, delimiter=delimiter)

    original = SNPData(
        header=header,
        strain_acc_list=strain_acc_list,
        category_list=category_list,
        data_matrix=data_matrix,
    )
    conversion = original.convert2Binary(self.report)
    converted, allele_index2allele_ls = conversion

    # output allele_index2allele_ls
    if self.mapping_fname:
        self.output_allele2index_ls(original, allele_index2allele_ls,
                                    self.mapping_fname)

    converted.tofile(self.output_fname)
def create_init_data(self):
    """
    Load the three SNP datasets and the parameter dictionary into one
    PassingData object (attributes snpData_250k, snpData_2010_149_384,
    snpData_perlegen and param_d).

    2009-6-5 add argument ignore_het=1 to snpData_2010_149_384 &
        snpData_perlegen
    2008-05-12 initial data loading on node 0
    """
    # options shared by the 2010_149_384 and perlegen datasets
    reference_kwargs = dict(turn_into_array=1, ignore_2nd_column=1,
                            ignore_het=1)

    init_data = PassingData()
    init_data.snpData_250k = SNPData(input_fname=self.input_fname,
                                     turn_into_array=1)
    init_data.snpData_2010_149_384 = SNPData(
        input_fname=self.fname_2010_149_384, **reference_kwargs)
    init_data.snpData_perlegen = SNPData(
        input_fname=self.fname_perlegen, **reference_kwargs)
    init_data.param_d = self.generate_parameters(self.parameter_names)
    return init_data
def main(self):
    """
    Entry point: prepare the input and run the imputation.

    Input format 1: the matrix is loaded with read_data(). If the number
    of samplings (coverage * rows / accessions per sampling, rounded up)
    exceeds 1, self.samplingImpute() produces the imputed matrix, which is
    written through YuSNPData.tofile(). Otherwise the output header is
    written and self.run() is invoked per chromosome in sorted order.
    Other input formats: one SNPData is built from the file and handed to
    self.run().
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # options common to every SNPData constructed here
    common_kwargs = dict(
        inFile=self.input_fname,
        input_file_format=self.input_file_format,
        lower_case_for_imputation=self.lower_case_for_imputation)

    if self.input_file_format == 1:
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname, turn_into_integer=0)
        snps_name_ls = header[2:]
        no_of_rows = len(strain_acc_list)
        no_of_samplings = int(
            math.ceil(self.coverage * no_of_rows /
                      float(self.no_of_accessions_per_sampling)))

        if no_of_samplings <= 1:
            self.outputHeader(self.output_fname, strain_acc_list,
                              category_list)
            chr2no_of_snps = self.get_chr2no_of_snps(snps_name_ls)
            chr_ls = list(chr2no_of_snps.keys())
            chr_ls.sort()
            for chromosome in chr_ls:
                snpData = SNPData(snps_name_ls=snps_name_ls,
                                  data_matrix=data_matrix,
                                  chromosome=chromosome, **common_kwargs)
                self.run(snpData)
        else:
            # NOTE(review): new_snps_name_ls is not used afterwards;
            # confirm the imputed matrix follows snps_name_ls.
            imputed_matrix, new_snps_name_ls = self.samplingImpute(
                snps_name_ls, data_matrix, input_file_format=1,
                input_NA_char='0',
                lower_case_for_imputation=self.lower_case_for_imputation,
                npute_window_size=self.single_window_size,
                no_of_accessions_per_sampling=self.no_of_accessions_per_sampling,
                coverage=self.coverage)
            imputedData = YuSNPData(strain_acc_list=strain_acc_list,
                                    category_list=category_list,
                                    col_id_ls=snps_name_ls,
                                    data_matrix=imputed_matrix)
            imputedData.tofile(self.output_fname)
    else:
        self.run(SNPData(**common_kwargs))
def run(self):
    """
    Process a CNV intensity matrix (self.input_fname) in one of two ways.

    self.run_type == 1:
        write a matrix of the same shape to self.output_fname that holds
        -1 where the intensity is <= self.max_del_intensity and 1 elsewhere.
    self.run_type == 2:
        plot the per-probe intensity sum against probe position, one PNG
        per block of 1000 probes, named
        <self.output_fname>_<start position>_<end position>.png.

    Column IDs must consist of integers joined by '_'; the second one is
    used as the probe position.

    Fixes over the previous version:
        - The end position no longer indexes one past the last probe
          (IndexError when the probe count is a multiple of the block size).
        - The trailing partial block is plotted instead of being dropped by
          integer division.
        - numpy.int (removed from recent NumPy) is replaced by int, and the
          result of map() is no longer subscripted.
        - The per-cell thresholding loop is replaced by one boolean mask.
    """
    cnvIntensityData = SNPData(input_fname=self.input_fname,
                               turn_into_array=1, ignore_2nd_column=1,
                               matrix_data_type=float)
    data_matrix = cnvIntensityData.data_matrix
    no_of_cols = data_matrix.shape[1]

    probe_pos_ls = []
    avg_intensity_ls = []  # holds the column sum, despite the name
    for j in range(no_of_cols):
        probe_id = cnvIntensityData.col_id_ls[j]
        chr_and_pos = [int(part) for part in probe_id.split('_')]
        probe_pos_ls.append(chr_and_pos[1])
        avg_intensity_ls.append(numpy.sum(data_matrix[:, j]))

    if self.run_type == 1:
        newDataMatrix = numpy.ones(data_matrix.shape, int)
        is_deletion = data_matrix <= self.max_del_intensity
        newDataMatrix[is_deletion] = -1
        newData = SNPData(row_id_ls=cnvIntensityData.row_id_ls,
                          col_id_ls=cnvIntensityData.col_id_ls,
                          data_matrix=newDataMatrix)
        newData.tofile(self.output_fname)
    elif self.run_type == 2:
        block_size = 1000
        no_of_probes = len(probe_pos_ls)
        last_index = no_of_probes - 1
        for start_index in range(0, no_of_probes, block_size):
            end_index = min(start_index + block_size, no_of_probes)
            # Label with the first probe of the next block as before; the
            # last block is clamped to the final probe.
            end_position = probe_pos_ls[min(end_index, last_index)]
            fname = '%s_%s_%s.png' % (self.output_fname,
                                      probe_pos_ls[start_index],
                                      end_position)
            pylab.clf()
            pylab.plot(probe_pos_ls[start_index:end_index],
                       avg_intensity_ls[start_index:end_index],
                       '.', markersize=4, alpha=0.4)
            pylab.xlabel('chromosome position')
            pylab.ylabel('sum intensity')
            pylab.savefig(fname, dpi=300)
def run(self):
    """
    Read a phenotype matrix from self.input_fname and store it in the
    database.

    self.run_type == 1 calls putPhenotypeIntoDB(), 2 calls
    putReplicatePhenotypeIntoDB(); any other value only writes a message
    to stderr. The session is committed only when self.commit is set.

    2009-5-28
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB(drivername=self.drivername, username=self.db_user,
                      password=self.db_passwd, hostname=self.hostname,
                      database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)

    nativename2tg_ecotypeid_set = getNativename2TgEcotypeIDSet(
        db.metadata.bind, turnUpperCase=True)
    ecotype_id_set_250k_in_pipeline = get_ecotype_id_set_250k_in_pipeline(
        ArrayInfo)
    ecotypeid2tg_ecotypeid = get_ecotypeid2tg_ecotypeid(db.metadata.bind)

    # turn_into_integer=2 because it's not nucleotides
    phen_fields = read_data(self.input_fname, turn_into_integer=2,
                            matrix_data_type=float)
    header_phen, strain_acc_list_phen, category_list_phen, raw_matrix = phen_fields
    phenotype_matrix = numpy.array(raw_matrix)

    # 2009-8-19 the matrix is deliberately NOT re-ordered through
    # get_phenotype_matrix_in_data_matrix_order(): strain_acc_list_phen is
    # not unique for each row, which caused replicates to have the same
    # value.
    phenData = SNPData(header=header_phen,
                       strain_acc_list=strain_acc_list_phen,
                       data_matrix=phenotype_matrix)
    ecotype_id_ls = self.straightenEcotypeID(
        phenData.row_id_ls, nativename2tg_ecotypeid_set,
        ecotypeid2tg_ecotypeid, ecotype_id_set_250k_in_pipeline)

    session = db.session
    session.begin()

    run_type = self.run_type
    if run_type not in (1, 2):
        sys.stderr.write("Unsupported run type: %s.\n" % (run_type))
    elif run_type == 1:
        self.putPhenotypeIntoDB(db, phenData, ecotype_id_ls)
    else:
        self.putReplicatePhenotypeIntoDB(db, phenData, ecotype_id_ls)

    if self.commit:
        session.commit()
def run(self):
    """
    For every probe (column) of the QC matrix that also exists in the
    intensity matrix, scatter-plot QC count against CNV probe intensity
    over the rows shared by both matrices and save it as
    '<output_dir>/<probe_id>.png'.

    Rows whose QC value is negative are left out; a probe is not plotted
    when no value remains or when all remaining counts are 0.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    cnvIntensityData = self.getBeforeGADAIntensityData(self.input_fname)
    qcData = SNPData(input_fname=self.qc_fname, turn_into_array=1, ignore_2nd_column=1)

    if not os.path.isdir(self.output_dir):
        os.makedirs(self.output_dir)

    for probe_id in qcData.col_id_ls:
        if probe_id not in cnvIntensityData.col_id2col_index:
            continue
        cnv_col_index = cnvIntensityData.col_id2col_index[probe_id]
        qc_col_index = qcData.col_id2col_index[probe_id]

        # collect (count, intensity) pairs over rows present in both matrices
        count_ls = []
        intensity_ls = []
        for qc_row_index, row_id in enumerate(qcData.row_id_ls):
            count = qcData.data_matrix[qc_row_index][qc_col_index]
            if count >= 0 and row_id in cnvIntensityData.row_id2row_index:
                cnv_row_index = cnvIntensityData.row_id2row_index[row_id]
                count_ls.append(count)
                intensity_ls.append(cnvIntensityData.data_matrix[cnv_row_index][cnv_col_index])

        count_set = set(count_ls)
        if len(count_set) == 0 or count_set == set([0]):
            continue    # nothing, or only zero counts

        pylab.clf()
        ax = pylab.axes([0.1, 0.1, 0.8, 0.8], frameon=False)
        ax.grid(True, alpha=0.3)
        pylab.plot(count_ls, intensity_ls, '.', markersize=5, alpha=0.4)
        pylab.xlabel('count')
        pylab.ylabel('CNV probe intensity')
        pylab.ylim([-1, 1])
        # widen the automatic x range by one unit on each side
        x_min, x_max = list(ax.get_xlim())
        ax.set_xlim([x_min - 1, x_max + 1])
        pylab.title(probe_id)
        pylab.savefig(os.path.join(self.output_dir, '%s.png' % probe_id), dpi=300)
def getHaploGroupSNPMatrix(self):
    """
    2009-4-18
        Build a HaploGroup x SNP matrix from the database.

        Columns are StockDB.SNPs ids ordered by chromosome, then position.
        Each row is one StockDB.HaploGroup, filled from the
        StockDB.FilteredCalls entries of its ref_ecotypeid with alleles
        translated through nt2number. Cells without a call stay 0.

        In debug mode at most 10 haplo groups are used. The row count is
        capped at the number that actually exist so that the matrix and
        row_id_ls always have the same length.

    Returns a SNPData with col_id_ls, row_id_ls and an int8 data_matrix.
    """
    sys.stderr.write("Getting HaploGroup SNP matrix ...")
    no_of_rows = StockDB.HaploGroup.query.count()
    if self.debug:
        no_of_rows = min(10, no_of_rows)

    col_id_ls = []
    col_id2col_index = {}
    for row in StockDB.SNPs.query.order_by(
            StockDB.SNPs.chromosome).order_by(StockDB.SNPs.position):
        col_id_ls.append(row.id)
        col_id2col_index[row.id] = len(col_id2col_index)
    no_of_cols = len(col_id2col_index)

    data_matrix = numpy.zeros([no_of_rows, no_of_cols], numpy.int8)
    row_id_ls = []
    rows = StockDB.HaploGroup.query.all()
    # the slice enforces the debug limit and keeps row_index inside the matrix
    for row_index, row in enumerate(rows[:no_of_rows]):
        data_rows = StockDB.FilteredCalls.query.filter_by(
            ecotypeid=row.ref_ecotypeid)
        for one_call in data_rows:
            col_index = col_id2col_index[one_call.snpid]
            data_matrix[row_index, col_index] = nt2number[one_call.allele]
        row_id_ls.append(row.id)

    snpData = SNPData(col_id_ls=col_id_ls,
                      row_id_ls=row_id_ls,
                      data_matrix=data_matrix)
    sys.stderr.write("Done.\n")
    return snpData
def getBeforeGADAIntensityData(self, input_fname):
    """
    Load the intensity matrix in input_fname via CNVNormalize.get_input()
    and wrap it, transposed, in a SNPData.

    Column ids are '<chr>_<pos>' strings built from the first two items of
    each chr_pos_ls entry. Row ids are the maternal_ecotype_id (as a string)
    of the array whose id appears in header[1:-2], or '-1' when
    ArrayInfo.get() finds no such array.
    """
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                   username=self.db_user,
                                   password=self.db_passwd,
                                   hostname=self.hostname,
                                   database=self.dbname,
                                   schema=self.schema)
    db.setup(create_tables=False)
    session = db.session    # not used below; attribute access kept as in the original

    data_matrix, probe_id_ls, chr_pos_ls, header = CNVNormalize.get_input(input_fname)

    col_id_ls = ['%s_%s' % (chr_pos[0], chr_pos[1]) for chr_pos in chr_pos_ls]

    ecotype_id_ls = []
    for array_id in header[1:-2]:
        array = Stock_250kDB.ArrayInfo.get(int(array_id))
        ecotype_id = array.maternal_ecotype_id if array else -1
        ecotype_id_ls.append('%s' % ecotype_id)

    return SNPData(row_id_ls=ecotype_id_ls,
                   col_id_ls=col_id_ls,
                   data_matrix=data_matrix.transpose())
def run(self):
    """
    Convert the SNP matrix in self.inputFname into an HDF5 file at
    self.outputFname.

    Monomorphic columns are removed first; columns are additionally
    filtered by self.min_MAF when it is set and positive. Row and column
    ids are cast to int. The file gets three resizable datasets:
    'data_matrix', 'col_id_ls' and 'row_id_ls'.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    snpData = SNPData(input_fname=self.inputFname, turn_into_array=1, ignore_2nd_column=1)
    snpData = SNPData.removeMonomorphicCols(snpData, NA_set=set([]))
    # guard against an unset (None) threshold
    if self.min_MAF and self.min_MAF > 0:
        snpData = SNPData.removeColsByMAF(snpData, min_MAF=self.min_MAF, NA_set=set([]))

    # real lists: h5py needs array-like data, which a lazy map() is not
    snpData.col_id_ls = [int(col_id) for col_id in snpData.col_id_ls]
    snpData.row_id_ls = [int(row_id) for row_id in snpData.row_id_ls]

    f = h5py.File(self.outputFname, 'w')
    try:
        f.create_dataset("data_matrix", data=snpData.data_matrix, maxshape=(None, None))
        f.create_dataset('col_id_ls', data=snpData.col_id_ls, maxshape=(None,))
        f.create_dataset('row_id_ls', data=snpData.row_id_ls, maxshape=(None,))
    finally:
        # close the file even if writing a dataset fails
        f.close()
def run(self):
    """
    2008-11-08
        generate combinations of results_id, list_type_id and generate plots one after another
        save the plots into database if commit=1

    For each (results_id, list_type_id) pair produced by
    MpiGeneListRankTest.generate_params():
        - exit with code 3 if no TopSNPTestType matches the input requirements,
        - skip the pair if commit is set and a CandidateVsNonRatioPlot for
          (first TopSNPTestType id, results_id, list_type_id) already exists,
        - fetch the data matrix, optionally write it to self.output_fname,
        - draw the curve via self.plotCurve() and, if commit is set, store
          the png/svg/thumbnail in CandidateVsNonRatioPlot.

    Exits with code 3 as well when self.results_type maps to no result class.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    def plotAlreadyInDB(type_id, results_id, list_type_id, title):
        """Report on stderr and return True if a plot for this combination is stored."""
        rows = Stock_250kDB.CandidateVsNonRatioPlot.query.filter_by(type_id=type_id).\
            filter_by(results_id=results_id).filter_by(list_type_id=list_type_id)
        if rows.count() > 0:
            row = rows.first()
            sys.stderr.write('%s already in db (%s of them) with first id=%s.\n' %
                             (title, rows.count(), row.id))
            return True
        return False

    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
                                   password=self.db_passwd, hostname=self.hostname,
                                   database=self.dbname, schema=self.schema)
    db.setup()
    session = db.session

    param_obj = PassingData(call_method_id=self.call_method_id,
                            analysis_method_id=getattr(self, 'analysis_method_id', None),
                            analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),
                            phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),
                            list_type_id_ls=self.list_type_id_ls,
                            results_type=self.results_type)
    params_ls = MpiGeneListRankTest.generate_params(param_obj)

    ResultsClass, TestResultClass = db.getResultsAndTestResultsClass(
        results_type=self.results_type)
    if ResultsClass is None or TestResultClass is None:
        # report self.results_type; the undefined name 'pd' used to raise NameError here
        sys.stderr.write("Invalid results type : %s.\n" % self.results_type)
        sys.exit(3)

    for results_id, list_type_id in params_ls:
        rm = ResultsClass.get(results_id)
        list_type = Stock_250kDB.GeneListType.get(list_type_id)
        title = 'result(%s) of %s on %s with %s(%s) list' % \
            (results_id, rm.analysis_method.short_name, rm.phenotype_method.short_name,
             list_type.short_name, list_type.id)

        TopSNPTestType_id_ls = self.getTopSNPTestType_id_ls(
            self.get_closest, self.min_MAF, self.allow_two_sample_overlapping,
            self.results_type, self.test_type_id, self.null_distribution_type_id)
        # check before anything indexes TopSNPTestType_id_ls[0]
        if not TopSNPTestType_id_ls:
            sys.stderr.write(
                "No TopSNPTestType matches the input requirements. Exit.\n")
            sys.exit(3)
        first_type_id = TopSNPTestType_id_ls[0]

        if self.commit and plotAlreadyInDB(first_type_id, results_id, list_type_id, title):
            continue

        TopSNPTestType_id_ls_str = [str(type_id) for type_id in TopSNPTestType_id_ls]
        from_where_clause = "from %s t, %s y where t.type_id=y.id and t.results_id=%s and t.list_type_id=%s and y.id in (%s)" % \
            (TestResultClass.table.name, Stock_250kDB.CandidateGeneTopSNPTestRMType.table.name,
             results_id, list_type_id, ','.join(TopSNPTestType_id_ls_str))
        no_of_top_snps_info = self.get_no_of_top_snps_info(db, from_where_clause)
        min_distance_info = self.get_min_distance_info(db, from_where_clause)
        rdata = self.get_data_matrix(db, no_of_top_snps_info, min_distance_info,
                                     from_where_clause, need_other_values=True,
                                     null_distribution_type_id=self.null_distribution_type_id)

        if SNPData.isDataMatrixEmpty(rdata.data_matrix):
            sys.stderr.write("Nothing fetched from database.\n")
            continue

        if self.output_fname:
            header = ['no_of_top_snps', ''] + min_distance_info.label_ls
            strain_acc_list = no_of_top_snps_info.label_ls
            category_list = no_of_top_snps_info.label_ls
            write_data_matrix(rdata.data_matrix, self.output_fname, header,
                              strain_acc_list, category_list)
        # NOTE: rendering of the matrix as a colored image (drawMatrix +
        # legend, saved to self.fig_fname) was already disabled in the original.

        if self.commit:
            output_fname_prefix = None
        else:
            # '/' in the title would otherwise be read as a path separator
            title_cp = title.replace('/', '_')
            output_fname_prefix = '%s_%s_type_%s.png' % (
                os.path.splitext(self.fig_fname)[0], title_cp, first_type_id)

        # The original set preset_xlim to [0, 8] for analysis methods 1 and 7
        # and immediately reset it to None, so it was always None.
        preset_xlim = None
        return_data = self.plotCurve(rdata, no_of_top_snps_info, min_distance_info,
                                     output_fname_prefix, title=title,
                                     commit=self.commit, preset_xlim=preset_xlim)

        if self.commit and return_data.png_data:
            # re-check right before saving, as the original did
            if plotAlreadyInDB(first_type_id, results_id, list_type_id, title):
                continue
            plot = Stock_250kDB.CandidateVsNonRatioPlot(type_id=first_type_id,
                                                        results_id=results_id,
                                                        list_type_id=list_type_id)
            plot.png_thumbnail = return_data.png_thumbnail.getvalue()
            plot.png_data = return_data.png_data.getvalue()
            plot.svg_data = return_data.svg_data.getvalue()
            db.session.save(plot)
            db.session.flush()
def run(self):
    """
    Fetch a matrix of candidate-gene test results from the database (one
    row per list-type/analysis-method combination, one column per
    phenotype), optionally write it to self.output_fname and optionally
    draw it with a color legend to self.fig_fname.

    self.test_result_type selects the result table:
        1: 'candidate_gene_rank_sum_test_result_2008_09_15' (hard-coded)
        2: CandidateGeneTopSNPTest, joined through ResultsByGene
        3: Stock_250kDB.CandidateGeneRankSumTestResultMethod
    Any other value exits with code 2. An empty matrix exits with code 3.

    NOTE(review): the SQL condition is assembled with '%s' string
    formatting from self.* option values; this presumes they come from a
    trusted command line -- confirm before exposing to other callers.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
                                   password=self.db_passwd, hostname=self.hostname,
                                   database=self.dbname, schema=self.schema)
    db.setup()
    session = db.session

    if self.test_result_type == 1:
        # The original first read CandidateGeneRankSumTestResult.table.name
        # and then overrode it with this fixed table name.
        test_result_class_table = 'candidate_gene_rank_sum_test_result_2008_09_15'
    elif self.test_result_type == 2:
        test_result_class_table = CandidateGeneTopSNPTest.table.name
    elif self.test_result_type == 3:
        test_result_class_table = Stock_250kDB.CandidateGeneRankSumTestResultMethod.table.name
    else:
        sys.stderr.write(" test_result_type %s not supported.\n" % (self.test_result_type))
        sys.exit(2)

    # the condition for min_MAF is tricky because of the floating precision.
    # Adjacent literals replace backslash continuations inside the string,
    # so no source indentation ends up in the SQL text.
    if self.test_result_type == 2:
        where_condition = (
            "%s r, %s rg, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null and r.id=rg.results_method_id "
            "and c.results_id=rg.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001"
        ) % (ResultsMethod.table.name, ResultsByGene.table.name, test_result_class_table,
             GeneListType.table.name, self.get_closest, self.min_distance, self.min_MAF)
    else:
        # test_result_type 1 and 3 share the same join
        where_condition = (
            "%s r, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null "
            "and c.results_id=r.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001"
        ) % (ResultsMethod.table.name, test_result_class_table, GeneListType.table.name,
             self.get_closest, self.min_distance, self.min_MAF)

    # optional filters
    if self.call_method_id_ls:
        where_condition += " and r.call_method_id in (%s)" % self.call_method_id_ls
    if self.analysis_method_id_ls:
        where_condition += " and r.analysis_method_id in (%s)" % self.analysis_method_id_ls
    if self.super_type_id:
        where_condition += " and g.super_type_id=%s" % self.super_type_id
    if self.test_type:
        where_condition += " and c.test_type=%s" % self.test_type
    if self.test_result_type == 1:
        where_condition += " and c.max_pvalue_per_gene=%s" % (self.max_pvalue_per_gene)
    elif self.test_result_type == 2:
        where_condition += " and c.no_of_top_snps=%s" % (self.no_of_top_snps)

    list_type_id_ls = self.getListTypeInfo(db, where_condition)
    analysis_method_id_ls = self.getAnalysisMethodInfo(db, where_condition)
    list_type_analysis_method_info = self.orderListTypeAnalysisMethodID(
        list_type_id_ls, analysis_method_id_ls)
    phenotype_info = self.getPhenotypeInfo(db, where_condition)
    rdata = self.get_data_matrix(db, phenotype_info, list_type_analysis_method_info,
                                 where_condition)
    rdata.data_matrix = self.markDataMatrixBoundary(rdata.data_matrix, phenotype_info,
                                                    list_type_analysis_method_info)

    header = ['list_type_analysis_method', ''] + phenotype_info.phenotype_method_label_ls
    strain_acc_list = list_type_analysis_method_info.list_type_analysis_method_label_ls
    category_list = list_type_analysis_method_info.list_type_id_analysis_method_id_ls
    if SNPData.isDataMatrixEmpty(rdata.data_matrix):
        sys.stderr.write("Nothing fetched from database.\n")
        sys.exit(3)

    if self.output_fname:
        write_data_matrix(rdata.data_matrix, self.output_fname, header,
                          strain_acc_list, category_list)

    if self.fig_fname:
        font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01
        value2color_func = lambda x: Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)
        im_legend = drawContinousLegend(rdata.min_value, rdata.max_value,
                                        self.no_of_ticks, value2color_func, font)
        im = drawMatrix(rdata.data_matrix, value2color_func,
                        list_type_analysis_method_info.list_type_analysis_method_label_ls,
                        phenotype_info.phenotype_method_label_ls, with_grid=1, font=font)
        im = combineTwoImages(im, im_legend, font=font)
        im.save(self.fig_fname)
def doFilter(
    self,
    snpData,
    snpData_qc_strain,
    snpData_qc_snp,
    min_call_probability,
    max_call_mismatch_rate,
    max_call_NA_rate,
    max_snp_mismatch_rate,
    max_snp_NA_rate,
    npute_window_size,
    output_dir=None,
):
    """
    Filter snpData against two QC datasets, impute it and record how well
    the result agrees with the QC data before and after imputation.

    Steps, in order:
        1. drop rows by mismatch rate against snpData_qc_strain, then by NA rate
        2. drop columns by mismatch rate against snpData_qc_snp, then by NA rate
        3. merge with snpData_qc_snp (priority=2) and drop monomorphic columns
        4. compare row- and column-wise with snpData_qc_strain (rates "0")
        5. if more than 5 rows remain, impute with NPUTE.samplingImpute()
           and compare again (rates "1")
    If output_dir is given, the matrix is written there before and after
    imputation; the file name encodes all threshold arguments.

    Returns a one-element list holding a PassingData with the filter
    counts, the mismatch-rate dictionaries and the thresholds used. The
    "1" rates are only set when imputation was run.

    History:
    2009-10-11 NPUTE.samplingImpute(..., no_of_accessions_per_sampling=300,
        coverage=3) replaced imputeData() to avoid memory blowup.
    2008-05-19 use SNPData structure.
    2008-05-17 add argument output_dir.
    2008-05-12 add the no_of_*_filtered_by_* counters to qcdata.
    2008-05-11 split up from computing_node_handler.
    """
    qcdata = PassingData()

    def output_path(suffix):
        # file name that encodes every threshold, ending in the given suffix
        name_part_ls = [
            "min_oligo_call_probability_%s" % min_call_probability,
            "max_array_mismatch_rate_%s" % max_call_mismatch_rate,
            "max_array_NA_rate_%s" % max_call_NA_rate,
            "max_snp_mismatch_rate_%s" % max_snp_mismatch_rate,
            "max_snp_NA_rate_%s" % max_snp_NA_rate,
            "npute_window_size_%s" % npute_window_size,
        ]
        return os.path.join(output_dir, "_".join(name_part_ls + [suffix]))

    # 1. row (accession) filters
    pairedData = TwoSNPData(
        SNPData1=snpData, SNPData2=snpData_qc_strain, row_matching_by_which_value=0, debug=self.debug
    )
    row_id2NA_mismatch_rate = pairedData.cmp_row_wise()
    del pairedData
    filteredData = SNPData.removeRowsByMismatchRate(snpData, row_id2NA_mismatch_rate, max_call_mismatch_rate)
    qcdata.no_of_accessions_filtered_by_mismatch = filteredData.no_of_rows_filtered_by_mismatch
    filteredData = SNPData.removeRowsByNARate(filteredData, max_call_NA_rate)
    qcdata.no_of_accessions_filtered_by_na = filteredData.no_of_rows_filtered_by_na

    # 2. column (SNP) filters
    pairedData = TwoSNPData(
        SNPData1=filteredData, SNPData2=snpData_qc_snp, row_matching_by_which_value=0, debug=self.debug
    )
    col_id2NA_mismatch_rate = pairedData.cmp_col_wise()
    del pairedData
    filteredData = SNPData.removeColsByMismatchRate(filteredData, col_id2NA_mismatch_rate, max_snp_mismatch_rate)
    qcdata.no_of_snps_filtered_by_mismatch = filteredData.no_of_cols_filtered_by_mismatch
    filteredData = SNPData.removeColsByNARate(filteredData, max_snp_NA_rate)
    qcdata.no_of_snps_filtered_by_na = filteredData.no_of_cols_filtered_by_na

    # 3. merge with the SNP QC data, then remove monomorphic columns
    pairedData = TwoSNPData(
        SNPData1=filteredData, SNPData2=snpData_qc_snp, row_matching_by_which_value=0, debug=self.debug
    )
    filteredData = pairedData.mergeTwoSNPData(priority=2)
    del pairedData
    filteredData = SNPData.removeMonomorphicCols(filteredData)
    qcdata.no_of_monomorphic_snps_removed = filteredData.no_of_monomorphic_cols

    if output_dir:
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        filteredData.tofile(output_path("before_imputation.tsv"))

    # 4. agreement with the strain QC data before imputation
    pairedData0 = TwoSNPData(SNPData1=filteredData, SNPData2=snpData_qc_strain, row_matching_by_which_value=0)
    row_id2NA_mismatch_rate0 = pairedData0.cmp_row_wise()
    col_id2NA_mismatch_rate0 = pairedData0.cmp_col_wise()
    del pairedData0

    # 5. imputation, only attempted with more than 5 rows
    if len(filteredData.row_id_ls) > 5:
        # 2009-10-8 use NPUTE.samplingImpute()
        imputed_matrix, new_snps_name_ls = NPUTE.samplingImpute(
            filteredData.col_id_ls,
            filteredData.data_matrix,
            input_file_format=1,
            input_NA_char=0,
            lower_case_for_imputation=False,
            npute_window_size=int(npute_window_size),
            no_of_accessions_per_sampling=300,
            coverage=3,
        )
        snpData_imputed = SNPData(
            row_id_ls=filteredData.row_id_ls, col_id_ls=new_snps_name_ls, data_matrix=imputed_matrix
        )
        if output_dir:
            # 2008-05-16 write the imputed data out as well
            snpData_imputed.tofile(output_path("after_imputation.tsv"))
        pairedData1 = TwoSNPData(
            SNPData1=snpData_imputed, SNPData2=snpData_qc_strain, row_matching_by_which_value=0
        )
        qcdata.row_id2NA_mismatch_rate1 = pairedData1.cmp_row_wise()
        qcdata.col_id2NA_mismatch_rate1 = pairedData1.cmp_col_wise()
        del pairedData1, snpData_imputed
    else:
        snpData_imputed = None
    del filteredData

    qcdata.row_id2NA_mismatch_rate0 = row_id2NA_mismatch_rate0
    qcdata.col_id2NA_mismatch_rate0 = col_id2NA_mismatch_rate0
    qcdata.min_call_probability = min_call_probability
    qcdata.max_call_mismatch_rate = max_call_mismatch_rate
    qcdata.max_call_NA_rate = max_call_NA_rate
    qcdata.max_snp_mismatch_rate = max_snp_mismatch_rate
    qcdata.max_snp_NA_rate = max_snp_NA_rate
    qcdata.npute_window_size = npute_window_size
    return [qcdata]
def run(self):
    """
    Draw the strains of the SNP matrix in self.input_fname both on a map
    and in PCA space, colored by one phenotype, and save the figure as
    '<output_fname_prefix>.png' and '<output_fname_prefix>.svg'. Afterwards
    call self.plotLatLonPhenVsPC() with the same data.

    If both eigen_vector_fname and eigen_value_fname are given, principal
    components and eigenvalues are read from those files. Otherwise the
    SNP matrix (a random sample of at most 10000 columns) is converted to
    allele indices and None is passed for both to getStrainID2PCAPosInfo().
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
                                   password=self.db_passwd, hostname=self.hostname,
                                   database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session

    snpData = SNPData(input_fname=self.input_fname, turn_into_integer=1,
                      turn_into_array=1, ignore_2nd_column=1)

    if self.eigen_vector_fname and self.eigen_value_fname:
        eigen_value_ls = numpy.array(self.getEigenValueFromFile(self.eigen_value_fname))
        explained_var = eigen_value_ls / numpy.sum(eigen_value_ls)
        PC_data = self.getPCFromFile(self.eigen_vector_fname)
        PC_matrix = PC_data.PC_matrix
    else:
        max_no_of_snps = 10000
        no_of_snps = len(snpData.col_id_ls)
        if no_of_snps > max_no_of_snps:
            # 2008-12-01 randomly pick max_no_of_snps SNPs
            picked_col_index_ls = random.sample(range(no_of_snps), max_no_of_snps)
            sampledData = SNPData(row_id_ls=snpData.row_id_ls,
                                  col_id_ls=[snpData.col_id_ls[i] for i in picked_col_index_ls],
                                  strain_acc_list=snpData.strain_acc_list,
                                  category_list=snpData.category_list)
            sampledData.data_matrix = snpData.data_matrix[:, picked_col_index_ls]
            snpData = sampledData
        snpData, allele_index2allele_ls = snpData.convertSNPAllele2Index()
        explained_var = None
        PC_matrix = None

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    # row label is that of the SNP matrix, because the phenotype matrix is
    # gonna be re-ordered in that way
    phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list,
                       data_matrix=data_matrix_phen)
    # tricky, using strain_acc_list_phen
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        snpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

    phenotype_col_index = self.findOutWhichPhenotypeColumn(
        phenData, Set([self.phenotype_method_id]))[0]

    ecotype_info = getEcotypeInfo(db, self.country_order_type)

    # the offset below decides where the label of strains/snps should start
    # in axe_snp_matrix
    # 2008-11-14 only for PlotGroupOfSNPs.py. you can set it to 1 cuz we
    # don't draw axe_snp_matrix here.
    snp_id_label_y_offset = 0.95
    StrainID2PCAPosInfo = self.getStrainID2PCAPosInfo(
        snpData, pca_range=[0, 1], snp_id_label_y_offset=snp_id_label_y_offset,
        explained_var=explained_var, T=PC_matrix)

    # figure layout, in figure-fraction coordinates
    pca_bottom = 0.03
    pca_height = 0.45       # height of axe_strain_pca
    map_bottom = pca_bottom + pca_height
    map_height = 0.5        # height of axe_strain_map
    main_left = 0.05
    main_width = 0.8        # width of axe_strain_pca and axe_strain_map
    legend_left = main_left + 0.02 + main_width
    legend_width = 0.05     # width of the phenotype legend

    phenotype_method = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id)

    # color scale over the phenotype range, widened by a tenth on each side
    phenotype_cmap = mpl.cm.jet
    max_phenotype = numpy.nanmax(phenData.data_matrix[:, phenotype_col_index])  # nanmax ignores the nan elements
    min_phenotype = numpy.nanmin(phenData.data_matrix[:, phenotype_col_index])  # nanmin ignores the nan elements
    phenotype_jitter = (max_phenotype - min_phenotype) / 10.
    phenotype_norm = mpl.colors.Normalize(vmin=min_phenotype - phenotype_jitter,
                                          vmax=max_phenotype + phenotype_jitter)

    axe_map_phenotype_legend = pylab.axes([legend_left, pca_bottom, legend_width, 0.3],
                                          frameon=False)
    cb = mpl.colorbar.ColorbarBase(axe_map_phenotype_legend, cmap=phenotype_cmap,
                                   norm=phenotype_norm, orientation='vertical')
    cb.set_label('Legend Of Phenotype %s %s' % (phenotype_method.id, phenotype_method.short_name))

    axe_strain_map = pylab.axes([main_left, map_bottom, main_width, map_height], frameon=False)
    axe_strain_pca = pylab.axes([main_left, pca_bottom, main_width, pca_height], frameon=False)
    # cover both axe_strain_map and axe_strain_pca
    axe_strain_map_pca_cover = pylab.axes(
        [main_left, pca_bottom, main_width, pca_height + map_height],
        frameon=False, sharex=axe_strain_pca)
    axe_strain_map_pca_cover.set_yticks([])

    pca_xlim = [-0.05, 1.05]
    pca_ylim = [0, 1.05]
    # y range of the cover axes, set according to the height ratio
    cover_ylim = [0, (pca_height + map_height) / pca_height]
    axe_strain_pca.set_xlim(pca_xlim)
    axe_strain_pca.set_ylim(pca_ylim)
    axe_strain_map_pca_cover.set_ylim(cover_ylim)

    axe_strain_pca.grid(True, alpha=0.3)
    axe_strain_pca.set_xticks([])
    axe_strain_pca.set_yticks([])
    axe_strain_pca_legend = None    # no pca legend

    # customize a couple of things
    self.drawStrainPCA(axe_strain_pca, axe_strain_map, axe_strain_map_pca_cover,
                       axe_strain_pca_legend, StrainID2PCAPosInfo,
                       ecotype_info, phenData,
                       phenotype_col_index, phenotype_cmap, phenotype_norm,
                       rightmost_x_value=pca_xlim[1],
                       strain_color_type=2, pca2map_line_color=None, ecotype_width_on_map=10,
                       draw_lines_to_axe_snp_matrix=False, strain_size_on_axe_strain_pca=14,
                       pic_area=self.pic_area,
                       map_pca_line_alpha=0.2, map_pca_linewidth=0.2)

    # re-apply the limits after drawing
    axe_strain_pca.set_xlim(pca_xlim)
    axe_strain_pca.set_ylim(pca_ylim)
    axe_strain_map_pca_cover.set_ylim(cover_ylim)

    pylab.savefig('%s.png' % self.output_fname_prefix, dpi=400)
    pylab.savefig('%s.svg' % self.output_fname_prefix)

    self.plotLatLonPhenVsPC(ecotype_info, StrainID2PCAPosInfo, phenData, phenotype_col_index,
                            phenotype_cmap, phenotype_norm,
                            self.output_fname_prefix, commit=self.commit)
def plone_run(self, min_call_info_mismatch_rate=0.1):
    """
    Cross-match the selected call data against the QC dataset and let
    TwoSNPData.cal_row_id2pairwise_dist() submit the results to the
    database as it goes. Always returns None.

    The QC dataset is read from self.cmp_data_filename. If that is empty
    and QC_method_id is non-zero, the part after '=' of the QC method's
    data_description is used; if it is still empty the program exits
    with code 3.

    The call data comes from the call-info files selected by
    self.call_info_id_ls, or, when none of them is available and
    self.input_dir is set, from the SNP-by-strain matrix in
    self.input_dir. With neither, "No good arrays." is reported and None
    is returned early.

    History:
    2009-6-9 pass self.max_mismatch_rate, self.min_no_of_non_NA_pairs to
        TwoSNPData to filter entries stored in db.
    2009-4-13 add min_call_info_mismatch_rate
    2009-2-5 add "create_tables=False" to db.setup()
    2008-07-02 stop reading data when call_info_id2fname is empty and
        input_dir is null.
    2008-07-01 adjust to the newest functions in QC_250k.py
    2008-04-20 for plone to call it
    """
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                           user=self.user, passwd=self.passwd)
    curs = conn.cursor()
    self.curs = curs

    # database connection and etc
    db = Stock_250kDB.Stock_250kDB(username=self.user, password=self.passwd,
                                   hostname=self.hostname, database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    # if cmp_data_filename not specified, try to find it in the
    # data_description column in table QC_method.
    qm = QCMethod.query.get(self.QC_method_id)
    if not self.cmp_data_filename and self.QC_method_id != 0:
        if qm.data_description:
            data_description_ls = qm.data_description.split('=')
            if len(data_description_ls) > 1:
                self.cmp_data_filename = data_description_ls[1].strip()
    # after db query, cmp_data_filename is still nothing, exit program.
    if not self.cmp_data_filename and self.QC_method_id != 0:
        sys.stderr.write(
            "cmp_data_filename is still nothing even after db query. please specify it on the commandline.\n"
        )
        sys.exit(3)

    # the QC dataset; its category_list is not used
    header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
    # it's ecotypeid, cast it to integer to be compatible with the later
    # ecotype_id_ls from db
    strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
    snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list,
                       data_matrix=data_matrix,
                       snps_table=self.QC_method_id2snps_table.get(self.QC_method_id),
                       ignore_het=qm.ignore_het)

    if self.input_dir:
        # 04/22/08 Watch: call_info_id2fname here is fake, it's actually
        # keyed by (array_id, ecotypeid). no submission to db
        call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
    else:
        call_data = self.get_call_info_id2fname(
            db, self.QC_method_id, self.call_method_id,
            filter_calls_QCed=0, max_call_info_mismatch_rate=1,
            min_call_info_mismatch_rate=min_call_info_mismatch_rate,
            debug=self.debug)
        call_info_id2fname = call_data.call_info_id2fname
        call_info_ls_to_return = call_data.call_info_ls_to_return    # read but not used below

    # 2008-07-01 pick the call_info_ids to be handled
    picked_call_info_id2fname = {}
    for call_info_id_wanted in self.call_info_id_ls:
        if call_info_id_wanted in call_info_id2fname:
            picked_call_info_id2fname[call_info_id_wanted] = call_info_id2fname[call_info_id_wanted]
        elif self.report:
            sys.stderr.write("%s not in call_info_id2fname.\n" % (call_info_id_wanted))
    call_info_id2fname = picked_call_info_id2fname

    if call_info_id2fname:
        pdata = self.read_call_matrix(call_info_id2fname, self.min_probability)
        header = pdata.header
        call_info_id_ls = pdata.call_info_id_ls
        array_id_ls = pdata.array_id_ls
        ecotype_id_ls = pdata.ecotype_id_ls
        data_matrix = pdata.data_matrix
    elif self.input_dir:
        # 2008-07-02 input file is SNP by strain format. double header
        # (1st two lines)
        header, snps_name_ls, category_list, data_matrix = FilterStrainSNPMatrix.read_data(
            self.input_dir, double_header=1)
        ecotype_id_ls = header[0][2:]
        call_info_id_ls = header[1][2:]
        data_matrix = numpy.array(data_matrix).transpose()
        header = ['', ''] + snps_name_ls    # fake a header for SNPData
    else:
        # 2008-07-02
        sys.stderr.write("No good arrays.\n")
        return None

    # swap the ecotype_id_ls and call_info_id_ls when passing them to
    # SNPData. now strain_acc_list=ecotype_id_ls
    # snps_table is set to the stock_250k snps_table
    snpData1 = SNPData(header=header, strain_acc_list=ecotype_id_ls,
                       category_list=call_info_id_ls, data_matrix=data_matrix,
                       min_probability=self.min_probability,
                       call_method_id=self.call_method_id, col_id2id=None,
                       max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
                       snps_table='stock_250k.snps')

    # 2009-6-9 cross-matching results whose mismatch_rates are below
    # max_mismatch_rate would be put into db.
    twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs,
                            QC_method_id=self.QC_method_id, user=self.user,
                            row_matching_by_which_value=0, debug=self.debug,
                            max_mismatch_rate=self.max_mismatch_rate,
                            min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs)

    # 2008-05-01 create a cross match table temporarily
    twoSNPData.qc_cross_match_table = 'qc_cross_match'
    twoSNPData.new_QC_cross_match_table = self.new_QC_cross_match_table
    twoSNPData.cal_row_id2pairwise_dist()    # database submission is done along.

    # row_id2NA_mismatch_rate is never computed here
    return None
def prepareTwoSNPData(self, db, max_mismatch_rate=0.25, min_no_of_non_NA_pairs=40, report=0):
    """
    Build the TwoSNPData object pairing the 149SNP data (SNPData1) with the
    dataset it is compared against (SNPData2).

    The 149SNP matrix is read from self.input_fname when that is set,
    otherwise it is fetched from the database. When self.QC_method_id is 4
    the same SNPData object is used on both sides; otherwise the comparison
    data is read from the file resolved by self.findOutCmpDataFilename().

    Arguments:
        db: database object, forwarded to self.get_data_matrix().
        max_mismatch_rate, min_no_of_non_NA_pairs, report: passed to
            TwoSNPData unchanged.

    Returns the TwoSNPData instance.

    2009-9-23
        add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData.
        However it's useless to control what should be inserted into db because TwoSNPData.qc_cross_match_table is not defined
        and even if it's defined, the table it'll create doesn't concord to the one in 149SNP db.
    2008-09-10
        if self.input_fname is given, get 149SNP data from it , instead of database
    2008-8-28
        split out of run() so that MpiQC149CrossMatch could call this easily
    """
    if self.input_fname:
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    else:
        # The raw MySQLdb connection is only needed for the SNP index lookup,
        # so it is opened here (not for the file-based path) and closed as
        # soon as the lookup is done instead of being leaked.
        import MySQLdb
        from dbSNP2data import dbSNP2data
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd)
        try:
            curs = conn.cursor()
            snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
                curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
        finally:
            conn.close()
        strain_info_data = self.get_strain_id_info(self.QC_method_id, ignore_strains_with_qc=False)
        data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index, snp_id2index,
                                           StockDB.Calls.table.name)
        # tg_ecotypeid
        strain_acc_list = [strain_info_data.strain_id2acc[strain_id]
                           for strain_id in strain_info_data.strain_id_list]
        # strainid
        category_list = [strain_info_data.strain_id2category[strain_id]
                         for strain_id in strain_info_data.strain_id_list]
        header = ['ecotypeid', 'strainid']
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            header.append(snp_name)

    # snps_table is set to the stock (149SNP) snps table
    snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                       data_matrix=data_matrix, snps_table='stock.snps')
    if self.QC_method_id == 4:
        snpData2 = snpData1
    else:
        self.cmp_data_filename = self.findOutCmpDataFilename(self.cmp_data_filename, self.QC_method_id,
                                                             StockDB.QCMethod)
        header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
        # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db.
        # A list (not a lazy map object) is built so the result can be indexed and re-iterated.
        strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
        # category_list is not used to facilitate row-id matching
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)

    twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2,
                            QC_method_id=self.QC_method_id, user=self.db_user,
                            row_matching_by_which_value=0, debug=self.debug,
                            max_mismatch_rate=max_mismatch_rate,
                            min_no_of_non_NA_pairs=min_no_of_non_NA_pairs, report=report)
    return twoSNPData
def run(self):
    """
    Fetch strain (row) and target (column) information, build the QC data
    matrix from file or database, then write it to self.output_fname and/or
    draw it into self.fig_fname.

    Exits with status 3 when the resulting data matrix is empty.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = StockDB.StockDB(drivername=self.drivername, username=self.db_user,
                         password=self.db_passwd, hostname=self.hostname,
                         database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session

    qc_method_is_4 = self.QC_method_id == 4
    group_by_plate = self.how_to_group_strains in (2, 3)

    # --- assemble the SQL pieces -------------------------------------------
    # how to order strains.
    order_by_sentence = " order by c.longitude, c.latitude, e.longitude, e.latitude, e.nativename "
    geo_join_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s"
    geo_table_names = (StockDB.Ecotype.table.name, StockDB.Site.table.name,
                       StockDB.Address.table.name, StockDB.Country.table.name)
    if qc_method_is_4:
        sql_table_str = "from %s e, %s s, %s a, %s c" % geo_table_names
        common_where_condition = geo_join_condition + " " + order_by_sentence
        strain_select = "select distinct st.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s"
        strain_link = " and e.id=st.ecotypeid"
    else:
        sql_table_str = "from %s q, %s e, %s s, %s a, %s c" % (
            (StockDB.QCCrossMatch.table.name,) + geo_table_names)
        qc_filter = " and q.qc_method_id=%s and q.no_of_non_NA_pairs>=%s and q.mismatch_rate<=%s " % (
            self.QC_method_id, self.min_no_of_non_NAs, self.max_mismatch_rate)
        common_where_condition = geo_join_condition + qc_filter + order_by_sentence
        strain_select = "select distinct q.strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s"
        strain_link = " and e.id=st.ecotypeid and st.id=q.strainid"
    strain_id_info_query = strain_select % (
        sql_table_str, StockDB.Strain.table.name, common_where_condition % strain_link)

    # --- which strains/targets to include ----------------------------------
    if group_by_plate:
        plate_info = self.alignStrainsAccordingToSeqPlate(db)
    if self.input_fname and not group_by_plate:
        id_set_data = self.getStrainidTargetidFromFile(
            db, self.QC_method_id, self.input_fname,
            self.max_mismatch_rate, self.min_no_of_non_NAs)
    else:
        id_set_data = PassingData()
        id_set_data.strain_id_set = None
        id_set_data.target_id_set = None

    # --- row (strain) information ------------------------------------------
    if group_by_plate:
        strain_id_info = self.getStrainInfoGivenPlateInfo(
            db, plate_info, strain_id_info_query, strain_id_set=None)
    else:
        strain_id_info = self.getStrainIDInfo(db, strain_id_info_query,
                                              id_set_data.strain_id_set)

    # --- column (target) information ---------------------------------------
    if not qc_method_is_4:
        target_where_condition = common_where_condition % " and e.id=q.target_id"
        target_id_info_query = "select distinct e.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s %s" % (
            sql_table_str, target_where_condition)
        target_id_info = self.getStrainIDInfo(db, target_id_info_query)
    elif self.how_to_group_strains == 3:
        # 2008-09-15 column strain id is in country, strain-longitude order
        target_id_info = self.getStrainIDInfo(db, strain_id_info_query,
                                              id_set_data.strain_id_set)
    else:
        target_id_info = strain_id_info

    # --- data matrix ---------------------------------------------------------
    if self.input_fname:
        rdata = self.get_data_matrixFromFile(
            db, strain_id_info, target_id_info, self.QC_method_id,
            self.input_fname, self.max_mismatch_rate, self.min_no_of_non_NAs)
    else:
        rdata = self.get_data_matrix(
            db, strain_id_info, target_id_info, self.QC_method_id,
            self.max_mismatch_rate, self.min_no_of_non_NAs)
    rdata.data_matrix = self.markDataMatrixBoundary(rdata.data_matrix,
                                                    strain_id_info, target_id_info)

    row_label_ls = strain_id_info.strain_label_ls
    col_label_ls = target_id_info.strain_label_ls
    header = ['strain info', ''] + col_label_ls
    category_list = [1] * len(row_label_ls)

    if SNPData.isDataMatrixEmpty(rdata.data_matrix):
        sys.stderr.write("Nothing fetched from database.\n")
        sys.exit(3)

    # --- output --------------------------------------------------------------
    if self.output_fname:
        write_data_matrix(rdata.data_matrix, self.output_fname, header,
                          row_label_ls, category_list)
    if self.fig_fname:
        font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01

        def value2color_func(x):
            # colour scale spans the min/max found in the fetched data
            return Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)

        im_legend = drawContinousLegend(rdata.min_value, rdata.max_value,
                                        self.no_of_ticks, value2color_func, font)
        im = drawMatrix(rdata.data_matrix, value2color_func,
                        strain_id_info.strain_label_ls,
                        target_id_info.strain_label_ls, with_grid=1, font=font)
        im = combineTwoImages(im, im_legend, font=font)
        im.save(self.fig_fname)
def run(self):
    """
    Fetch strain (row) and target (column) information, build the QC data
    matrix from file or database, then write it to self.output_fname and/or
    draw it into self.fig_fname.

    Exits with status 3 when the resulting data matrix is empty.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = StockDB.StockDB(drivername=self.drivername, username=self.db_user,
                         password=self.db_passwd, hostname=self.hostname,
                         database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session

    qc_method_is_4 = self.QC_method_id == 4
    group_by_plate = self.how_to_group_strains in (2, 3)

    # --- assemble the SQL pieces -------------------------------------------
    # how to order strains.
    order_by_sentence = " order by c.longitude, c.latitude, e.longitude, e.latitude, e.nativename "
    geo_join_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s"
    geo_table_names = (StockDB.Ecotype.table.name, StockDB.Site.table.name,
                       StockDB.Address.table.name, StockDB.Country.table.name)
    if qc_method_is_4:
        sql_table_str = "from %s e, %s s, %s a, %s c" % geo_table_names
        common_where_condition = geo_join_condition + " " + order_by_sentence
        strain_select = "select distinct st.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s"
        strain_link = " and e.id=st.ecotypeid"
    else:
        sql_table_str = "from %s q, %s e, %s s, %s a, %s c" % (
            (StockDB.QCCrossMatch.table.name,) + geo_table_names)
        qc_filter = " and q.qc_method_id=%s and q.no_of_non_NA_pairs>=%s and q.mismatch_rate<=%s " % (
            self.QC_method_id, self.min_no_of_non_NAs, self.max_mismatch_rate)
        common_where_condition = geo_join_condition + qc_filter + order_by_sentence
        strain_select = "select distinct q.strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s"
        strain_link = " and e.id=st.ecotypeid and st.id=q.strainid"
    strain_id_info_query = strain_select % (
        sql_table_str, StockDB.Strain.table.name, common_where_condition % strain_link)

    # --- which strains/targets to include ----------------------------------
    if group_by_plate:
        plate_info = self.alignStrainsAccordingToSeqPlate(db)
    if self.input_fname and not group_by_plate:
        id_set_data = self.getStrainidTargetidFromFile(
            db, self.QC_method_id, self.input_fname,
            self.max_mismatch_rate, self.min_no_of_non_NAs)
    else:
        id_set_data = PassingData()
        id_set_data.strain_id_set = None
        id_set_data.target_id_set = None

    # --- row (strain) information ------------------------------------------
    if group_by_plate:
        strain_id_info = self.getStrainInfoGivenPlateInfo(
            db, plate_info, strain_id_info_query, strain_id_set=None)
    else:
        strain_id_info = self.getStrainIDInfo(db, strain_id_info_query,
                                              id_set_data.strain_id_set)

    # --- column (target) information ---------------------------------------
    if not qc_method_is_4:
        target_where_condition = common_where_condition % " and e.id=q.target_id"
        target_id_info_query = "select distinct e.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s %s" % (
            sql_table_str, target_where_condition)
        target_id_info = self.getStrainIDInfo(db, target_id_info_query)
    elif self.how_to_group_strains == 3:
        # 2008-09-15 column strain id is in country, strain-longitude order
        target_id_info = self.getStrainIDInfo(db, strain_id_info_query,
                                              id_set_data.strain_id_set)
    else:
        target_id_info = strain_id_info

    # --- data matrix ---------------------------------------------------------
    if self.input_fname:
        rdata = self.get_data_matrixFromFile(
            db, strain_id_info, target_id_info, self.QC_method_id,
            self.input_fname, self.max_mismatch_rate, self.min_no_of_non_NAs)
    else:
        rdata = self.get_data_matrix(
            db, strain_id_info, target_id_info, self.QC_method_id,
            self.max_mismatch_rate, self.min_no_of_non_NAs)
    rdata.data_matrix = self.markDataMatrixBoundary(rdata.data_matrix,
                                                    strain_id_info, target_id_info)

    row_label_ls = strain_id_info.strain_label_ls
    col_label_ls = target_id_info.strain_label_ls
    header = ['strain info', ''] + col_label_ls
    category_list = [1] * len(row_label_ls)

    if SNPData.isDataMatrixEmpty(rdata.data_matrix):
        sys.stderr.write("Nothing fetched from database.\n")
        sys.exit(3)

    # --- output --------------------------------------------------------------
    if self.output_fname:
        write_data_matrix(rdata.data_matrix, self.output_fname, header,
                          row_label_ls, category_list)
    if self.fig_fname:
        font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01

        def value2color_func(x):
            # colour scale spans the min/max found in the fetched data
            return Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)

        im_legend = drawContinousLegend(rdata.min_value, rdata.max_value,
                                        self.no_of_ticks, value2color_func, font)
        im = drawMatrix(rdata.data_matrix, value2color_func,
                        strain_id_info.strain_label_ls,
                        target_id_info.strain_label_ls, with_grid=1, font=font)
        im = combineTwoImages(im, im_legend, font=font)
        im.save(self.fig_fname)
def run(self):
    """
    Plot overlaid, normalised histograms of the phenotype values returned by
    self.get_phenotype_ls() for the ancestral and the derived allele, and
    save the figure as '<self.output_fname_prefix>.svg' when a prefix is set.

    The results_method entry is looked up by call_method_id,
    analysis_method_id and phenotype_method_id. If several match, a warning
    is written and the first one is used; if none matches the program exits
    with status 3.
    """
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.user,
                                   password=self.passwd, hostname=self.hostname,
                                   database=self.dbname)
    db.setup(create_tables=False)
    session = db.session

    if self.debug:
        import pdb
        pdb.set_trace()
    chr_pos2ancestral_allele = self.get_chr_pos2ancestral_allele(self.ancestral_allele_fname)
    pheno_data = SNPData(input_fname=self.phenotype_fname, turn_into_integer=0, ignore_2nd_column=1)
    pheno_data = self.process_phenotype_data(pheno_data)

    geno_data = SNPData(input_fname=self.genotype_fname, turn_into_array=1,
                        matrix_data_type=int, ignore_2nd_column=1)

    query = Stock_250kDB.ResultsMethod.query.filter_by(
        call_method_id=self.call_method_id).filter_by(
        analysis_method_id=self.analysis_method_id).filter_by(
        phenotype_method_id=self.phenotype_method_id)
    # Count once: every count() is a database round trip, and two separate
    # counts could disagree if rows change in between.
    no_of_results = query.count()
    method_id_tup = (self.call_method_id, self.analysis_method_id, self.phenotype_method_id)
    if no_of_results < 1:
        sys.stderr.write(
            "Error: no results_method for call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n"
            % method_id_tup)
        sys.exit(3)
    if no_of_results > 1:
        sys.stderr.write(
            "Warning: more than 1 results_method for call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n"
            % method_id_tup)
    rm = query.first()

    phenotype_ls_data = self.get_phenotype_ls(rm, self.no_of_top_snps, chr_pos2ancestral_allele,
                                              pheno_data, geno_data, self.min_MAF,
                                              results_directory=self.input_dir)

    import pylab
    pylab.clf()
    hist_patch_ls = []
    legend_ls = []
    # (phenotype values, legend label, extra keyword arguments for pylab.hist)
    hist_spec_ls = [
        (phenotype_ls_data.ancestral_allele_phenotype_ls, 'ancestral allele', {}),
        (phenotype_ls_data.derived_allele_phenotype_ls, 'derived allele', {'facecolor': 'r'}),
    ]
    for phenotype_ls, legend_label, extra_kwargs in hist_spec_ls:
        # a histogram is only drawn when there are more than 2 values
        if len(phenotype_ls) > 2:
            # NOTE(review): 'normed' was removed in Matplotlib 3.1 ('density' replaces it);
            # kept as-is for the matplotlib version this file was written against.
            hist_result = pylab.hist(phenotype_ls, 100, alpha=0.4, normed=1, **extra_kwargs)
            hist_patch_ls.append(hist_result[2][0])  # first patch in all patches of a histogram
            legend_ls.append(legend_label)
    pylab.legend(hist_patch_ls, legend_ls)
    if self.output_fname_prefix:
        pylab.savefig('%s.svg' % self.output_fname_prefix, dpi=300)
def outputArray(cls, session, curs, output_dir=None, array_info_table=None, snps=None, \
                probes=None, array_id_ls=[], \
                xy_ls=[], chr_pos_ls=[], probes_id_ls=[],\
                call_method_id=0, run_type=1, array_file_directory=None, outputCNVIntensity=True,\
                returnArrayIntensityData=False):
    """
    Read array files through R (rpy + the 'affy' library) and extract probe
    intensities.

    run_type:
        1: write one '<array_id>_array_intensity.tsv' per array into output_dir
            (SNP probe intensity). Arrays whose output file already exists are skipped.
        2: collect log10 intensity of the probes at xy_ls into a probe X array matrix
            (CNV probe) and, if outputCNVIntensity, write it to
            'call_method_<call_method_id>_CNV_intensity.tsv' in output_dir.
        3: store the median intensity of each array in its ArrayInfo entry via session.add().
        Anything else: write an error and exit with status 3.

    Returns a SNPData (rows=xy_ls, columns='<array_id>_<ecotype_id>' labels)
    when returnArrayIntensityData is true and run_type is 2. For other run
    types no matrix exists, so a warning is written and None is returned.
    Returns None when returnArrayIntensityData is false.

    The list-typed default arguments are never modified here.

    2010-5-10
        curs could be elixirdb.metadata.bind or MySQLdb.connect
    2010-5-5
        changed to classmethod
        add argument outputCNVIntensity: whether to output CNV intensity data, default=True.
            returnArrayIntensityData: whether return array CNV intensity data in a SNPData structure
    2009-10-9
        add argument array_file_directory.
    2009-3-11
        add run_type=3
            calculate intensity medium of all probes in the array and store the value in db
        array_id_ls is a list of array_ids in str type
    2009-3-5
        skip if no probes (if one_snp.probes_id_ls == [-1]*4:) for that SNP (fake SNP in the SNP table)
    2008-12-09
        add option run_type
    2008-07-12
        add option array_id
    2008-04-08
    """
    sys.stderr.write("Outputting arrays ... \n")
    import rpy
    rpy.r.library('affy')

    array_width = None
    # Stays None unless run_type is 2, so the return path below can detect
    # that there is no matrix instead of hitting an unbound name.
    data_matrix = None
    if run_type != 3 and output_dir and not os.path.isdir(output_dir):  # 2010-5-5 test if output_dir is something
        os.makedirs(output_dir)

    sql_query = cls.generateSQLQueryToGetArrays(array_info_table, array_id_ls=array_id_ls,
                                                call_method_id=call_method_id, run_type=run_type)
    print(sql_query)
    rows = curs.execute(sql_query)
    is_elixirdb = 1  # 2010-5-10 By default, assume curs is elixirdb.metadata.bind
    if hasattr(curs, 'fetchall'):  # 2010-5-10 curs is MySQLdb.connect
        rows = curs.fetchall()
        is_elixirdb = 0
        no_of_objects = len(rows)
    else:
        no_of_objects = int(rows.rowcount)

    if run_type == 2:  # 2008-12-09 don't initialize the data_matrix if run_type is not 2 (CNV probe).
        data_matrix = numpy.zeros([len(probes_id_ls), no_of_objects], numpy.float32)
    array_id_avail_ls = []
    array_label_ls = []
    # enumerate() keeps the progress counter (and, for run_type 2, the matrix
    # column) in step with the row even when an array is skipped below.
    for i, row in enumerate(rows):
        if is_elixirdb:
            array_id = row.array_id
            filename = row.filename
            ecotype_id = row.maternal_ecotype_id
        else:
            array_id, filename, ecotype_id = row[:3]
        array_id_avail_ls.append(array_id)
        array_label_ls.append('%s_%s' % (array_id, ecotype_id))

        if array_file_directory and os.path.isdir(array_file_directory):
            # look for the file (same base name) in the alternative directory
            filename = os.path.join(array_file_directory, os.path.split(filename)[1])

        sys.stderr.write("\t%d/%d: Extracting intensity from %s ... \n" % (i + 1, no_of_objects, filename))

        if run_type == 1:  # output SNP probe intensity within the loop
            output_fname = os.path.join(output_dir, '%s_array_intensity.tsv' % (array_id))
            if os.path.isfile(output_fname):
                sys.stderr.write("\tFile %s already exists. Ignore.\n" % (output_fname))
                continue

        # read array by calling R
        if array_width is None:
            returnData = cls.getArrayWidth(filename)
            intensity_array = returnData.intensity_array
            array = returnData.array
            array_width = returnData.array_width
        else:
            array = rpy.r.read_affybatch(filenames=filename)
            intensity_array = rpy.r.intensity(array)  # return a lengthX1 2-Dimensional array.

        if run_type == 2:  # CNV probe
            for j, (xpos, ypos) in enumerate(xy_ls):
                intensity_array_index = array_width * (array_width - xpos - 1) + ypos
                data_matrix[j][i] = math.log10(intensity_array[intensity_array_index][0])
        elif run_type == 1:  # SNP probe intensity
            # 'with' guarantees the file is flushed and closed even on error
            with open(output_fname, 'w') as outf:
                writer = csv.writer(outf, delimiter='\t')
                header = ['SNP_ID'] + ['%s_%s' % (array_id, probe_label) for probe_label in
                                       ('sense1', 'sense2', 'antisense1', 'antisense2')]
                writer.writerow(header)
                for snps_id in snps.snps_id_ls:
                    one_snp = snps.get_one_snp(snps_id)
                    if one_snp.probes_id_ls == [-1] * 4:
                        # 2009-3-5 skip if no probes for that SNP (fake SNP in the SNP table)
                        continue
                    output_row = [one_snp.snpid]
                    for probes_id in one_snp.probes_id_ls:
                        one_probe = probes.get_one_probe(probes_id)
                        intensity_array_index = array_width * (array_width - one_probe.xpos - 1) + one_probe.ypos
                        output_row.append(intensity_array[intensity_array_index][0])
                    writer.writerow(output_row)
        elif run_type == 3:  # calculate the intensity medium of all probes and store into db
            median_intensity = numpy.median(intensity_array)
            array_info_entry = Stock_250kDB.ArrayInfo.get(array_id)
            array_info_entry.median_intensity = median_intensity
            session.add(array_info_entry)
        else:
            sys.stderr.write("Error: run_type %s is not supported.\n" % run_type)
            sys.exit(3)
        del intensity_array, array

    if run_type == 2 and outputCNVIntensity:
        # 2008-11-13 output in Roger's multi-sample format
        header = ['probes_id'] + array_id_avail_ls + ['chromosome', 'position']
        output_fname = os.path.join(output_dir, 'call_method_%s_CNV_intensity.tsv' % (call_method_id))
        with open(output_fname, 'w') as outf:
            writer = csv.writer(outf, delimiter='\t')
            writer.writerow(header)
            for row_index, intensity_row in enumerate(data_matrix):
                writer.writerow([probes_id_ls[row_index]] + list(intensity_row) + list(chr_pos_ls[row_index]))
    sys.stderr.write("Done.\n")

    if returnArrayIntensityData:  # 2010-5-5
        if data_matrix is None:
            sys.stderr.write("Warning: no intensity matrix is collected for run_type %s. Nothing to return.\n" % run_type)
            return None
        arrayIntensityData = SNPData(row_id_ls=xy_ls, col_id_ls=array_label_ls, data_matrix=data_matrix)
        return arrayIntensityData
def getCNVQCMatrix(self, probe_id2snp_id_ls, snp_id2tup, snpData, SNP2Col_allele, cnvIntensityData):
    """
    Count, for every (row of snpData, CNV probe) pair, how many SNPs under the
    probe are mismatches, insertions or deletions relative to SNP2Col_allele.

    Arguments:
        probe_id2snp_id_ls: dict, probe_id -> list of (snp_id, disp_pos). probe_id is
            indexable; its first two items form the column label '<0>_<1>' looked up
            in cnvIntensityData.
        snp_id2tup: dict, snp_id -> tuple whose 3rd item is the offset (non-zero
            offset is counted as an insertion unless the allele is -1).
        snpData: SNPData with row_id_ls, col_id2col_index and data_matrix.
        SNP2Col_allele: dict, snp_id -> reference allele.
        cnvIntensityData: SNPData-like with row_id2row_index, col_id2col_index, data_matrix.

    Returns a PassingData with mismatchData, insertionData, deletionData, qcData
    (SNPData objects, columns = sorted probe ids) and plotData (parallel lists,
    one entry per non-NA (row, probe) pair). Matrix cells keep the initial
    value -2 when the row is absent from cnvIntensityData or all alleles under
    the probe are -2 or 0.

    2009-2-12
    """
    sys.stderr.write("Getting CNV QC matricies ...")
    matrix_shape = (len(snpData.row_id_ls), len(probe_id2snp_id_ls))

    def newMatrix():
        # Built-in int is used as dtype: the numpy.int alias it replaces was
        # only ever a synonym for it and no longer exists in current NumPy.
        matrix = numpy.zeros(matrix_shape, int)
        matrix[:] = -2
        return matrix

    mismatch_matrix = newMatrix()
    insertion_matrix = newMatrix()
    deletion_matrix = newMatrix()
    qc_matrix = newMatrix()

    # sorted() works on dict keys in both Python 2 and 3
    cnv_probe_ls = sorted(probe_id2snp_id_ls)
    cnv_probe2index = dict(zip(cnv_probe_ls, range(len(cnv_probe_ls))))

    total_disp_pos_ls = []
    total_intensity_ls = []
    total_mismatch_ls = []
    total_insertion_ls = []
    total_deletion_ls = []
    total_mis_ls = []
    for i, row_id in enumerate(snpData.row_id_ls):
        if row_id not in cnvIntensityData.row_id2row_index:
            continue
        cnv_row_index = cnvIntensityData.row_id2row_index[row_id]
        for probe_id, snp_id_ls in probe_id2snp_id_ls.items():
            col_index = cnv_probe2index[probe_id]
            probe_id_label = '%s_%s' % (probe_id[0], probe_id[1])
            cnv_col_index = cnvIntensityData.col_id2col_index[probe_id_label]
            no_of_mismatches = 0
            no_of_deletions = 0
            no_of_insertions = 0
            is_this_probe_NA = 1
            disp_pos_ls = []
            for snp_id, disp_pos in snp_id_ls:
                snp_id_tup = snp_id2tup[snp_id]
                disp_pos_ls.append(disp_pos)
                snp_col_index = snpData.col_id2col_index[snp_id]
                allele = snpData.data_matrix[i][snp_col_index]
                col_allele = SNP2Col_allele[snp_id]
                if allele == -2 or allele == 0:
                    # no call for this SNP; it does not contribute to any count
                    continue
                is_this_probe_NA = 0
                if snp_id_tup[2] != 0:  # the offset is not 0
                    if allele != -1:  # if it's deleted, then it's nothing
                        no_of_insertions += 1
                elif allele == -1:
                    no_of_deletions += 1
                elif col_allele == -2 or col_allele == 0:
                    sys.stderr.write("allele for this accession %s at snp %s is %s while reference allele is NA: %s.\n"%\
                        (snpData.row_id_ls[i], snp_id, allele, col_allele))
                elif allele != col_allele:
                    no_of_mismatches += 1
            if not is_this_probe_NA:
                mean_disp_pos = numpy.mean(disp_pos_ls)
                total_mis_count = no_of_mismatches + no_of_insertions + no_of_deletions
                mismatch_matrix[i][col_index] = no_of_mismatches
                insertion_matrix[i][col_index] = no_of_insertions
                deletion_matrix[i][col_index] = no_of_deletions
                qc_matrix[i][col_index] = total_mis_count
                total_disp_pos_ls.append(mean_disp_pos)
                total_intensity_ls.append(cnvIntensityData.data_matrix[cnv_row_index][cnv_col_index])
                total_mismatch_ls.append(no_of_mismatches)
                total_insertion_ls.append(no_of_insertions)
                total_deletion_ls.append(no_of_deletions)
                total_mis_ls.append(total_mis_count)

    plotData = PassingData(total_disp_pos_ls=total_disp_pos_ls, total_intensity_ls=total_intensity_ls,
                           total_mismatch_ls=total_mismatch_ls, total_insertion_ls=total_insertion_ls,
                           total_deletion_ls=total_deletion_ls, total_mis_ls=total_mis_ls)
    mismatchData = SNPData(row_id_ls=snpData.row_id_ls, col_id_ls=cnv_probe_ls, data_matrix=mismatch_matrix)
    insertionData = SNPData(row_id_ls=snpData.row_id_ls, col_id_ls=cnv_probe_ls, data_matrix=insertion_matrix)
    deletionData = SNPData(row_id_ls=snpData.row_id_ls, col_id_ls=cnv_probe_ls, data_matrix=deletion_matrix)
    qcData = SNPData(row_id_ls=snpData.row_id_ls, col_id_ls=cnv_probe_ls, data_matrix=qc_matrix)
    sys.stderr.write("Done.\n")
    return PassingData(mismatchData=mismatchData, insertionData=insertionData,
                       deletionData=deletionData, qcData=qcData, plotData=plotData)
def doFilter(self, snpData, snpData_qc_strain, snpData_qc_snp, min_call_probability, max_call_mismatch_rate, max_call_NA_rate,\
    max_snp_mismatch_rate, max_snp_NA_rate, npute_window_size , output_dir=None):
    """
    Filter snpData by row/column mismatch rate (against the QC datasets) and
    NA rate, merge it with snpData_qc_snp, drop monomorphic columns, then
    impute with NPUTE.samplingImpute() when more than 5 rows are left.

    When output_dir is given, the matrix is written there before and (if
    imputation ran) after imputation; the file names encode all thresholds.

    Returns a one-element list holding a PassingData with the filter counts,
    the pre-imputation mismatch rates (..._rate0), the post-imputation
    mismatch rates (..._rate1, only set when imputation ran) and the
    thresholds used.

    2009-10-11
        replace imputeData() with NPUTE.samplingImpute(..., no_of_accessions_per_sampling=300, coverage=3) to avoid memory blowup.
    2008-12-22
        replace '=' and ',' with '_' in the output filename
    2008-05-19
        matrix_ls has to be of length >0 before concatenation
    2008-05-19
        use SNPData structure
    2008-05-18
        add onlyCommon=True to FilterAccessions.filterByError()
    2008-05-17
        add argument output_dir. if it's available, output data matrix before and after imputation
    2008-05-12
        add
            qcdata.no_of_accessions_filtered_by_mismatch
            qcdata.no_of_accessions_filtered_by_na
            qcdata.no_of_snps_filtered_by_mismatch
            qcdata.no_of_snps_filtered_by_na
            qcdata.no_of_monomorphic_snps_removed
    2008-05-11
        split up from computing_node_handler
    """
    def outputFname(suffix):
        # file name = all thresholds joined by '_' + suffix, inside output_dir
        labelled_threshold_ls = [
            ('min_oligo_call_probability', min_call_probability),
            ('max_array_mismatch_rate', max_call_mismatch_rate),
            ('max_array_NA_rate', max_call_NA_rate),
            ('max_snp_mismatch_rate', max_snp_mismatch_rate),
            ('max_snp_NA_rate', max_snp_NA_rate),
            ('npute_window_size', npute_window_size),
        ]
        name_part_ls = [label + ('_%s' % threshold) for label, threshold in labelled_threshold_ls]
        name_part_ls.append(suffix)
        return os.path.join(output_dir, '_'.join(name_part_ls))

    qcdata = PassingData()

    # rows (accessions): mismatch rate against the strain QC data, then NA rate.
    # The TwoSNPData temporaries are not bound to a name so they are released
    # right after the comparison.
    row_id2NA_mismatch_rate = TwoSNPData(SNPData1=snpData, SNPData2=snpData_qc_strain,
                                         row_matching_by_which_value=0, debug=self.debug).cmp_row_wise()
    filteredData = SNPData.removeRowsByMismatchRate(snpData, row_id2NA_mismatch_rate, max_call_mismatch_rate)
    qcdata.no_of_accessions_filtered_by_mismatch = filteredData.no_of_rows_removed
    filteredData = SNPData.removeRowsByNARate(filteredData, max_call_NA_rate)
    qcdata.no_of_accessions_filtered_by_na = filteredData.no_of_rows_removed

    # columns (SNPs): mismatch rate against the SNP QC data, then NA rate
    col_id2NA_mismatch_rate = TwoSNPData(SNPData1=filteredData, SNPData2=snpData_qc_snp,
                                         row_matching_by_which_value=0, debug=self.debug).cmp_col_wise()
    filteredData = SNPData.removeColsByMismatchRate(filteredData, col_id2NA_mismatch_rate, max_snp_mismatch_rate)
    qcdata.no_of_snps_filtered_by_mismatch = filteredData.no_of_cols_filtered_by_mismatch
    filteredData = SNPData.removeColsByNARate(filteredData, max_snp_NA_rate)
    qcdata.no_of_snps_filtered_by_na = filteredData.no_of_cols_filtered_by_na

    # merge with the SNP QC data, then remove monomorphic columns
    filteredData = TwoSNPData(SNPData1=filteredData, SNPData2=snpData_qc_snp,
                              row_matching_by_which_value=0, debug=self.debug).mergeTwoSNPData(priority=2)
    filteredData = SNPData.removeMonomorphicCols(filteredData)
    qcdata.no_of_monomorphic_snps_removed = filteredData.no_of_monomorphic_cols

    if output_dir:  # output data here
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        filteredData.tofile(outputFname('before_imputation.tsv'))

    # mismatch rates before imputation
    preImputationComparison = TwoSNPData(SNPData1=filteredData, SNPData2=snpData_qc_strain,
                                         row_matching_by_which_value=0)
    row_id2NA_mismatch_rate0 = preImputationComparison.cmp_row_wise()
    col_id2NA_mismatch_rate0 = preImputationComparison.cmp_col_wise()
    del preImputationComparison

    if len(filteredData.row_id_ls) > 5:
        # 2009-10-8 use NPUTE.samplingImpute()
        imputed_matrix, new_snps_name_ls = NPUTE.samplingImpute(
            filteredData.col_id_ls, filteredData.data_matrix,
            input_file_format=1, input_NA_char=0, lower_case_for_imputation=False,
            npute_window_size=int(npute_window_size),
            no_of_accessions_per_sampling=300, coverage=3)
        imputedData = SNPData(row_id_ls=filteredData.row_id_ls, col_id_ls=new_snps_name_ls,
                              data_matrix=imputed_matrix)
        if output_dir:  # 2008-05-16 write the data out if output_fname is available
            imputedData.tofile(outputFname('after_imputation.tsv'))
        postImputationComparison = TwoSNPData(SNPData1=imputedData, SNPData2=snpData_qc_strain,
                                              row_matching_by_which_value=0)
        qcdata.row_id2NA_mismatch_rate1 = postImputationComparison.cmp_row_wise()
        qcdata.col_id2NA_mismatch_rate1 = postImputationComparison.cmp_col_wise()
        del postImputationComparison, imputedData
    del filteredData

    qcdata.row_id2NA_mismatch_rate0 = row_id2NA_mismatch_rate0
    qcdata.col_id2NA_mismatch_rate0 = col_id2NA_mismatch_rate0
    qcdata.min_call_probability = min_call_probability
    qcdata.max_call_mismatch_rate = max_call_mismatch_rate
    qcdata.max_call_NA_rate = max_call_NA_rate
    qcdata.max_snp_mismatch_rate = max_snp_mismatch_rate
    qcdata.max_snp_NA_rate = max_snp_NA_rate
    qcdata.npute_window_size = npute_window_size
    return [qcdata]
def loadDataStructure(self, db_250k=None, association_locus_id=None, association_landscape_type_id=None, \
    locusExtensionDistance=5000,\
    data_dir=None, list_type_id_list=None, gene_annotation_pickleFname=None, \
    snpInfoPickleFname=None, locus_type_id=1, snp_matrix_fname=None, snp_matrix_data_type=None, \
    phenotype_fname=None):
    """
    Load everything needed for one association locus: the GWAS landscapes of
    its peaks (restricted to the locus extended by locusExtensionDistance on
    both sides), gene annotation, optional SNP info, candidate genes and,
    when both snp_matrix_fname and phenotype_fname are given, the SNP and
    phenotype matrices plus ecotype info.

    Exits with status 3 if the SNP matrix file yields no data matrix.
    Returns a PassingData bundling all of the above.

    2012.11.14
    """
    sys.stderr.write("Fetching GWAS landscape for association-locus %s, landscape type %s ..."%(association_locus_id, association_landscape_type_id))
    # fetch the associationLocus
    associationLocus = Stock_250kDB.AssociationLocus.get(association_locus_id)
    associationLandscapeType = Stock_250kDB.AssociationLandscapeType.get(association_landscape_type_id)

    # fetch landscape within this interval
    start = max(1, associationLocus.start - locusExtensionDistance)
    stop = associationLocus.stop + locusExtensionDistance
    # report controls whether getResultMethodContent() will report progress.
    landscape_fetch_params = PassingData(min_MAF=associationLandscapeType.min_MAF, data_dir=data_dir,
                                         need_chr_pos_ls=0, chromosome=associationLocus.chromosome,
                                         start=start, stop=stop, report=False)

    # fetch all result-peaks; each landscape is loaded only once
    landscape_gwr_ls = []
    seen_landscape_id_set = set()
    for association_peak in associationLocus.association_peak_ls:
        association_landscape = db_250k.getAssociationLandscape(
            result_id=association_peak.result_id,
            association_landscape_type_id=associationLandscapeType.id)
        if not association_landscape or association_landscape.id in seen_landscape_id_set:
            continue
        seen_landscape_id_set.add(association_landscape.id)
        landscape_gwr_ls.append(
            db_250k.getResultMethodContent(association_landscape=association_landscape, data_dir=data_dir,
                                           construct_chr_pos2index=True, pdata=landscape_fetch_params))
        sys.stderr.write(" %s%s "%('\x08'*80, len(landscape_gwr_ls)))
    sys.stderr.write("%s landscapes.\n"%(len(landscape_gwr_ls)))

    centralLocus = SNPPassingData(chromosome=associationLocus.chromosome, position=start,
                                  snps_id=associationLocus.id, start=start, stop=stop, fileNamePrefix="")
    LD_info = None
    gene_annotation = DrawSNPRegion.dealWithGeneAnnotation(gene_annotation_pickleFname)
    snp_info = None
    if snpInfoPickleFname:
        snp_info = db_250k.dealWithSNPInfo(snpInfoPickleFname, locus_type_id=locus_type_id)  # 2012.3.8

    candidate_gene_set = set()
    for list_type_id in (list_type_id_list or []):
        candidate_gene_set.update(db_250k.getGeneList(list_type_id))

    snpData = phenData = ecotype_info = None
    if snp_matrix_fname and phenotype_fname:
        # 2009-3-23 float is for CNV amplitude file
        matrix_data_type = float if snp_matrix_data_type == 3 else int
        snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1, turn_into_array=1,
                          ignore_2nd_column=1, matrix_data_type=matrix_data_type)
        if snpData.data_matrix is None:
            sys.stderr.write("Error. snpData.data_matrix is None.\n")
            sys.exit(3)
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            phenotype_fname, turn_into_integer=0)
        # row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
        phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list,
                           data_matrix=data_matrix_phen)
        # tricky, using strain_acc_list_phen
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

        # 2008-12-05 fake a snp_info for findSNPsInRegion
        DrawSNPRegion.construct_chr_pos2index_forSNPData(snpData, snp_info=snp_info)
        ecotype_info = getEcotypeInfo(db_250k)

    return PassingData(associationLocus=associationLocus, associationLandscapeType=associationLandscapeType,
                       landscape_gwr_ls=landscape_gwr_ls,
                       gene_annotation=gene_annotation, snp_info=snp_info, LD_info=LD_info,
                       candidate_gene_set=candidate_gene_set, snpData=snpData, phenData=phenData,
                       ecotype_info=ecotype_info, centralLocus=centralLocus)
def run(self):
    """
    2008-09-06
        Entry point: either a single-process dry run (debug mode) or the MPI
        input/computing/output node workflow.

        Debug mode: the SNP matrix, the pickled SNP context and the phenotype
        matrix are loaded in this one process, the phenotype matrix is re-ordered
        to the row order of the SNP matrix, everything is pickled once and the
        program exits with status 2.

        MPI mode: rank 0 is the input node, the last rank is the output node and
        every rank in between is a computing node.
            1. rank 0 calls self.inputNodePrepare() and sends the pickled data to
               the output node and to each computing node
            2. all nodes synchronize
            3. each node enters the loop that belongs to its role
            4. all nodes synchronize again, to avoid some node exiting early
    """
    if self.debug:
        #for one-node testing purpose
        # NOTE(review): the testing block is taken to run up to sys.exit(2);
        #   confirm against the original indentation.
        import pdb
        pdb.set_trace()
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
        #category_list is not used to facilitate row-id matching
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                          data_matrix=data_matrix, turn_into_array=1)

        # close the pickle file explicitly instead of relying on garbage collection
        picklef = open(self.snps_context_fname)
        try:
            snps_context_wrapper = cPickle.load(picklef)
        finally:
            picklef.close()
        gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
        gene_id_ls = sorted(gene_id2snps_id_ls.keys())

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = \
            read_data(self.phenotype_fname, turn_into_integer=0)
        #row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
        phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list,
                           data_matrix=data_matrix_phen)
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)

        other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls,
                                 gene_id_ls=gene_id_ls, phenData=phenData)
        # the pickles are built only to exercise pickling; nothing reads them before the exit
        other_data_pickle = cPickle.dumps(other_data, -1)
        phenotype_label_ls_pickle = cPickle.dumps(phenData.col_id_ls, -1)
        snpData_pickle = cPickle.dumps(snpData, -1)
        sys.exit(2)

    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    free_computing_nodes = range(1, self.communicator.size-1)    #exclude the 1st and last node
    free_computing_node_set = Set(free_computing_nodes)
    output_node_rank = self.communicator.size-1

    # phase 1: distribute the initial data
    if node_rank == 0:
        dstruc = self.inputNodePrepare()
        params_ls = dstruc.params_ls
        #send the output node the phenotype_label_ls
        self.communicator.send(dstruc.output_node_data_pickle, output_node_rank, 0)
        del dstruc.output_node_data_pickle

        for node in free_computing_nodes:    #send it to the computing_node
            sys.stderr.write("passing initial data to nodes from %s to %s ... "%(node_rank, node))
            self.communicator.send(dstruc.snpData_pickle, node, 0)
            self.communicator.send(dstruc.other_data_pickle, node, 0)
            sys.stderr.write(".\n")
        del dstruc
    elif node_rank in free_computing_node_set:
        # two messages arrive from rank 0: the SNP data first, then the other data
        data, source, tag = self.communicator.receiveString(0, 0)
        snpData = cPickle.loads(data)
        del data
        data, source, tag = self.communicator.receiveString(0, 0)
        other_data = cPickle.loads(data)
        del data
        self.phenotype_index_ls = other_data.phenotype_index_ls
    else:
        # output node
        data, source, tag = self.communicator.receiveString(0, 0)
        output_node_data_pickle = cPickle.loads(data)
        phenotype_label_ls = output_node_data_pickle.phenotype_label_ls
        self.phenotype_index_ls = output_node_data_pickle.phenotype_index_ls

    self.synchronize()

    # phase 2: every node runs the loop of its role
    if node_rank == 0:
        param_obj = PassingData(params_ls=params_ls, output_node_rank=output_node_rank,
                                report=self.report, counter=0)
        self.inputNode(param_obj, free_computing_nodes, param_generator=params_ls)
        #self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=1)
    elif node_rank in free_computing_node_set:
        computing_parameter_obj = PassingData(
            snpData=snpData, gene_id_ls=other_data.gene_id_ls,
            gene_id2snps_id_ls=other_data.gene_id2snps_id_ls, phenData=other_data.phenData,
            phenotype_index_ls=self.phenotype_index_ls, min_data_point=self.min_data_point,
            test_type=self.test_type)
        self.computing_node(computing_parameter_obj, self.computing_node_handler)
    else:
        self.general_output_node(self.output_dir, self.phenotype_index_ls,
                                 phenotype_label_ls, free_computing_nodes)
    self.synchronize()    #to avoid some node early exits
def run(self):
    """
    2008-11-08
        generate combinations of results_id, list_type_id and generate plots one after another
        save the plots into database if commit=1

        For every (results_id, list_type_id) pair:
            1. work out the TopSNPTestType ids matching the current settings;
               exit with status 3 if there is none
            2. if commit is on and a plot for (first type id, results_id, list_type_id)
               is already in the database, skip the pair
            3. fetch the data matrix; skip the pair if it is empty
            4. optionally write the matrix to self.output_fname
            5. draw the curve via self.plotCurve()
            6. if commit is on and png data came back, store the plot in the database
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
                                   password=self.db_passwd, hostname=self.hostname,
                                   database=self.dbname, schema=self.schema)
    db.setup()

    def plotAlreadyInDB(type_id, results_id, list_type_id, title):
        """Report and return True if a plot for this combination is already stored."""
        rows = Stock_250kDB.CandidateVsNonRatioPlot.query.filter_by(type_id=type_id).\
            filter_by(results_id=results_id).filter_by(list_type_id=list_type_id)
        no_of_rows = rows.count()
        if no_of_rows > 0:
            row = rows.first()
            sys.stderr.write('%s already in db (%s of them) with first id=%s.\n'%(title, no_of_rows, row.id))
            return True
        return False

    param_obj = PassingData(call_method_id=self.call_method_id,
                            analysis_method_id=getattr(self, 'analysis_method_id', None),
                            analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),
                            phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),
                            list_type_id_ls=self.list_type_id_ls,
                            results_type=self.results_type)
    params_ls = MpiGeneListRankTest.generate_params(param_obj)

    for results_id, list_type_id in params_ls:
        rm = Stock_250kDB.ResultsMethod.get(results_id)
        list_type = Stock_250kDB.GeneListType.get(list_type_id)
        title = 'result(%s) of %s on %s with %s(%s) list'%\
            (results_id, rm.analysis_method.short_name, rm.phenotype_method.short_name,
             list_type.short_name, list_type.id)

        TopSNPTestType_id_ls = self.getTopSNPTestType_id_ls(
            self.get_closest, self.min_MAF, self.allow_two_sample_overlapping,
            self.results_type, self.test_type_id, self.null_distribution_type_id)
        # This guard has to precede every TopSNPTestType_id_ls[0] below; otherwise an
        # empty list raises IndexError before the message is ever printed.
        if not TopSNPTestType_id_ls:
            sys.stderr.write("No TopSNPTestType matches the input requirements. Exit.\n")
            sys.exit(3)
        first_type_id = TopSNPTestType_id_ls[0]

        # cheap check before any heavy query: nothing to do if the plot is stored already
        if self.commit and plotAlreadyInDB(first_type_id, results_id, list_type_id, title):
            continue

        TopSNPTestType_id_ls_str = [str(type_id) for type_id in TopSNPTestType_id_ls]
        from_where_clause = "from %s t, %s y where t.type_id=y.id and t.results_id=%s and t.list_type_id=%s and y.id in (%s)"%\
            (Stock_250kDB.CandidateGeneTopSNPTestRM.table.name,
             Stock_250kDB.CandidateGeneTopSNPTestRMType.table.name,
             results_id, list_type_id, ','.join(TopSNPTestType_id_ls_str))
        no_of_top_snps_info = self.get_no_of_top_snps_info(db, from_where_clause)
        min_distance_info = self.get_min_distance_info(db, from_where_clause)
        rdata = self.get_data_matrix(db, no_of_top_snps_info, min_distance_info, from_where_clause,
                                     need_other_values=True,
                                     null_distribution_type_id=self.null_distribution_type_id)
        if SNPData.isDataMatrixEmpty(rdata.data_matrix):
            sys.stderr.write("Nothing fetched from database.\n")
            #sys.exit(3)
            continue

        if self.output_fname:
            header = ['no_of_top_snps', ''] + min_distance_info.label_ls
            strain_acc_list = no_of_top_snps_info.label_ls
            category_list = no_of_top_snps_info.label_ls
            write_data_matrix(rdata.data_matrix, self.output_fname, header,
                              strain_acc_list, category_list)

        # Disabled matrix-image output, kept for reference:
        #if self.fig_fname:
        #    font = get_font(self.font_path, font_size=self.font_size)    #2008-08-01
        #    value2color_func = lambda x: Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)
        #    im_legend = drawContinousLegend(rdata.min_value, rdata.max_value, self.no_of_ticks, value2color_func, font)
        #    #im.save('%s_legend.png'%self.fig_fname_prefix)
        #    im = drawMatrix(rdata.data_matrix, value2color_func, no_of_top_snps_info.label_ls,\
        #        min_distance_info.label_ls, with_grid=1, font=font)
        #    im = combineTwoImages(im, im_legend, font=font)
        #    im.save(self.fig_fname)

        if self.commit:
            # no output file name is handed to plotCurve() when saving into the db
            output_fname_prefix = None
        else:
            # '/' in the title would otherwise be taken as a path separator
            title_cp = title.replace('/', '_')
            output_fname_prefix = '%s_%s_type_%s.png'%(os.path.splitext(self.fig_fname)[0],
                                                       title_cp, first_type_id)

        if rm.analysis_method_id == 1 or rm.analysis_method_id == 7:
            preset_xlim = [0, 8]
        else:
            preset_xlim = None

        return_data = self.plotCurve(rdata, no_of_top_snps_info, min_distance_info,
                                     output_fname_prefix, title=title, commit=self.commit,
                                     preset_xlim=preset_xlim)

        if self.commit and return_data.png_data:
            # check again right before inserting
            if plotAlreadyInDB(first_type_id, results_id, list_type_id, title):
                continue
            plot = Stock_250kDB.CandidateVsNonRatioPlot(type_id=first_type_id,
                                                        results_id=results_id,
                                                        list_type_id=list_type_id)
            plot.png_thumbnail = return_data.png_thumbnail.getvalue()
            plot.png_data = return_data.png_data.getvalue()
            plot.svg_data = return_data.svg_data.getvalue()
            db.session.save(plot)
            db.session.flush()
def run(self):
    """
    Entry point of the MPI program.

    Rank 0 is the input node, the last rank is the output node and every rank
    in between is a computing node.
        1. rank 0 reads self.input_fname, pickles the SNP data, sends it to every
           computing node and generates the parameter list
        2. all nodes synchronize
        3. rank 0 hands out parameters, computing nodes run
           self.computing_node_handler, the output node writes tab-delimited
           results to self.output_fname (if that attribute is set and non-empty)
        4. all nodes synchronize again, to avoid some node exiting early
    """
    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    free_computing_nodes = range(1, self.communicator.size - 1)    #exclude the 1st and last node
    free_computing_node_set = Set(free_computing_nodes)
    output_node_rank = self.communicator.size - 1

    # Disabled alternative in which every non-output node read the input itself:
    #if node_rank!=output_node_rank:
    #    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    #    snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
    #        data_matrix=data_matrix)    #category_list is not used to facilitate row-id matching

    if node_rank == 0:
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
        #category_list is not used to facilitate row-id matching
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                          data_matrix=data_matrix)
        snpData_pickle = cPickle.dumps(snpData, -1)
        for node in free_computing_nodes:    #send it to the computing_node
            sys.stderr.write("passing initial data to nodes from %s to %s ... " % (node_rank, node))
            self.communicator.send(snpData_pickle, node, 0)
            sys.stderr.write(".\n")
        del snpData_pickle
        params_ls = self.generate_params(len(snpData.col_id_ls), self.block_size)
        del snpData
    elif node_rank in free_computing_node_set:
        data, source, tag = self.communicator.receiveString(0, 0)
        snpData = cPickle.loads(data)
        del data
    # the output node receives nothing in this phase

    self.synchronize()

    if node_rank == 0:
        param_obj = PassingData(params_ls=params_ls, output_node_rank=output_node_rank,
                                report=self.report, counter=0)
        self.inputNode(param_obj, free_computing_nodes, param_generator=params_ls)
        #self.input_node(param_obj, free_computing_nodes, input_handler=self.input_handler, message_size=1)
        #self.input_node(param_obj, free_computing_nodes, self.message_size)
    elif node_rank in free_computing_node_set:
        computing_parameter_obj = PassingData(snpData=snpData,
                                              min_LD_to_output=self.min_LD_to_output,
                                              min_MAF=self.min_MAF,
                                              discard_perc=self.discard_perc)
        self.computing_node(computing_parameter_obj, self.computing_node_handler)
    else:
        output_fname = getattr(self, 'output_fname', None)
        outf = None
        writer = None
        if output_fname:
            # keep the file object so that it can be closed explicitly below
            outf = open(output_fname, 'w')
            writer = csv.writer(outf, delimiter='\t')
            #header_row = ['snp1_id', 'snp2_id', 'r2', 'D', "D'", "no_of_pairs"]
            #writer.writerow(header_row)
        try:
            param_obj = PassingData(writer=writer, is_header_written=False)
            self.output_node(free_computing_nodes, param_obj, self.output_node_handler)
        finally:
            # param_obj still references the writer, so dropping the local name alone
            # would neither flush nor close the output file
            if outf is not None:
                outf.close()
    self.synchronize()    #to avoid some node early exits
def run(self):
    """
    Compare the calls stored in the stock database against an external data
    file and, depending on the settings, output and/or submit the result.

        1. load the comparison matrix (snpData2) from self.cmp_data_filename,
           which is first resolved through self.findOutCmpDataFilename()
        2. load the call matrix (snpData1) from the database
        3. run_type 1: row-wise comparison via TwoSNPData.cmp_row_wise();
           run_type 2: nothing is computed (an empty result is used);
           any other run_type: error message and exit with status 5
        4. write the row-wise result to self.output_fname and/or submit it to the
           database (only for run_type 1 and a non-empty result)
        5. commit the session if self.commit is set, roll it back otherwise
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # fail before any database work or file reading if the run type can't be handled
    if self.run_type not in (1, 2):
        sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
        sys.exit(5)

    db = StockDB.StockDB(drivername=self.drivername, username=self.db_user,
                         password=self.db_passwd, hostname=self.hostname,
                         database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    self.cmp_data_filename = self.findOutCmpDataFilename(
        self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
    header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
    #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
    strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
    #category_list is not used.
    snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list,
                       data_matrix=data_matrix)

    readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
    session.save(readme)

    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                           user=self.db_user, passwd=self.db_passwd)
    try:
        curs = conn.cursor()
        from dbSNP2data import dbSNP2data
        snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
            curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
        strain_info_data = self.get_strain_id_info(self.QC_method_id)
        data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index,
                                           snp_id2index, StockDB.Calls.table.name)
        strain_acc_list = [strain_info_data.strain_id2acc[strain_id]
                           for strain_id in strain_info_data.strain_id_list]
        category_list = [strain_info_data.strain_id2category[strain_id]
                         for strain_id in strain_info_data.strain_id_list]

        # two leading label columns, then one column per SNP in snp_id_list order
        header = ['ecotypeid', 'strainid']
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            header.append(snp_name)

        #snps_table is set to the stock_250k snps_table
        snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list,
                           category_list=category_list, data_matrix=data_matrix,
                           snps_table='stock.snps')

        twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs,
                                QC_method_id=self.QC_method_id, user=self.db_user,
                                row_matching_by_which_value=0, debug=self.debug)

        if self.run_type == 1:
            row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        else:
            # run_type 2
            #twoSNPData.save_col_wise(session, readme)    #2008-08-18 need to implement a new one for 149SNP
            row_id2NA_mismatch_rate = {}

        if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)

        if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #row_id2NA_mismatch_rate might be None if it's method 0.
            self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id,
                                   self.db_user, twoSNPData.row_id12row_id2, readme)

        if self.commit:
            session.commit()
        else:
            session.rollback()
    finally:
        # release the raw MySQLdb connection, which is separate from the db session
        conn.close()