def run(self):
    """
    2008-06-02
        Load the two SNP matrices, let
        TwoSNPData.order2ndSNPDataRowsSameAs1stSNPData() re-order the 2nd one
        and write the result to self.output_fname.

        self.row_matching_by_which_value controls the details:
            0: the 1st input is loaded with ignore_2nd_column=1.
            1 or 2: (value - 1) is handed to TwoSNPData as
                row_matching_by_which_value.
            anything else (0 included): TwoSNPData receives None.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    matching_option = self.row_matching_by_which_value

    first_input_kwargs = {'input_fname': self.input_fname1, 'turn_into_array': 1}
    if matching_option == 0:
        first_input_kwargs['ignore_2nd_column'] = 1
    reference_data = SNPData(**first_input_kwargs)
    data_to_reorder = SNPData(input_fname=self.input_fname2, turn_into_array=1)

    # options 1 and 2 are shifted down by one before they reach TwoSNPData
    row_matching_arg = None
    if matching_option in (1, 2):
        row_matching_arg = matching_option - 1

    data_pair = TwoSNPData(SNPData1=reference_data,
                           SNPData2=data_to_reorder,
                           debug=self.debug,
                           row_matching_by_which_value=row_matching_arg)
    reordered_data = data_pair.order2ndSNPDataRowsSameAs1stSNPData()
    reordered_data.tofile(self.output_fname)
def inputNodePrepare(self, snp_info=None):
    """
    2009-2-16
        get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
    2009-2-11
        refactored out of run()

        Load the SNP matrix (self.input_fname), the SNP-context pickle
        (self.snps_context_fname) and the phenotype matrix
        (self.phenotype_fname), then pickle the pieces that are shipped to the
        other nodes.

        Argument:
            snp_info: stored unchanged in the PassingData objects given to
                generate_params() and packed into other_data_pickle.

        Side effect:
            self.phenotype_index_ls is set. When
            PlotGroupOfSNPs.findOutWhichPhenotypeColumn() returns nothing it
            falls back to the index of every phenotype column.

        Return:
            PassingData with snpData_pickle, other_data_pickle,
            output_node_data_pickle (cPickle strings, protocol -1) and
            params_ls.
    """
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname)
    #category_list is not used to facilitate row-id matching
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      data_matrix=data_matrix, turn_into_array=1)

    # Close the pickle file explicitly rather than relying on "del" and the
    # garbage collector. The open mode is left untouched because the code that
    # wrote this file is not visible from here.
    picklef = open(self.snps_context_fname)
    try:
        snps_context_wrapper = cPickle.load(picklef)
    finally:
        picklef.close()
    gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
    del snps_context_wrapper
    gene_id_ls = sorted(gene_id2snps_id_ls.keys())

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen,
                       strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    # put the phenotype rows into the row order of the SNP matrix
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
    #2009-2-16
    phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(
        phenData)

    self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
        phenData, Set(self.phenotype_method_id_ls))
    if not self.phenotype_index_ls:
        # nothing matched (or nothing requested): use all phenotype columns
        self.phenotype_index_ls = range(len(phenData.col_id_ls))

    pdata = PassingData(gene_id_ls=gene_id_ls,
                        gene_id2snps_id_ls=gene_id2snps_id_ls,
                        phenotype_index_ls=self.phenotype_index_ls,
                        snp_info=snp_info)
    params_ls = self.generate_params(self.gene_id_fname, pdata,
                                     self.block_size)

    other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls,
                             gene_id_ls=pdata.gene_id_ls,
                             phenData=phenData,
                             phenotype_index_ls=self.phenotype_index_ls,
                             snp_info=snp_info)
    other_data_pickle = cPickle.dumps(other_data, -1)
    del other_data

    output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
                                   phenotype_index_ls=self.phenotype_index_ls)
    output_node_data_pickle = cPickle.dumps(output_node_data, -1)

    snpData_pickle = cPickle.dumps(snpData, -1)
    # the unpickled copies are no longer needed; drop them to save memory
    del snpData, data_matrix

    return_data = PassingData(snpData_pickle=snpData_pickle,
                              other_data_pickle=other_data_pickle,
                              output_node_data_pickle=output_node_data_pickle,
                              params_ls=params_ls)
    return return_data
def run(self):
    """
    2008-08-11
        the database interface changed in variation.src.dbsnp
    2008-05-06

        Load the SNP matrix in self.input_fname1 (snps_table 'dbsnp.snps') and
        the one in self.input_fname2 (snps_table 'stock_250k.snps'), save a
        README record and hand everything to
        TwoSNPData384.figureOutABMapping().

        The work is committed (raw cursor and session) only when self.commit
        is set; otherwise the session is rolled back.
    """
    import MySQLdb
    mysql_conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                                 user=self.user, passwd=self.passwd)
    curs = mysql_conn.cursor()
    if self.debug:
        import pdb
        pdb.set_trace()

    db = DBSNP(username=self.user, password=self.passwd,
               hostname=self.hostname, database=self.dbname)
    session = db.session
    session.begin()
    #transaction = session.create_transaction()

    snps_name2possible_mappings, snps_name2snps_id = self.get_snps_name2possible_mappings(db)

    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix

    def load_snp_data(input_fname, **extra_kwargs):
        # read one matrix file and wrap it, category_list included
        header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix.read_data(
            input_fname)
        return SNPData(header=header, strain_acc_list=strain_acc_list,
                       category_list=category_list, data_matrix=data_matrix,
                       **extra_kwargs)

    dbsnp_data = load_snp_data(self.input_fname1,
                               col_id2id=snps_name2snps_id,
                               snps_table='dbsnp.snps')
    stock_250k_data = load_snp_data(self.input_fname2,
                                    snps_table='stock_250k.snps')

    data_pair = TwoSNPData384(SNPData1=dbsnp_data, SNPData2=stock_250k_data,
                              curs=curs, user=self.user)

    readme = formReadmeObj(sys.argv, self.ad, README)
    session.save(readme)
    session.flush()
    data_pair.figureOutABMapping(session, readme, snps_name2possible_mappings)

    if not self.commit:
        session.rollback()
        return
    curs.execute("commit")
    session.commit()
def run(self):
    """
    2008-5-12

        Read the matrix in self.inputFname, convert it with
        SNPData2RawSnpsData_ls() and write it to self.outputFname through
        snpsdata.writeRawSnpsDatasToFile().

        When self.snp_id_type is 1, every column id (header[2:]) is looked up
        via db.get_chr_pos_given_db_id2chr_pos(); columns for which the lookup
        returns None are removed from both the header and the matrix.
        category_list is passed on only when self.array_id_2nd_column is set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    #database connection and etc
    db = self.db_250k
    session = db.session
    session.begin()

    delimiter = figureOutDelimiter(self.inputFname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.inputFname, delimiter=delimiter)

    if self.snp_id_type == 1:
        #2011-2-27 translate the db_id into chr_pos because the new StrainXSNP dataset uses db_id to identify SNPs.
        # but if col-id is already chr_pos, it's fine.
        translated_header = header[:2]
        kept_col_index_ls = []
        for matrix_col_index, snp_id in enumerate(header[2:]):
            chr_pos = db.get_chr_pos_given_db_id2chr_pos(snp_id, )
            if chr_pos is None:
                continue
            kept_col_index_ls.append(matrix_col_index)
            translated_header.append(chr_pos)
        # to remove no-db_id columns from data matrix
        data_matrix = numpy.array(data_matrix)[:, kept_col_index_ls]
        header = translated_header

    snp_data_kwargs = {'header': header,
                       'strain_acc_list': strain_acc_list,
                       'data_matrix': data_matrix}
    if self.array_id_2nd_column:
        snp_data_kwargs['category_list'] = category_list
    #otherwise ignore category_list
    snpData = SNPData(**snp_data_kwargs)

    rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1,
                                            report=self.report)
    chromosome_ls = []
    for rawSnpsData in rawSnpsData_ls:
        chromosome_ls.append(rawSnpsData.chromosome)
    snpsdata.writeRawSnpsDatasToFile(self.outputFname, rawSnpsData_ls,
                                     chromosomes=chromosome_ls,
                                     deliminator=',',
                                     withArrayIds=self.array_id_2nd_column)
def run(self):
    """
    2009-2-12

        Build the CNV QC matrices from a SNP matrix (self.input_fname) and a
        CNV intensity matrix (self.cnv_input_fname) via self.getCNVQCMatrix().

        chr2CNV_probe_ls is cached in /tmp/chr2CNV_probe_ls.pickle: it is
        fetched from the database (self.probes_table) only when that file does
        not exist yet. The plot data is pickled to /tmp/CNV_plot_data.pickle.

        Output files, all prefixed by self.output_fname_prefix:
            _mismatch.tsv, _insertion.tsv, _deletion.tsv, _qc.tsv

        All pickles use protocol -1 (a binary protocol), so the files are
        opened in binary mode and closed explicitly.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                           user=self.db_user, passwd=self.db_passwd)
    curs = conn.cursor()

    chr2CNV_probe_ls_pickle_fname = '/tmp/chr2CNV_probe_ls.pickle'
    if not os.path.isfile(chr2CNV_probe_ls_pickle_fname):
        chr2CNV_probe_ls = self.get_chr2CNV_probe_ls(curs, self.probes_table)
        picklef = open(chr2CNV_probe_ls_pickle_fname, 'wb')
        try:
            cPickle.dump(chr2CNV_probe_ls, picklef, -1)
        finally:
            picklef.close()
    else:
        picklef = open(chr2CNV_probe_ls_pickle_fname, 'rb')
        try:
            chr2CNV_probe_ls = cPickle.load(picklef)
        finally:
            picklef.close()

    snpData = SNPData(input_fname=self.input_fname, turn_into_array=1,
                      ignore_2nd_column=1)
    probeData = self.get_probe_id2snp_id_ls(chr2CNV_probe_ls,
                                            snpData.col_id_ls)
    SNP2Col_allele = self.get_SNP2Col_allele(snpData)
    cnvIntensityData = SNPData(input_fname=self.cnv_input_fname,
                               turn_into_array=1, ignore_2nd_column=1,
                               matrix_data_type=float)
    cnvQCData = self.getCNVQCMatrix(probeData.probe_id2snp_id_ls,
                                    probeData.snp_id2tup, snpData,
                                    SNP2Col_allele, cnvIntensityData)

    plotdata_pickle_fname = '/tmp/CNV_plot_data.pickle'
    picklef = open(plotdata_pickle_fname, 'wb')
    try:
        cPickle.dump(cnvQCData.plotData, picklef, -1)
    finally:
        picklef.close()

    cnvQCData.mismatchData.tofile('%s_mismatch.tsv' % self.output_fname_prefix)
    cnvQCData.insertionData.tofile('%s_insertion.tsv' % self.output_fname_prefix)
    cnvQCData.deletionData.tofile('%s_deletion.tsv' % self.output_fname_prefix)
    cnvQCData.qcData.tofile('%s_qc.tsv' % self.output_fname_prefix)
def run(self):
    """
    Process the CNV probe intensity matrix in self.input_fname.

        run_type 1:
            write to self.output_fname a matrix of the same shape that holds
            -1 wherever the intensity is <= self.max_del_intensity and 1
            everywhere else.
        run_type 2:
            plot the per-probe intensity sum against the probe position, one
            png per block of 1000 consecutive probes. A trailing block with
            fewer than 1000 probes is plotted as well. Files are named
            '<output_fname>_<start position>_<end position>.png' where the
            end position is that of the probe following the block, or of the
            last probe when the block reaches the end of the data.

        Column ids are split on '_' and every field is converted to int; the
        2nd field is used as the probe position.
    """
    cnvIntensityData = SNPData(input_fname=self.input_fname,
                               turn_into_array=1, ignore_2nd_column=1,
                               matrix_data_type=float)
    intensity_matrix = cnvIntensityData.data_matrix

    probe_pos_ls = []
    avg_intensity_ls = []  # holds the column sums (historic name kept)
    for j in range(intensity_matrix.shape[1]):
        probe_id = [int(field) for field in cnvIntensityData.col_id_ls[j].split('_')]
        probe_pos_ls.append(probe_id[1])
        avg_intensity_ls.append(numpy.sum(intensity_matrix[:, j]))

    if self.run_type == 1:
        # the builtin int replaces the numpy.int alias, which newer NumPy no
        # longer provides
        newDataMatrix = numpy.ones(intensity_matrix.shape, int)
        newDataMatrix[intensity_matrix <= self.max_del_intensity] = -1
        newData = SNPData(row_id_ls=cnvIntensityData.row_id_ls,
                          col_id_ls=cnvIntensityData.col_id_ls,
                          data_matrix=newDataMatrix)
        newData.tofile(self.output_fname)
    elif self.run_type == 2:
        block_size = 1000
        no_of_probes = len(probe_pos_ls)
        # ceiling division, so the last (partial) block is not dropped
        no_of_blocks = (no_of_probes + block_size - 1) // block_size
        for i in range(no_of_blocks):
            start_index = i * block_size
            end_index = min(start_index + block_size, no_of_probes)
            # end_index is exclusive; clamp it so that the file name of the
            # final block does not index past the last probe
            label_index = min(end_index, no_of_probes - 1)
            fname = '%s_%s_%s.png' % (self.output_fname,
                                      probe_pos_ls[start_index],
                                      probe_pos_ls[label_index])
            pylab.clf()
            pylab.plot(probe_pos_ls[start_index:end_index],
                       avg_intensity_ls[start_index:end_index],
                       '.', markersize=4, alpha=0.4)
            pylab.xlabel('chromosome position')
            pylab.ylabel('sum intensity')
            pylab.savefig(fname, dpi=300)
def loadDataStructure(self, gene_annotation_picklef, min_MAF=0, min_distance=20000,
                      list_type_id=None, snp_matrix_fname=None, snp_matrix_data_type=1):
    """
    2009-5-30
        add argument snp_matrix_fname
    2008-11-25
    2008-10-01
        wrap a few functions up, convenient for both run() and drawSNPRegion()

        Connect to the database (the connection is also kept as self.db) and
        gather snp_info, gene_annotation, the candidate gene set and,
        optionally, a SNP matrix.

        Argument:
            gene_annotation_picklef: handed to self.dealWithGeneAnnotation().
            min_MAF, min_distance: not used inside this function.
            list_type_id: when true, self.getGeneList(list_type_id) fills the
                candidate gene set; otherwise the set is empty.
            snp_matrix_fname: when given, the matrix is loaded and
                self.construct_chr_pos2index_forSNPData() is run on it.
            snp_matrix_data_type: 3 loads the matrix as float, anything else
                as int.

        Return:
            PassingData with gene_annotation, snp_info, candidate_gene_set and
            snpData (None when snp_matrix_fname is not given).
    """
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                   username=self.db_user,
                                   password=self.db_passwd,
                                   hostname=self.hostname,
                                   database=self.dbname,
                                   schema=self.schema)
    db.setup(create_tables=False)
    self.db = db

    snp_info = self.getSNPInfo(db)
    gene_annotation = self.dealWithGeneAnnotation(gene_annotation_picklef)

    candidate_gene_set = Set()
    if list_type_id:
        candidate_gene_set = Set(self.getGeneList(list_type_id))

    snpData = None
    if snp_matrix_fname:
        matrix_data_type = int
        if snp_matrix_data_type == 3:
            #2009-3-23 for CNV amplitude file
            matrix_data_type = float
        snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1,
                          turn_into_array=1, ignore_2nd_column=1,
                          matrix_data_type=matrix_data_type)
        #2008-12-05 fake a snp_info for findSNPsInRegion
        self.construct_chr_pos2index_forSNPData(snpData)

    return PassingData(gene_annotation=gene_annotation, snp_info=snp_info,
                       candidate_gene_set=candidate_gene_set, snpData=snpData)
def qcDataMatrixVSsnpData(self, pdata, snps_name2snps_id, snpData2, curs, session, readme):
    """
    2008-08-16
        split from run() to enable one_by_one option

        Wrap the matrix carried by pdata into a SNPData and compare it against
        snpData2 through TwoSNPData.

        self.run_type:
            1: row-wise comparison (cmp_row_wise()).
            2: save_col_wise(session, readme); the returned
                row_id2NA_mismatch_rate is an empty dictionary.
            anything else: an error is written to stderr and the program
                exits with status 5.

        Return:
            PassingData with row_id2NA_mismatch_rate and row_id12row_id2.
    """
    #swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData. now strain_acc_list=ecotype_id_ls
    #snps_table is set to the stock_250k snps_table
    query_data = SNPData(header=pdata.header,
                         strain_acc_list=pdata.ecotype_id_ls,
                         category_list=pdata.call_info_id_ls,
                         data_matrix=pdata.data_matrix,
                         min_probability=self.min_probability,
                         call_method_id=self.call_method_id,
                         col_id2id=snps_name2snps_id,
                         max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
                         snps_table='stock_250k.snps')
    data_pair = TwoSNPData(SNPData1=query_data, SNPData2=snpData2, curs=curs,
                           QC_method_id=self.QC_method_id, user=self.user,
                           row_matching_by_which_value=0, debug=self.debug)

    run_type = self.run_type
    if run_type not in (1, 2):
        sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
        sys.exit(5)

    if run_type == 1:
        mismatch_rate_by_row_id = data_pair.cmp_row_wise()
    else:
        data_pair.save_col_wise(session, readme)
        mismatch_rate_by_row_id = {}

    qc_result = PassingData()
    qc_result.row_id2NA_mismatch_rate = mismatch_rate_by_row_id
    qc_result.row_id12row_id2 = data_pair.row_id12row_id2
    return qc_result
def run(self):
    """
    Convert the SNP matrix in self.inputFname into a VCF file
    (self.outputFname).

        Monomorphic columns are removed first; columns are additionally
        filtered by self.min_MAF when it is set and > 0.

        For every remaining column:
            - chromosome and position are the first two '_'-separated fields
              of the column id.
            - each number is translated through number2di_nt; 'NA' and
              anything that is not two characters long becomes "./.".
            - alleles are numbered in order of first appearance. Allele 0 is
              written as REF, all other alleles as a comma-separated ALT
              list, so that every allele number used in a genotype is
              declared. When no 2nd allele is seen, ALT repeats REF.
            - QUAL (999), FILTER ('PASS') and INFO ("DP=100") are fixed
              placeholder values.

        The writer is closed even when the conversion fails midway.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    snpData = SNPData(input_fname=self.inputFname, turn_into_array=1,
                      ignore_2nd_column=1)
    snpData = SNPData.removeMonomorphicCols(snpData, NA_set=set([]))
    if self.min_MAF and self.min_MAF > 0:
        snpData = SNPData.removeColsByMAF(snpData, min_MAF=self.min_MAF,
                                          NA_set=set([]))

    self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
    try:
        self.writer.makeupHeaderFromSampleIDList(
            sampleIDList=snpData.row_id_ls)
        self.writer.writeMetaAndHeader()
        counter = 0
        for j, snp_id in enumerate(snpData.col_id_ls):
            chromosome, start = snp_id.split('_')[:2]
            genotype_ls = utils.dict_map(number2di_nt,
                                         snpData.data_matrix[:, j])
            genotype_ls_vcf = []
            alleleNucleotide2Number = {}
            alleleNumber2Nucleotide = {}
            for genotype in genotype_ls:
                # 'NA' is two characters long as well, so test it first
                if genotype == 'NA' or len(genotype) != 2:
                    genotype_ls_vcf.append("./.")
                    continue
                for allele in genotype:
                    if allele not in alleleNucleotide2Number:
                        alleleNumber = len(alleleNucleotide2Number)
                        alleleNucleotide2Number[allele] = alleleNumber
                        alleleNumber2Nucleotide[alleleNumber] = allele
                genotype_ls_vcf.append(
                    "%s/%s" % (alleleNucleotide2Number[genotype[0]],
                               alleleNucleotide2Number[genotype[1]]))

            refAllele = alleleNumber2Nucleotide[0]
            no_of_alleles = len(alleleNumber2Nucleotide)
            if no_of_alleles > 1:
                # list every alternate allele; genotypes may refer to
                # allele numbers 2 and above
                altAllele = ','.join([alleleNumber2Nucleotide[alleleNumber]
                                      for alleleNumber in range(1, no_of_alleles)])
            else:
                altAllele = refAllele
            row = [chromosome, start, ".", refAllele, altAllele, 999, 'PASS',
                   "DP=100", "GT"] + genotype_ls_vcf
            self.writer.writerow(row)
            counter += 1
        sys.stderr.write(" %s records.\n" % (counter))
    finally:
        self.writer.close()
def create_init_data(self):
    """
    2009-6-5
        add argument ignore_het=1 to snpData_2010_149_384 & snpData_perlegen
    2008-05-12
        initial data loading on node 0

        Return a PassingData with
            snpData_250k: from self.input_fname
            snpData_2010_149_384: from self.fname_2010_149_384
            snpData_perlegen: from self.fname_perlegen
            param_d: self.generate_parameters(self.parameter_names)
    """
    # the two QC datasets are loaded with the same options
    qc_data_kwargs = {'turn_into_array': 1, 'ignore_2nd_column': 1, 'ignore_het': 1}

    init_data = PassingData()
    init_data.snpData_250k = SNPData(input_fname=self.input_fname,
                                     turn_into_array=1)
    init_data.snpData_2010_149_384 = SNPData(input_fname=self.fname_2010_149_384,
                                             **qc_data_kwargs)
    init_data.snpData_perlegen = SNPData(input_fname=self.fname_perlegen,
                                         **qc_data_kwargs)
    init_data.param_d = self.generate_parameters(self.parameter_names)
    return init_data
def run(self):
    """
    2009-5-28

        Read the phenotype matrix in self.input_fname (as float), map its row
        ids to ecotype ids through self.straightenEcotypeID() and store it.

        self.run_type:
            1: self.putPhenotypeIntoDB()
            2: self.putReplicatePhenotypeIntoDB()
            anything else: only a message on stderr.
        The session is committed when self.commit is set, whatever the
        run_type.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB(drivername=self.drivername, username=self.db_user,
                      password=self.db_passwd, hostname=self.hostname,
                      database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)

    db_bind = db.metadata.bind
    nativename2tg_ecotypeid_set = getNativename2TgEcotypeIDSet(
        db_bind, turnUpperCase=True)
    ecotype_id_set_250k_in_pipeline = get_ecotype_id_set_250k_in_pipeline(
        ArrayInfo)
    ecotypeid2tg_ecotypeid = get_ecotypeid2tg_ecotypeid(db.metadata.bind)

    #turn_into_integer=2 because it's not nucleotides
    header_phen, strain_acc_list_phen, category_list_phen, raw_matrix_phen = read_data(
        self.input_fname, turn_into_integer=2, matrix_data_type=float)
    #2009-8-19 bug here. strain_acc_list_phen is not unique for each row. causing replicates to have the same value
    #from Association import Association
    #data_matrix_phen = Association.get_phenotype_matrix_in_data_matrix_order(strain_acc_list_phen, strain_acc_list_phen, data_matrix_phen)
    phenData = SNPData(header=header_phen,
                       strain_acc_list=strain_acc_list_phen,
                       data_matrix=numpy.array(raw_matrix_phen))

    ecotype_id_ls = self.straightenEcotypeID(phenData.row_id_ls,
                                             nativename2tg_ecotypeid_set,
                                             ecotypeid2tg_ecotypeid,
                                             ecotype_id_set_250k_in_pipeline)

    session = db.session
    session.begin()
    run_type = self.run_type
    if run_type == 1:
        self.putPhenotypeIntoDB(db, phenData, ecotype_id_ls)
    elif run_type == 2:
        self.putReplicatePhenotypeIntoDB(db, phenData, ecotype_id_ls)
    else:
        sys.stderr.write("Unsupported run type: %s.\n" % (self.run_type))
    if self.commit:
        session.commit()
def run(self):
    """
    Draw, for every probe present in both the QC matrix (self.qc_fname) and
    the intensity data (self.input_fname), a scatter plot of QC count vs CNV
    probe intensity and save it as '<probe_id>.png' in self.output_dir (the
    directory is created when missing).

        Only rows with a QC value >= 0 that also exist in the intensity data
        contribute. A probe is skipped when nothing is left or when all of
        its counts are 0.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    cnvIntensityData = self.getBeforeGADAIntensityData(self.input_fname)
    #cnvIntensityData = SNPData(input_fname=self.input_fname, turn_into_array=1, ignore_2nd_column=1, matrix_data_type=float)
    qcData = SNPData(input_fname=self.qc_fname, turn_into_array=1,
                     ignore_2nd_column=1)
    if not os.path.isdir(self.output_dir):
        os.makedirs(self.output_dir)

    for probe_id in qcData.col_id_ls:
        if probe_id not in cnvIntensityData.col_id2col_index:
            continue
        cnv_col_index = cnvIntensityData.col_id2col_index[probe_id]
        qc_col_index = qcData.col_id2col_index[probe_id]

        count_ls = []
        intensity_ls = []
        for qc_row_index, row_id in enumerate(qcData.row_id_ls):
            if qcData.data_matrix[qc_row_index][qc_col_index] >= 0 \
                    and row_id in cnvIntensityData.row_id2row_index:
                cnv_row_index = cnvIntensityData.row_id2row_index[row_id]
                count_ls.append(qcData.data_matrix[qc_row_index][qc_col_index])
                intensity_ls.append(
                    cnvIntensityData.data_matrix[cnv_row_index][cnv_col_index])

        distinct_count_set = set(count_ls)
        if not distinct_count_set or distinct_count_set == set([0]):
            # nothing to show, or every count is 0
            continue

        pylab.clf()
        ax = pylab.axes([0.1, 0.1, 0.8, 0.8], frameon=False)
        ax.grid(True, alpha=0.3)
        pylab.plot(count_ls, intensity_ls, '.', markersize=5, alpha=0.4)
        pylab.xlabel('count')
        pylab.ylabel('CNV probe intensity')
        pylab.ylim([-1, 1])
        # widen the x-axis by one unit on both sides
        x_min, x_max = list(ax.get_xlim())
        ax.set_xlim([x_min - 1, x_max + 1])
        pylab.title(probe_id)
        pylab.savefig(os.path.join(self.output_dir, '%s.png' % probe_id),
                      dpi=300)
def run(self):
    """
    Convert the SNP matrix in self.inputFname into an HDF5 file
    (self.outputFname) with three resizable datasets:
        data_matrix, col_id_ls, row_id_ls

        Monomorphic columns are removed first; columns are additionally
        filtered by self.min_MAF when it is > 0. Row and column ids are
        converted to int before they are stored.

        The HDF5 file is closed even when creating a dataset fails.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    snpData = SNPData(input_fname=self.inputFname, turn_into_array=1,
                      ignore_2nd_column=1)
    snpData = SNPData.removeMonomorphicCols(snpData, NA_set=set([]))
    if self.min_MAF > 0:
        snpData = SNPData.removeColsByMAF(snpData, min_MAF=self.min_MAF,
                                          NA_set=set([]))

    # real lists (not lazy map objects) so that they can be used as
    # dataset content
    snpData.col_id_ls = [int(col_id) for col_id in snpData.col_id_ls]
    snpData.row_id_ls = [int(row_id) for row_id in snpData.row_id_ls]

    f = h5py.File(self.outputFname, 'w')
    try:
        #snpData.data_matrix.dtype = numpy.int16
        f.create_dataset("data_matrix", data=snpData.data_matrix,
                         maxshape=(None, None))
        #numpy.array(snpData.data_matrix, dtype=numpy.int64)
        f.create_dataset('col_id_ls', data=snpData.col_id_ls,
                         maxshape=(None,))
        f.create_dataset('row_id_ls', data=snpData.row_id_ls,
                         maxshape=(None,))
    finally:
        f.close()
def getPhenotypeData(cls, curs, phenotype_avg_table=None, phenotype_method_table=None,
                     ecotype_table='stock.ecotype', get_raw_data=1,
                     getPublicPhenotype=False):
    """
    2012.9.28
        add argument getPublicPhenotype
    2009-2-2
        wrap up all other 3 methods

        Chain get_phenotype_method_id_info(), get_ecotype_id2info() and
        get_matrix() and pack the outcome into a SNPData:
            rows: ecotype ids (row_label_ls holds the ecotype names)
            columns: phenotype ids (col_label_ls holds
                phenotype_info.method_id_name_ls)
    """
    phenotype_info = cls.get_phenotype_method_id_info(
        curs,
        phenotype_avg_table=phenotype_avg_table,
        phenotype_method_table=phenotype_method_table,
        getPublicPhenotype=getPublicPhenotype)

    ecotype_info = cls.get_ecotype_id2info(
        curs,
        phenotype_avg_table=phenotype_avg_table,
        ecotype_table=ecotype_table,
        getPublicPhenotype=getPublicPhenotype)
    ecotype_id2index, ecotype_id_ls, ecotype_name_ls = ecotype_info

    phenotype_matrix = cls.get_matrix(
        curs, phenotype_avg_table,
        ecotype_id2index=ecotype_id2index,
        phenotype_info=phenotype_info,
        get_raw_data=get_raw_data,
        phenotype_method_table=phenotype_method_table,
        getPublicPhenotype=getPublicPhenotype)

    pheno_data = SNPData(col_id_ls=phenotype_info.phenotype_id_ls,
                         row_id_ls=ecotype_id_ls,
                         data_matrix=phenotype_matrix)
    pheno_data.row_label_ls = ecotype_name_ls
    pheno_data.col_label_ls = phenotype_info.method_id_name_ls
    return pheno_data
def getHaploGroupSNPMatrix(self):
    """
    2009-4-18

        Build a HaploGroup X SNP matrix (numpy.int8, initialised to 0):
            rows: one per StockDB.HaploGroup entry, row id = HaploGroup id
            columns: StockDB.SNPs ids ordered by chromosome, then position
            cell: nt2number[allele] of every StockDB.FilteredCalls entry whose
                ecotypeid equals the haplogroup's ref_ecotypeid

        In debug mode only the first 10 haplogroups are used. The number of
        matrix rows always equals the number of row ids, also when fewer than
        10 haplogroups exist.
    """
    sys.stderr.write("Getting HaploGroup SNP matrix ...")

    haplo_group_ls = StockDB.HaploGroup.query.all()
    if self.debug:
        haplo_group_ls = haplo_group_ls[:10]
    no_of_rows = len(haplo_group_ls)

    col_id_ls = []
    col_id2col_index = {}
    for row in StockDB.SNPs.query.order_by(
            StockDB.SNPs.chromosome).order_by(StockDB.SNPs.position):
        col_id_ls.append(row.id)
        col_id2col_index[row.id] = len(col_id2col_index)
    no_of_cols = len(col_id2col_index)

    data_matrix = numpy.zeros([no_of_rows, no_of_cols], numpy.int8)
    row_id_ls = []
    for row_index, row in enumerate(haplo_group_ls):
        data_rows = StockDB.FilteredCalls.query.filter_by(
            ecotypeid=row.ref_ecotypeid)
        for one_call in data_rows:
            nt_number = nt2number[one_call.allele]
            col_index = col_id2col_index[one_call.snpid]
            data_matrix[row_index][col_index] = nt_number
        row_id_ls.append(row.id)

    snpData = SNPData(col_id_ls=col_id_ls, row_id_ls=row_id_ls,
                      data_matrix=data_matrix)
    sys.stderr.write("Done.\n")
    return snpData
def run(self):
    """
    2008-9-7

        Read the SNP matrix in self.input_fname, convert it with
        SNPData.convert2Binary() and write the converted matrix to
        self.output_fname. When self.mapping_fname is given,
        allele_index2allele_ls is written there through
        self.output_allele2index_ls().
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    matrix_parts = read_data(self.input_fname, delimiter=delimiter)
    header, strain_acc_list, category_list, data_matrix = matrix_parts
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      category_list=category_list, data_matrix=data_matrix)

    binary_data, allele_index2allele_ls = snpData.convert2Binary(self.report)

    if self.mapping_fname:
        #output allele_index2allele_ls
        self.output_allele2index_ls(snpData, allele_index2allele_ls,
                                    self.mapping_fname)
    binary_data.tofile(self.output_fname)
def getBeforeGADAIntensityData(self, input_fname):
    """
    Read input_fname through CNVNormalize.get_input() and return it as a
    SNPData.

        rows: one per array id in header[1:-2]; the row id is the array's
            maternal_ecotype_id as a string, or '-1' when
            Stock_250kDB.ArrayInfo has no such array.
        columns: '<chr>_<pos>' built from chr_pos_ls.
        data_matrix: the transpose of the matrix returned by get_input().
    """
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                   username=self.db_user,
                                   password=self.db_passwd,
                                   hostname=self.hostname,
                                   database=self.dbname,
                                   schema=self.schema)
    db.setup(create_tables=False)
    session = db.session

    data_matrix, probe_id_ls, chr_pos_ls, header = CNVNormalize.get_input(input_fname)

    col_id_ls = ['%s_%s' % (chr_pos[0], chr_pos[1]) for chr_pos in chr_pos_ls]

    ecotype_id_ls = []
    for array_id in header[1:-2]:
        array = Stock_250kDB.ArrayInfo.get(int(array_id))
        ecotype_id = -1  # placeholder for arrays missing from the database
        if array:
            ecotype_id = array.maternal_ecotype_id
        ecotype_id_ls.append('%s' % ecotype_id)

    return SNPData(row_id_ls=ecotype_id_ls, col_id_ls=col_id_ls,
                   data_matrix=data_matrix.transpose())
def run(self):
    """
    Overlay the histograms of the two phenotype lists returned by
    self.get_phenotype_ls() (ancestral_allele_phenotype_ls and
    derived_allele_phenotype_ls) for one results_method.

        The results_method is looked up by call_method_id, analysis_method_id
        and phenotype_method_id. With several matches a warning is printed and
        the first one is used; with none the program exits with status 3.
        A list is drawn only when it has more than 2 values. The figure is
        saved as '<output_fname_prefix>.svg' when self.output_fname_prefix is
        set.
    """
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                   username=self.user,
                                   password=self.passwd,
                                   hostname=self.hostname,
                                   database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    if self.debug:
        import pdb
        pdb.set_trace()

    chr_pos2ancestral_allele = self.get_chr_pos2ancestral_allele(
        self.ancestral_allele_fname)
    pheno_data = SNPData(input_fname=self.phenotype_fname,
                         turn_into_integer=0, ignore_2nd_column=1)
    pheno_data = self.process_phenotype_data(pheno_data)
    geno_data = SNPData(input_fname=self.genotype_fname, turn_into_array=1,
                        matrix_data_type=int, ignore_2nd_column=1)

    query = Stock_250kDB.ResultsMethod.query.filter_by(
        call_method_id=self.call_method_id)
    query = query.filter_by(analysis_method_id=self.analysis_method_id)
    query = query.filter_by(phenotype_method_id=self.phenotype_method_id)
    if query.count() == 1:
        rm = query.first()
    elif query.count() > 1:
        sys.stderr.write(
            "Warning: more than 1 results_method for call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n"
            % (self.call_method_id, self.analysis_method_id,
               self.phenotype_method_id))
        rm = query.first()
    else:
        sys.stderr.write(
            "Error: no results_method for call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n"
            % (self.call_method_id, self.analysis_method_id,
               self.phenotype_method_id))
        sys.exit(3)

    phenotype_ls_data = self.get_phenotype_ls(rm, self.no_of_top_snps,
                                              chr_pos2ancestral_allele,
                                              pheno_data, geno_data,
                                              self.min_MAF,
                                              results_directory=self.input_dir)

    import pylab
    pylab.clf()
    hist_patch_ls = []
    legend_ls = []
    # (attribute holding the phenotypes, legend label, extra hist() options)
    histogram_spec_ls = [
        ('ancestral_allele_phenotype_ls', 'ancestral allele', {}),
        ('derived_allele_phenotype_ls', 'derived allele', {'facecolor': 'r'}),
    ]
    for attribute_name, legend_label, extra_hist_kwargs in histogram_spec_ls:
        phenotype_ls = getattr(phenotype_ls_data, attribute_name)
        if len(phenotype_ls) > 2:
            hist_output = pylab.hist(phenotype_ls, 100, alpha=0.4, normed=1,
                                     **extra_hist_kwargs)
            #first patch in all patches of a histogram
            hist_patch_ls.append(hist_output[2][0])
            legend_ls.append(legend_label)
    pylab.legend(hist_patch_ls, legend_ls)
    if self.output_fname_prefix:
        pylab.savefig('%s.svg' % self.output_fname_prefix, dpi=300)
def run(self):
    """
    Scatter-plot one phenotype (self.phenotype_method_id1, x-axis) against
    another (self.phenotype_method_id2, y-axis) from the matrix in
    self.phenotype_fname and save the figure as
    '<output_fname_prefix>.png' (dpi=400) and '<output_fname_prefix>.svg'.

        Only rows where neither value is NaN are plotted. A green diagonal
        spanning the value range shared by both phenotypes marks perfect
        correlation; it is left out when no row has both values, because
        there is no range to draw it over.
        Axis labels are the short_name of the two PhenotypeMethod entries.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                   username=self.db_user,
                                   password=self.db_passwd,
                                   hostname=self.hostname,
                                   database=self.dbname,
                                   schema=self.schema)
    db.setup(create_tables=False)
    session = db.session

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    #row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
    phenData = SNPData(header=header_phen,
                       strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    #tricky, using strain_acc_list_phen
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        phenData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

    phenotype_col_index1 = self.findOutWhichPhenotypeColumn(
        phenData, Set([self.phenotype_method_id1]))[0]
    phenotype_col_index2 = self.findOutWhichPhenotypeColumn(
        phenData, Set([self.phenotype_method_id2]))[0]

    x_ls = []
    y_ls = []
    for i in range(phenData.data_matrix.shape[0]):
        x_value = phenData.data_matrix[i][phenotype_col_index1]
        y_value = phenData.data_matrix[i][phenotype_col_index2]
        if not numpy.isnan(x_value) and not numpy.isnan(y_value):
            x_ls.append(x_value)
            y_ls.append(y_value)

    pylab.clf()
    pylab.title('Phenotype Contrast')
    pylab.plot(x_ls, y_ls, '.', alpha=0.6)
    pylab.grid(alpha=0.3)
    phenotype_method1 = Stock_250kDB.PhenotypeMethod.get(
        self.phenotype_method_id1)
    phenotype_method2 = Stock_250kDB.PhenotypeMethod.get(
        self.phenotype_method_id2)
    pylab.xlabel(phenotype_method1.short_name)
    pylab.ylabel(phenotype_method2.short_name)

    #draw diagonal line to show perfect correlation
    if x_ls:
        # min()/max() fail on empty lists, i.e. when no row has both values
        max_min_value = max(min(x_ls), min(y_ls))
        min_max_value = min(max(x_ls), max(y_ls))
        pylab.plot([max_min_value, min_max_value],
                   [max_min_value, min_max_value], c='g', alpha=0.7)

    png_output_fname = '%s.png' % self.output_fname_prefix
    pylab.savefig(png_output_fname, dpi=400)
    pylab.savefig('%s.svg' % self.output_fname_prefix)
def doFilter(self, snpData, snpData_qc_strain, snpData_qc_snp, min_call_probability, max_call_mismatch_rate, max_call_NA_rate,\
    max_snp_mismatch_rate, max_snp_NA_rate, npute_window_size , output_dir=None):
    """
    2009-10-11
        replace imputeData() with NPUTE.samplingImpute(..., no_of_accessions_per_sampling=300, coverage=3) to avoid memory blowup.
    2008-12-22
        replace '=' and ',' with '_' in the output filename
    2008-05-19
        matrix_ls has to be of length >0 before concatenation
    2008-05-19
        use SNPData structure
    2008-05-18
        add onlyCommon=True to FilterAccessions.filterByError()
    2008-05-17
        add argument output_dir. if it's available, output data matrix before and after imputation
    2008-05-12
        add
            qcdata.no_of_accessions_filtered_by_mismatch
            qcdata.no_of_accessions_filtered_by_na
            qcdata.no_of_snps_filtered_by_mismatch
            qcdata.no_of_snps_filtered_by_na
            qcdata.no_of_monomorphic_snps_removed
    2008-05-11
        split up from computing_node_handler

        Run one filter + imputation pass over snpData with one set of
        thresholds and collect the QC statistics.

        Steps, in this order:
            1. remove rows by mismatch rate against snpData_qc_strain, then by NA rate
            2. remove columns by mismatch rate against snpData_qc_snp, then by NA rate
            3. merge with snpData_qc_snp (priority=2), remove monomorphic columns
            4. row-wise and column-wise comparison against snpData_qc_strain (the "...rate0" attributes)
            5. if more than 5 rows are left, impute with NPUTE.samplingImpute() and
               compare again (the "...rate1" attributes)

        Argument:
            min_call_probability: only used in the output file name and copied into the result.
            npute_window_size: converted with int() before it is given to NPUTE.
            output_dir: if given (created when missing), the matrix is written there before
                and after imputation.

        Return:
            a list holding one PassingData (qcdata). Its row_id2NA_mismatch_rate1 and
            col_id2NA_mismatch_rate1 attributes are assigned only when the imputation step ran.
    """
    qcdata = PassingData()
    # step 1: accession (row) filtering against the strain QC data
    twoSNPData = TwoSNPData(SNPData1=snpData, SNPData2=snpData_qc_strain, \
        row_matching_by_which_value=0, debug=self.debug)
    row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
    del twoSNPData
    newSnpData = SNPData.removeRowsByMismatchRate(snpData, row_id2NA_mismatch_rate, max_call_mismatch_rate)
    qcdata.no_of_accessions_filtered_by_mismatch = newSnpData.no_of_rows_removed
    newSnpData = SNPData.removeRowsByNARate(newSnpData, max_call_NA_rate)
    qcdata.no_of_accessions_filtered_by_na = newSnpData.no_of_rows_removed
    # step 2: SNP (column) filtering against the SNP QC data
    twoSNPData = TwoSNPData(SNPData1=newSnpData, SNPData2=snpData_qc_snp, \
        row_matching_by_which_value=0, debug=self.debug)
    col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
    del twoSNPData
    newSnpData = SNPData.removeColsByMismatchRate(newSnpData, col_id2NA_mismatch_rate, max_snp_mismatch_rate)
    qcdata.no_of_snps_filtered_by_mismatch = newSnpData.no_of_cols_filtered_by_mismatch
    newSnpData = SNPData.removeColsByNARate(newSnpData, max_snp_NA_rate)
    qcdata.no_of_snps_filtered_by_na = newSnpData.no_of_cols_filtered_by_na
    # step 3: merge with the SNP QC data, then drop monomorphic columns
    twoSNPData = TwoSNPData(SNPData1=newSnpData, SNPData2=snpData_qc_snp, \
        row_matching_by_which_value=0, debug=self.debug)
    newSnpData = twoSNPData.mergeTwoSNPData(priority=2)
    del twoSNPData
    #MergeSnpsData.merge(snpsd_250k_tmp, snpsd_ls_qc_snp, unionType=0, priority=2)
    newSnpData = SNPData.removeMonomorphicCols(newSnpData)
    qcdata.no_of_monomorphic_snps_removed = newSnpData.no_of_monomorphic_cols
    #FilterSnps.filterMonomorphic(snpsd_250k_tmp)
    if output_dir:
        #output data here
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        # every threshold becomes part of the file name; reused for the after-imputation file below
        output_fname_prefix_ls = ['min_oligo_call_probability_%s'%min_call_probability,\
            'max_array_mismatch_rate_%s'%max_call_mismatch_rate,\
            'max_array_NA_rate_%s'%max_call_NA_rate,\
            'max_snp_mismatch_rate_%s'%max_snp_mismatch_rate,\
            'max_snp_NA_rate_%s'%max_snp_NA_rate,\
            'npute_window_size_%s'%npute_window_size]
        output_fname = os.path.join(
            output_dir,
            '_'.join(output_fname_prefix_ls + ['before_imputation.tsv']))
        newSnpData.tofile(output_fname)
        #chromosomes = [snpsd_250k_tmp[i].chromosome for i in range(len(snpsd_250k_tmp))]
        #snpsdata.writeRawSnpsDatasToFile(output_fname, snpsd_250k_tmp, chromosomes=chromosomes, deliminator=',', withArrayIds = True)
    # the string statement below is disabled legacy code; it has no effect at run time
    """ qcdata.no_of_snps_filtered_by_mismatch = 0 qcdata.no_of_snps_filtered_by_na = 0 qcdata.no_of_monomorphic_snps_removed = 0 for snpsd in snpsd_250k_tmp: qcdata.no_of_snps_filtered_by_mismatch += snpsd.no_of_snps_filtered_by_mismatch qcdata.no_of_snps_filtered_by_na += snpsd.no_of_snps_filtered_by_na qcdata.no_of_monomorphic_snps_removed += snpsd.no_of_monomorphic_snps_removed """
    #snpData0 = RawSnpsData_ls2SNPData(snpsd_250k_tmp)
    # step 4: QC statistics before imputation
    twoSNPData0 = TwoSNPData(SNPData1=newSnpData, SNPData2=snpData_qc_strain, \
        row_matching_by_which_value=0)
    row_id2NA_mismatch_rate0 = twoSNPData0.cmp_row_wise()
    col_id2NA_mismatch_rate0 = twoSNPData0.cmp_col_wise()
    del twoSNPData0
    result = []
    #for npute_window_size in npute_window_size_ls:
    #snpsd_250k_tmp_1 = copy.deepcopy(snpsd_250k_tmp) #deepcopy, otherwise snpsd_250k_tmp_1[i].snps = [] would clear snpsd_250k_tmp up as well
    # step 5: imputation is attempted only when more than 5 rows are left
    if len(newSnpData.row_id_ls) > 5:
        snps_name_ls = newSnpData.col_id_ls
        ## 2009-10-8 use NPUTE.samplingImpute()
        imputed_matrix, new_snps_name_ls = NPUTE.samplingImpute(snps_name_ls, newSnpData.data_matrix, \
            input_file_format=1, input_NA_char=0, lower_case_for_imputation=False,\
            npute_window_size=int(npute_window_size), \
            no_of_accessions_per_sampling=300, coverage=3)
        snpData_imputed = SNPData(row_id_ls=newSnpData.row_id_ls,
                                  col_id_ls=new_snps_name_ls,
                                  data_matrix=imputed_matrix)
        # the string statement below is disabled legacy code; it has no effect at run time
        """ ## 2009-10-8 use NPUTE.samplingImpute() instead. comment out below chr2no_of_snps = NPUTE.get_chr2no_of_snps(snps_name_ls) chr_ls = chr2no_of_snps.keys() chr_ls.sort() snpData_imputed = SNPData(row_id_ls = newSnpData.row_id_ls, col_id_ls=[]) matrix_ls = [] for chromosome in chr_ls: if chr2no_of_snps[chromosome]>5: #enough for imputation npute_data_struc = NPUTESNPData(snps_name_ls=snps_name_ls, data_matrix=newSnpData.data_matrix, chromosome=chromosome, \ input_file_format=1, input_NA_char=0) imputeData(npute_data_struc, int(npute_window_size)) matrix_ls.append(npute_data_struc.snps) snpData_imputed.col_id_ls += npute_data_struc.chosen_snps_name_ls if len(matrix_ls)>0: snpData_imputed.data_matrix = numpy.transpose(numpy.concatenate(matrix_ls)) """
        if output_dir:
            #2008-05-16 write the data out if output_fname is available
            #chromosomes = [snpsd_250k_tmp[i].chromosome for i in range(len(snpsd_250k_tmp))] #already produced in the previous before_imputation output
            output_fname = os.path.join(
                output_dir,
                '_'.join(output_fname_prefix_ls + ['after_imputation.tsv']))
            #snpsdata.writeRawSnpsDatasToFile(output_fname, snpsd_250k_tmp, chromosomes=chromosomes, deliminator=',', withArrayIds = True)
            snpData_imputed.tofile(output_fname)
        # QC statistics after imputation
        twoSNPData1 = TwoSNPData(SNPData1=snpData_imputed, SNPData2=snpData_qc_strain, \
            row_matching_by_which_value=0)
        qcdata.row_id2NA_mismatch_rate1 = twoSNPData1.cmp_row_wise()
        qcdata.col_id2NA_mismatch_rate1 = twoSNPData1.cmp_col_wise()
        del twoSNPData1, snpData_imputed
    else:
        # no imputation: the "...rate1" attributes of qcdata are not assigned in this branch
        snpData_imputed = None
        #qcdata.row_id2NA_mismatch_rate1 = {}
        #qcdata.col_id2NA_mismatch_rate1 = {}
    del newSnpData
    # the string statement below is disabled legacy code; it has no effect at run time
    """ for i in range(len(snpsd_250k_tmp)): #snpsd_250k_tmp_1[i].snps = [] #clear it up if len(snpsd_250k_tmp[i].accessions)>5 and len(snpsd_250k_tmp[i].positions)>5: #not enough for imputation npute_data_struc = NPUTESNPData(inFile=snpsd_250k_tmp[i], input_NA_char='NA', input_file_format=4, lower_case_for_imputation=0) imputeData(npute_data_struc, int(npute_window_size)) snpsd_250k_tmp[i].snps = npute_data_struc.snps del npute_data_struc """
    # copy the before-imputation statistics and the thresholds used into the result
    qcdata.row_id2NA_mismatch_rate0 = row_id2NA_mismatch_rate0
    qcdata.col_id2NA_mismatch_rate0 = col_id2NA_mismatch_rate0
    qcdata.min_call_probability = min_call_probability
    qcdata.max_call_mismatch_rate = max_call_mismatch_rate
    qcdata.max_call_NA_rate = max_call_NA_rate
    qcdata.max_snp_mismatch_rate = max_snp_mismatch_rate
    qcdata.max_snp_NA_rate = max_snp_NA_rate
    qcdata.npute_window_size = npute_window_size
    result.append(qcdata)
    return result
def run(self):
    """
    Entry point executed by every MPI process; the rank decides the role.

    rank 0
        input node. Calls inputNodePrepare(), sends the pickled output-node
        data to the last rank and the pickled SNP data + auxiliary data to
        every computing node, then hands out parameters through inputNode().
    ranks 1 .. size-2
        computing nodes. Receive the two pickles from rank 0 and run
        computing_node() with computing_node_handler.
    last rank
        output node. Receives phenotype labels/indices from rank 0 and runs
        general_output_node().

    If self.debug is set, the process drops into pdb, loads all input data
    locally (one-node testing, no MPI traffic) and exits with status 2.

    2008-09-06
    """
    if self.debug:
        # for one-node testing purpose
        import pdb
        pdb.set_trace()
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname)
        # category_list is not used to facilitate row-id matching
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                          data_matrix=data_matrix, turn_into_array=1)

        # NOTE: unpickling executes arbitrary code, so snps_context_fname
        # has to be a trusted file.
        picklef = open(self.snps_context_fname)
        try:
            snps_context_wrapper = cPickle.load(picklef)
        finally:
            # close explicitly instead of relying on garbage collection
            picklef.close()
        gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
        gene_id_ls = gene_id2snps_id_ls.keys()
        gene_id_ls.sort()

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        # row label is that of the SNP matrix, because the phenotype matrix
        # is gonna be re-ordered in that way
        phenData = SNPData(header=header_phen,
                           strain_acc_list=strain_acc_list,
                           data_matrix=data_matrix_phen)
        # The matrix is still in phenotype-file order at this point, so the
        # re-ordering has to be driven by the phenotype file's own row ids
        # (strain_acc_list_phen), not by phenData.row_id_ls which already
        # carries the SNP-matrix labels.
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

        # the pickles below are only built so they can be inspected in pdb
        other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls,
                                 gene_id_ls=gene_id_ls, phenData=phenData)
        other_data_pickle = cPickle.dumps(other_data, -1)
        phenotype_label_ls_pickle = cPickle.dumps(phenData.col_id_ls, -1)
        snpData_pickle = cPickle.dumps(snpData, -1)
        sys.exit(2)

    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    # exclude the 1st (input) and last (output) node
    free_computing_nodes = range(1, self.communicator.size - 1)
    free_computing_node_set = set(free_computing_nodes)
    output_node_rank = self.communicator.size - 1

    # ---- phase 1: distribute the initial data ----
    if node_rank == 0:
        dstruc = self.inputNodePrepare()
        params_ls = dstruc.params_ls
        # send the output node the phenotype_label_ls
        self.communicator.send(dstruc.output_node_data_pickle,
                               output_node_rank, 0)
        del dstruc.output_node_data_pickle

        for node in free_computing_nodes:  # send it to the computing_node
            sys.stderr.write(
                "passing initial data to nodes from %s to %s ... " %
                (node_rank, node))
            # order matters: computing nodes read snpData first, then other_data
            self.communicator.send(dstruc.snpData_pickle, node, 0)
            self.communicator.send(dstruc.other_data_pickle, node, 0)
            sys.stderr.write(".\n")
        del dstruc
    elif node_rank in free_computing_node_set:
        data, source, tag = self.communicator.receiveString(0, 0)
        snpData = cPickle.loads(data)
        del data
        data, source, tag = self.communicator.receiveString(0, 0)
        other_data = cPickle.loads(data)
        del data
        self.phenotype_index_ls = other_data.phenotype_index_ls
    else:
        data, source, tag = self.communicator.receiveString(0, 0)
        output_node_data = cPickle.loads(data)
        phenotype_label_ls = output_node_data.phenotype_label_ls
        self.phenotype_index_ls = output_node_data.phenotype_index_ls

    self.synchronize()

    # ---- phase 2: each role does its work ----
    if node_rank == 0:
        param_obj = PassingData(params_ls=params_ls,
                                output_node_rank=output_node_rank,
                                report=self.report, counter=0)
        self.inputNode(param_obj, free_computing_nodes,
                       param_generator=params_ls)
    elif node_rank in free_computing_node_set:
        computing_parameter_obj = PassingData(
            snpData=snpData, gene_id_ls=other_data.gene_id_ls,
            gene_id2snps_id_ls=other_data.gene_id2snps_id_ls,
            phenData=other_data.phenData,
            phenotype_index_ls=self.phenotype_index_ls,
            min_data_point=self.min_data_point, test_type=self.test_type)
        self.computing_node(computing_parameter_obj,
                            self.computing_node_handler)
    else:
        self.general_output_node(self.output_dir, self.phenotype_index_ls,
                                 phenotype_label_ls, free_computing_nodes)
    self.synchronize()  # to avoid some node early exits
def run(self):
    """
    Entry point executed by every MPI process; the rank decides the role.

    rank 0
        input node. Restricts the SNP dataset to the non-duplicate arrays
        of self.cnv_method_id, builds the CNV -> nearby-SNP structure via
        createCNVRBDict(), ships the compressed, pickled SNP dataset to
        every computing node and then hands out parameters via inputNode().
    ranks 1 .. size-2
        computing nodes. Open their own database handle, receive the SNP
        dataset and run computing_node() with computing_node_handler.
    last rank
        output node. Writes whatever output_node_handler produces to
        self.output_fname (tab-delimited) if that attribute is set.
    """
    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    # exclude the 1st (input) and last (output) node
    free_computing_nodes = range(1, self.communicator.size - 1)
    free_computing_node_set = set(free_computing_nodes)
    output_node_rank = self.communicator.size - 1

    # ---- phase 1: prepare and distribute the SNP dataset ----
    if node_rank == 0:
        if self.debug:  # for one-node testing purpose
            import pdb
            pdb.set_trace()
        db_250k = Stock_250kDB.Stock_250kDB(
            drivername=self.drivername, username=self.db_user,
            password=self.db_passwd, hostname=self.hostname,
            database=self.dbname, schema=self.schema)
        db_250k.setup(create_tables=False)
        # NOTE(review): not used below; the attribute access is retained in
        # case the session is created lazily -- confirm before removing.
        session = db_250k.session

        # 2010-9-30 arrays that belong to this CNV method
        non_duplicate_array_id_ls = CNVMergeAcrossArrays.getNonDuplicateArraysWithHighestMedianIntensity(
            db_250k, self.cnv_method_id,
            table_name=Stock_250kDB.CNVArrayCall.table.name)
        non_duplicate_array_id_set = set(non_duplicate_array_id_ls)

        # read in the SNP set and keep only arrays in the CNV method set
        snpData = SNPData(input_fname=self.input_fname, turn_into_array=1)
        row_index_to_be_kept = []
        for row_id, row_index in snpData.row_id2row_index.iteritems():
            array_id = int(row_id[1])  # 2nd element of the row id is the array id
            if array_id in non_duplicate_array_id_set:
                row_index_to_be_kept.append(row_index)
        snpData = snpData.keepRowsByRowIndex(snpData, row_index_to_be_kept)

        # a map between array_id and its row index in the (filtered) SNP dataset
        array_id2row_index = {}
        for row_id, row_index in snpData.row_id2row_index.iteritems():
            array_id2row_index[int(row_id[1])] = row_index

        # create a map (RBDict) between each CNV and its nearby SNPs;
        # the CNVs come from the db
        CNVRBdict = self.createCNVRBDict(
            db_250k, self.cnv_method_id, self.max_CNV_SNP_dist,
            array_id2row_index=array_id2row_index,
            snp_id_ls=snpData.col_id_ls)

        # attached so that it travels to the computing nodes inside the pickle
        snpData.array_id2row_index = array_id2row_index
        snpData_pickle = zlib.compress(cPickle.dumps(snpData, -1))
        for node in free_computing_nodes:  # send it to the computing_node
            sys.stderr.write("passing initial data to nodes from %s to %s ... "%(node_rank, node))
            self.communicator.send(snpData_pickle, node, 0)
            sys.stderr.write(".\n")
        del snpData_pickle
        del snpData
        params_ls = self.generate_params(CNVRBdict,)
    elif node_rank in free_computing_node_set:
        db_250k = Stock_250kDB.Stock_250kDB(
            drivername=self.drivername, username=self.db_user,
            password=self.db_passwd, hostname=self.hostname,
            database=self.dbname, schema=self.schema)
        db_250k.setup(create_tables=False)
        session = db_250k.session  # NOTE(review): unused, see rank 0 above

        data, source, tag = self.communicator.receiveString(0, 0)
        data = zlib.decompress(data)  # 2010-10-1 decompress
        snpData = cPickle.loads(data)
        del data
    else:
        pass

    self.synchronize()

    # ---- phase 2: each role does its work ----
    if node_rank == 0:
        param_obj = PassingData(params_ls=params_ls,
                                output_node_rank=output_node_rank,
                                report=self.report, counter=0)
        self.inputNode(param_obj, free_computing_nodes,
                       param_generator=params_ls,
                       message_size=self.message_size)
    elif node_rank in free_computing_node_set:
        computing_parameter_obj = PassingData(
            snpData=snpData, min_LD_to_output=self.min_LD_to_output,
            min_MAF=self.min_MAF, discard_perc=self.discard_perc,
            db_250k=db_250k,
            array_id2row_index=snpData.array_id2row_index)
        self.computing_node(computing_parameter_obj,
                            self.computing_node_handler)
    else:
        output_file = None
        writer = None
        if getattr(self, 'output_fname', None):
            output_file = open(self.output_fname, 'w')
            writer = csv.writer(output_file, delimiter='\t')
        try:
            param_obj = PassingData(writer=writer, is_header_written=False)
            self.output_node(free_computing_nodes, param_obj,
                             self.output_node_handler)
        finally:
            # param_obj keeps a reference to the writer, so dropping the
            # local name would not close the file; close it explicitly so
            # the output is on disk before the final barrier.
            if output_file is not None:
                output_file.close()
    self.synchronize()  # to avoid some node early exits
def run(self):
    """
    Compare the calls stored in the stock database with the reference
    dataset of self.QC_method_id and optionally store the outcome.

    Order of operations:
        1. open a StockDB session and begin a transaction
        2. read the comparison matrix from self.cmp_data_filename
        3. save a README entry describing this invocation
        4. pull the call matrix of the QC method out of the database
        5. pair both datasets in a TwoSNPData
        6. run_type 1: row-wise comparison
           run_type 2: nothing is computed (placeholder)
           other:      error message, exit status 5
        7. write the row-wise result to self.output_fname and/or submit it
           to the call-QC table when the respective options allow it
        8. commit if self.commit is set, roll back otherwise
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    stock_db = StockDB.StockDB(drivername=self.drivername,
                               username=self.db_user,
                               password=self.db_passwd,
                               hostname=self.hostname,
                               database=self.dbname)
    stock_db.setup(create_tables=False)
    session = stock_db.session
    session.begin()

    # --- reference dataset (category_list of the file is not used) ---
    self.cmp_data_filename = self.findOutCmpDataFilename(
        self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
    cmp_header, cmp_strain_acc_ls, cmp_category_ls, cmp_matrix = read_data(
        self.cmp_data_filename)
    # it's ecotypeid; cast to integer to be compatible with the ids from db
    cmp_ecotype_id_ls = [int(strain_acc) for strain_acc in cmp_strain_acc_ls]
    reference_data = SNPData(header=cmp_header,
                             strain_acc_list=cmp_ecotype_id_ls,
                             data_matrix=cmp_matrix)

    readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
    session.save(readme)

    # --- dataset stored in the database ---
    import MySQLdb
    mysql_conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                                 user=self.db_user, passwd=self.db_passwd)
    curs = mysql_conn.cursor()
    from dbSNP2data import dbSNP2data
    snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
        curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
    strain_info = self.get_strain_id_info(self.QC_method_id)
    db_matrix = self.get_data_matrix(stock_db, strain_info.strain_id2index,
                                     snp_id2index, StockDB.Calls.table.name)

    db_strain_acc_ls = []
    db_category_ls = []
    for strain_id in strain_info.strain_id_list:
        db_strain_acc_ls.append(strain_info.strain_id2acc[strain_id])
        db_category_ls.append(strain_info.strain_id2category[strain_id])

    # two label columns followed by one column per SNP name
    snp_name_ls = [snp_name for snp_name, _chromosome, _position in
                   [snp_id2info[snp_id] for snp_id in snp_id_list]]
    db_header = ['ecotypeid', 'strainid'] + snp_name_ls
    db_data = SNPData(header=db_header, strain_acc_list=db_strain_acc_ls,
                      category_list=db_category_ls, data_matrix=db_matrix,
                      snps_table='stock.snps')

    twoSNPData = TwoSNPData(SNPData1=db_data, SNPData2=reference_data,
                            curs=curs, QC_method_id=self.QC_method_id,
                            user=self.db_user, row_matching_by_which_value=0,
                            debug=self.debug)

    # --- comparison ---
    if self.run_type not in (1, 2):
        sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
        sys.exit(5)
    if self.run_type == 1:
        row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
    else:
        # 2008-08-18 column-wise saving still has to be implemented for 149SNP
        row_id2NA_mismatch_rate = {}

    # --- output / db submission; both need a non-empty row-wise result ---
    has_row_result = self.run_type == 1 and bool(row_id2NA_mismatch_rate)
    if has_row_result and self.output_fname:
        self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate,
                                            self.output_fname)
    # if self.input_dir is given, no db submission
    if has_row_result and self.commit and not self.input_dir:
        self.submit_to_call_QC(session, row_id2NA_mismatch_rate,
                               self.QC_method_id, self.db_user,
                               twoSNPData.row_id12row_id2, readme)

    if not self.commit:
        session.rollback()
    else:
        session.commit()
def loadDataStructure(self, db_250k=None, association_locus_id=None, association_landscape_type_id=None, \
        locusExtensionDistance=5000,\
        data_dir=None, list_type_id_list=None, gene_annotation_pickleFname=None, \
        snpInfoPickleFname=None, locus_type_id=1, snp_matrix_fname=None, snp_matrix_data_type=None, \
        phenotype_fname=None):
    """
    Gather everything that describes one association locus and return it
    bundled in a PassingData object.

    Collected items:
        associationLocus, associationLandscapeType -- fetched by id
        landscape_gwr_ls  -- one genome-wide result per distinct association
                             landscape of the locus' peaks, restricted to
                             the locus extended by locusExtensionDistance
                             on both sides (start is never below 1)
        centralLocus      -- SNPPassingData describing the extended region
        gene_annotation, snp_info (None without snpInfoPickleFname)
        candidate_gene_set -- union of the gene lists in list_type_id_list
        snpData, phenData, ecotype_info -- only loaded when both
                             snp_matrix_fname and phenotype_fname are
                             given, otherwise None
        LD_info           -- always None

    Exits with status 3 if the SNP matrix file yields no data matrix.

    2012.11.14
    """
    sys.stderr.write("Fetching GWAS landscape for association-locus %s, landscape type %s ..."%(association_locus_id, association_landscape_type_id))
    associationLocus = Stock_250kDB.AssociationLocus.get(association_locus_id)
    associationLandscapeType = Stock_250kDB.AssociationLandscapeType.get(association_landscape_type_id)

    # the interval in which landscapes are fetched
    region_start = max(1, associationLocus.start - locusExtensionDistance)
    region_stop = associationLocus.stop + locusExtensionDistance
    # report controls whether getResultMethodContent() will report progress.
    landscape_query = PassingData(min_MAF=associationLandscapeType.min_MAF,
                                  data_dir=data_dir, need_chr_pos_ls=0,
                                  chromosome=associationLocus.chromosome,
                                  start=region_start, stop=region_stop,
                                  report=False)

    landscape_gwr_ls = []
    seen_landscape_ids = set()
    for peak in associationLocus.association_peak_ls:
        landscape = db_250k.getAssociationLandscape(
            result_id=peak.result_id,
            association_landscape_type_id=associationLandscapeType.id)
        if not landscape:
            continue
        if landscape.id in seen_landscape_ids:
            # several peaks may share one landscape; load it only once
            continue
        seen_landscape_ids.add(landscape.id)
        landscape_gwr_ls.append(
            db_250k.getResultMethodContent(association_landscape=landscape,
                                           data_dir=data_dir,
                                           construct_chr_pos2index=True,
                                           pdata=landscape_query))
        # backspaces rewind the cursor so the running count is overwritten
        sys.stderr.write(" %s%s "%('\x08'*80, len(landscape_gwr_ls)))
    sys.stderr.write("%s landscapes.\n"%(len(landscape_gwr_ls)))

    centralLocus = SNPPassingData(chromosome=associationLocus.chromosome,
                                  position=region_start,
                                  snps_id=associationLocus.id,
                                  start=region_start, stop=region_stop,
                                  fileNamePrefix="")

    gene_annotation = DrawSNPRegion.dealWithGeneAnnotation(gene_annotation_pickleFname)
    snp_info = None
    if snpInfoPickleFname:
        snp_info = db_250k.dealWithSNPInfo(snpInfoPickleFname, locus_type_id=locus_type_id)  # 2012.3.8

    candidate_gene_set = set()
    for list_type_id in list_type_id_list or []:
        candidate_gene_set.update(db_250k.getGeneList(list_type_id))

    snpData = None
    phenData = None
    ecotype_info = None
    if snp_matrix_fname and phenotype_fname:
        if snp_matrix_data_type != 3:
            matrix_data_type = int
        else:
            matrix_data_type = float  # 2009-3-23 for CNV amplitude file
        snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1,
                          turn_into_array=1, ignore_2nd_column=1,
                          matrix_data_type=matrix_data_type)
        if snpData.data_matrix is None:
            sys.stderr.write("Error. snpData.data_matrix is None.\n")
            sys.exit(3)

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            phenotype_fname, turn_into_integer=0)
        # row label is that of the SNP matrix, because the phenotype matrix
        # is gonna be re-ordered in that way
        phenData = SNPData(header=header_phen,
                           strain_acc_list=snpData.strain_acc_list,
                           data_matrix=data_matrix_phen)
        # tricky: the re-ordering is driven by the phenotype file's own
        # row ids (strain_acc_list_phen)
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

        # 2008-12-05 fake a snp_info for findSNPsInRegion
        DrawSNPRegion.construct_chr_pos2index_forSNPData(snpData, snp_info=snp_info)
        ecotype_info = getEcotypeInfo(db_250k)

    return PassingData(associationLocus=associationLocus,
                       associationLandscapeType=associationLandscapeType,
                       landscape_gwr_ls=landscape_gwr_ls,
                       gene_annotation=gene_annotation, snp_info=snp_info,
                       LD_info=None,
                       candidate_gene_set=candidate_gene_set,
                       snpData=snpData, phenData=phenData,
                       ecotype_info=ecotype_info, centralLocus=centralLocus)
def getCNVQCMatrix(self, probe_id2snp_id_ls, snp_id2tup, snpData, SNP2Col_allele, cnvIntensityData):
    """
    Count, for every (accession, CNV probe) pair, how many of the SNPs
    attached to the probe are mismatches, insertions or deletions.

    Arguments, as used below:
        probe_id2snp_id_ls -- dict: probe_id -> list of (snp_id, disp_pos);
                              probe_id[0] and probe_id[1] form the column
                              label "<0>_<1>" looked up in cnvIntensityData
        snp_id2tup         -- dict: snp_id -> tuple whose 3rd element is
                              the offset (see the comment in the loop)
        snpData            -- genotype data (row_id_ls, col_id2col_index,
                              data_matrix are read)
        SNP2Col_allele     -- dict: snp_id -> reference allele
        cnvIntensityData   -- intensity data (row_id2row_index,
                              col_id2col_index, data_matrix are read)

    Returns a PassingData with four SNPData objects (mismatchData,
    insertionData, deletionData, qcData; rows = snpData.row_id_ls,
    columns = sorted probe ids) and plotData, which holds flat lists with
    one entry per non-NA (accession, probe) pair. Matrix cells that are
    never filled keep the initial value -2.

    2009-2-12
    """
    sys.stderr.write("Getting CNV QC matricies ...")
    # all four matrices share one shape and start out filled with -2
    mismatch_matrix = numpy.zeros(
        [len(snpData.row_id_ls), len(probe_id2snp_id_ls)], numpy.int)
    mismatch_matrix[:] = -2
    insertion_matrix = numpy.zeros(mismatch_matrix.shape, numpy.int)
    insertion_matrix[:] = -2
    deletion_matrix = numpy.zeros(mismatch_matrix.shape, numpy.int)
    deletion_matrix[:] = -2
    qc_matrix = numpy.zeros(mismatch_matrix.shape, numpy.int)
    qc_matrix[:] = -2
    # sorted probe ids define the column order of the output matrices
    cnv_probe_ls = probe_id2snp_id_ls.keys()
    cnv_probe_ls.sort()
    cnv_probe2index = dict(zip(cnv_probe_ls, range(len(cnv_probe_ls))))
    # flat per-(accession, probe) lists, returned inside plotData
    total_disp_pos_ls = []
    total_intensity_ls = []
    total_mismatch_ls = []
    total_insertion_ls = []
    total_deletion_ls = []
    total_mis_ls = []
    for i in range(mismatch_matrix.shape[0]):
        row_id = snpData.row_id_ls[i]
        # accessions without intensity data are skipped (their cells stay -2)
        if row_id in cnvIntensityData.row_id2row_index:
            cnv_row_index = cnvIntensityData.row_id2row_index[row_id]
            for probe_id, snp_id_ls in probe_id2snp_id_ls.iteritems():
                col_index = cnv_probe2index[probe_id]
                probe_id_label = '%s_%s' % (probe_id[0], probe_id[1])
                cnv_col_index = cnvIntensityData.col_id2col_index[
                    probe_id_label]
                no_of_mismatches = 0
                no_of_deletions = 0
                no_of_insertions = 0
                # stays 1 unless at least one SNP of the probe has a call
                is_this_probe_NA = 1
                disp_pos_ls = []
                for snp_id, disp_pos in snp_id_ls:
                    snp_id_tup = snp_id2tup[snp_id]
                    disp_pos_ls.append(disp_pos)
                    snp_col_index = snpData.col_id2col_index[snp_id]
                    allele = snpData.data_matrix[i][snp_col_index]
                    col_allele = SNP2Col_allele[snp_id]
                    # -2 and 0 are treated as missing calls
                    if allele == -2 or allele == 0:
                        continue
                    else:
                        is_this_probe_NA = 0
                    if snp_id_tup[2] != 0:  #the offset is not 0
                        if allele != -1:  #if it's deleted, then it's nothing
                            no_of_insertions += 1
                    elif allele == -1:
                        # offset 0 and the allele is -1: counted as a deletion
                        no_of_deletions += 1
                    elif col_allele == -2 or col_allele == 0:
                        # a call exists but the reference allele is NA:
                        # reported, not counted
                        sys.stderr.write("allele for this accession %s at snp %s is %s while reference allele is NA: %s.\n"%\
                            (snpData.row_id_ls[i], snp_id, allele, col_allele))
                    elif allele != col_allele:
                        no_of_mismatches += 1
                if not is_this_probe_NA:
                    mean_disp_pos = numpy.mean(disp_pos_ls)
                    mismatch_matrix[i][col_index] = no_of_mismatches
                    insertion_matrix[i][col_index] = no_of_insertions
                    deletion_matrix[i][col_index] = no_of_deletions
                    # qc_matrix holds the sum of all three kinds of discrepancy
                    total_mis_count = no_of_mismatches + no_of_insertions + no_of_deletions
                    qc_matrix[i][col_index] = total_mis_count
                    total_disp_pos_ls.append(mean_disp_pos)
                    total_intensity_ls.append(
                        cnvIntensityData.data_matrix[cnv_row_index]
                        [cnv_col_index])
                    total_mismatch_ls.append(no_of_mismatches)
                    total_insertion_ls.append(no_of_insertions)
                    total_deletion_ls.append(no_of_deletions)
                    total_mis_ls.append(total_mis_count)
    plotData = PassingData(total_disp_pos_ls=total_disp_pos_ls, total_intensity_ls=total_intensity_ls,\
        total_mismatch_ls=total_mismatch_ls, total_insertion_ls=total_insertion_ls, total_deletion_ls=total_deletion_ls,\
        total_mis_ls=total_mis_ls)
    mismatchData = SNPData(row_id_ls=snpData.row_id_ls,
                           col_id_ls=cnv_probe_ls,
                           data_matrix=mismatch_matrix)
    insertionData = SNPData(row_id_ls=snpData.row_id_ls,
                            col_id_ls=cnv_probe_ls,
                            data_matrix=insertion_matrix)
    deletionData = SNPData(row_id_ls=snpData.row_id_ls,
                           col_id_ls=cnv_probe_ls,
                           data_matrix=deletion_matrix)
    qcData = SNPData(row_id_ls=snpData.row_id_ls,
                     col_id_ls=cnv_probe_ls,
                     data_matrix=qc_matrix)
    sys.stderr.write("Done.\n")
    return PassingData(mismatchData=mismatchData,
                       insertionData=insertionData,
                       deletionData=deletionData,
                       qcData=qcData,
                       plotData=plotData)
def outputArray(cls, session, curs, output_dir=None, array_info_table=None, snps=None, \
        probes=None, array_id_ls=[], \
        xy_ls=[], chr_pos_ls=[], probes_id_ls=[],\
        call_method_id=0, run_type=1, array_file_directory=None, outputCNVIntensity=True,\
        returnArrayIntensityData=False):
    """
    Read the arrays selected by generateSQLQueryToGetArrays() through R's
    affy package and process their probe intensities according to run_type.

    run_type
        1 -- write one '<array_id>_array_intensity.tsv' per array into
             output_dir with the four probe intensities of every SNP;
             arrays whose output file already exists are skipped
        2 -- collect log10 intensities of the CNV probes at xy_ls into a
             (len(probes_id_ls) x number of arrays) matrix; if
             outputCNVIntensity is true the matrix is written to
             'call_method_<call_method_id>_CNV_intensity.tsv' in output_dir
        3 -- store the median intensity of every array in its ArrayInfo
             entry and add the entry to session
        anything else -- error message and exit status 3

    curs may be an elixirdb metadata.bind or a MySQLdb cursor; a cursor is
    recognised by its fetchall() method.

    If returnArrayIntensityData is true, an SNPData with rows xy_ls and
    columns '<array_id>_<ecotype_id>' is returned; its data_matrix is only
    populated for run_type 2 and is None otherwise. Without
    returnArrayIntensityData the function returns None.

    The list defaults in the signature are never mutated here.

    2010-5-10
        curs could be elixirdb.metadata.bind or MySQLdb.connect
    2010-5-5
        changed to classmethod
        add argument outputCNVIntensity: whether to output CNV intensity data, default=True.
        returnArrayIntensityData: whether return array CNV intensity data in a SNPData structure
    2009-10-9
        add argument array_file_directory.
    2009-3-11
        add run_type=3
            calculate intensity median of all probes in the array and store the value in db
        array_id_ls is a list of array_ids in str type
    2009-3-5
        skip if no probes (if one_snp.probes_id_ls == [-1]*4:) for that SNP (fake SNP in the SNP table)
    2008-12-09
        add option run_type
    2008-07-12
        add option array_id
    2008-04-08
    """
    sys.stderr.write("Outputting arrays ... \n")
    import rpy
    rpy.r.library('affy')
    array_width = None
    # 2010-5-5 create output_dir only if one was given
    if run_type != 3 and output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    sql_query = cls.generateSQLQueryToGetArrays(array_info_table, array_id_ls=array_id_ls, \
        call_method_id=call_method_id, run_type=run_type)
    # same stdout output as a print statement, but parses under any Python
    sys.stdout.write("%s\n" % (sql_query,))
    rows = curs.execute(sql_query)
    if hasattr(curs, 'fetchall'):  # 2010-5-10 curs is MySQLdb.connect
        rows = curs.fetchall()
        is_elixirdb = 0
        no_of_objects = len(rows)
    else:  # 2010-5-10 otherwise assume curs is elixirdb.metadata.bind
        is_elixirdb = 1
        no_of_objects = int(rows.rowcount)

    # 2008-12-09 the matrix is only allocated for run_type 2 (CNV probe);
    # the name is bound in every case so the return statement is safe.
    data_matrix = None
    if run_type == 2:
        data_matrix = numpy.zeros([len(probes_id_ls), no_of_objects],
                                  numpy.float32)
    array_id_avail_ls = []
    array_label_ls = []
    # enumerate() keeps the progress counter (and the matrix column for
    # run_type 2) in step with the rows even when an array is skipped
    for i, row in enumerate(rows):
        if is_elixirdb:
            array_id = row.array_id
            filename = row.filename
            ecotype_id = row.maternal_ecotype_id
        else:
            array_id, filename, ecotype_id = row[:3]
        array_id_avail_ls.append(array_id)
        array_label_ls.append('%s_%s' % (array_id, ecotype_id))

        # look for the file under array_file_directory if that exists
        if array_file_directory and os.path.isdir(array_file_directory):
            filename = os.path.join(array_file_directory,
                                    os.path.split(filename)[1])

        sys.stderr.write("\t%d/%d: Extracting intensity from %s ... \n" %
                         (i + 1, no_of_objects, filename))

        if run_type == 1:  # output SNP probe intensity within the loop
            output_fname = os.path.join(
                output_dir, '%s_array_intensity.tsv' % (array_id))
            if os.path.isfile(output_fname):
                sys.stderr.write("\tFile %s already exists. Ignore.\n" %
                                 (output_fname))
                continue

        # read array by calling R; the first array read also yields the
        # array width needed to turn (x, y) into an intensity index
        if array_width is None:
            returnData = cls.getArrayWidth(filename)
            intensity_array = returnData.intensity_array
            array = returnData.array
            array_width = returnData.array_width
        else:
            array = rpy.r.read_affybatch(filenames=filename)
            # returns a lengthX1 2-Dimensional array
            intensity_array = rpy.r.intensity(array)

        if run_type == 2:  # CNV probe
            for j in range(len(xy_ls)):
                xpos, ypos = xy_ls[j]
                intensity_array_index = array_width * (array_width - xpos - 1) + ypos
                data_matrix[j][i] = math.log10(
                    intensity_array[intensity_array_index][0])
        elif run_type == 1:  # SNP probe intensity
            outf = open(output_fname, 'w')
            try:
                writer = csv.writer(outf, delimiter='\t')
                header = ['SNP_ID'] + [
                    '%s_%s' % (array_id, probe_label) for probe_label in
                    ('sense1', 'sense2', 'antisense1', 'antisense2')]
                writer.writerow(header)
                for snps_id in snps.snps_id_ls:
                    one_snp = snps.get_one_snp(snps_id)
                    # 2009-3-5 skip if no probes for that SNP
                    # (fake SNP in the SNP table)
                    if one_snp.probes_id_ls == [-1] * 4:
                        continue
                    output_row = [one_snp.snpid]
                    for probes_id in one_snp.probes_id_ls:
                        one_probe = probes.get_one_probe(probes_id)
                        intensity_array_index = array_width * (
                            array_width - one_probe.xpos - 1) + one_probe.ypos
                        output_row.append(
                            intensity_array[intensity_array_index][0])
                    writer.writerow(output_row)
            finally:
                outf.close()
        elif run_type == 3:
            # calculate the intensity median of all probes and store into db
            median_intensity = numpy.median(intensity_array)
            array_info_entry = Stock_250kDB.ArrayInfo.get(array_id)
            array_info_entry.median_intensity = median_intensity
            session.add(array_info_entry)
        else:
            sys.stderr.write("Error: run_type %s is not supported.\n" %
                             run_type)
            sys.exit(3)
        del intensity_array, array  # free the R-side data early

    if run_type == 2 and outputCNVIntensity:
        # 2008-11-13 output in Roger's multi-sample format
        header = ['probes_id'] + array_id_avail_ls + ['chromosome', 'position']
        output_fname = os.path.join(
            output_dir, 'call_method_%s_CNV_intensity.tsv' % (call_method_id))
        outf = open(output_fname, 'w')
        try:
            writer = csv.writer(outf, delimiter='\t')
            writer.writerow(header)
            for row_index in range(data_matrix.shape[0]):
                data_row = [probes_id_ls[row_index]] + \
                    list(data_matrix[row_index]) + list(chr_pos_ls[row_index])
                writer.writerow(data_row)
        finally:
            outf.close()
    sys.stderr.write("Done.\n")
    if returnArrayIntensityData:  # 2010-5-5
        arrayIntensityData = SNPData(row_id_ls=xy_ls,
                                     col_id_ls=array_label_ls,
                                     data_matrix=data_matrix)
        return arrayIntensityData
def prepareTwoSNPData(self, db, max_mismatch_rate=0.25, min_no_of_non_NA_pairs=40, report=0):
    """
    Build the TwoSNPData object that pairs the 149SNP dataset with the
    comparison dataset of self.QC_method_id.

    First dataset: read from self.input_fname if that is set, otherwise
    pulled from the database (a MySQL connection is opened only in that
    case and closed again once the SNP index has been fetched).

    Second dataset: the first dataset itself if self.QC_method_id == 4,
    otherwise the file resolved by findOutCmpDataFilename(), which is also
    stored back into self.cmp_data_filename.

    Arguments:
        db -- database object handed to get_data_matrix()
        max_mismatch_rate, min_no_of_non_NA_pairs, report -- passed on to
            TwoSNPData unchanged

    Returns the TwoSNPData instance.

    2009-9-23
        add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData.
        However it's useless to control what should be inserted into db because TwoSNPData.qc_cross_match_table is not defined
            and even if it's defined, the table it'll create doesn't concord to the one in 149SNP db.
    2008-09-10
        if self.input_fname is given, get 149SNP data from it, instead of database
    2008-8-28
        split out of run() so that MpiQC149CrossMatch could call this easily
    """
    if self.input_fname:
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname)
    else:
        # the raw MySQL connection is only needed for the SNP index query
        import MySQLdb
        from dbSNP2data import dbSNP2data
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                               user=self.db_user, passwd=self.db_passwd)
        try:
            curs = conn.cursor()
            try:
                snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
                    curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
            finally:
                curs.close()
        finally:
            conn.close()
        strain_info_data = self.get_strain_id_info(
            self.QC_method_id, ignore_strains_with_qc=False)
        data_matrix = self.get_data_matrix(
            db, strain_info_data.strain_id2index, snp_id2index,
            StockDB.Calls.table.name)
        strain_acc_list = [
            strain_info_data.strain_id2acc[strain_id]
            for strain_id in strain_info_data.strain_id_list
        ]  # tg_ecotypeid
        category_list = [
            strain_info_data.strain_id2category[strain_id]
            for strain_id in strain_info_data.strain_id_list
        ]  # strainid
        # two label columns followed by one column per SNP name
        header = ['ecotypeid', 'strainid']
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            header.append(snp_name)
    snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list,
                       category_list=category_list, data_matrix=data_matrix,
                       snps_table='stock.snps')

    if self.QC_method_id == 4:
        # method 4 compares the dataset against itself
        snpData2 = snpData1
    else:
        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.cmp_data_filename)
        # it's ecotypeid, cast it to integer to be compatible to the later
        # ecotype_id_ls from db
        strain_acc_list = map(int, strain_acc_list)
        # category_list is not used to facilitate row-id matching
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list,
                           data_matrix=data_matrix)

    twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2,
                            QC_method_id=self.QC_method_id,
                            user=self.db_user, row_matching_by_which_value=0,
                            debug=self.debug,
                            max_mismatch_rate=max_mismatch_rate,
                            min_no_of_non_NA_pairs=min_no_of_non_NA_pairs,
                            report=report)
    return twoSNPData
def run(self):
    """
    Run the QC of the 250k call data against the comparison dataset of
    self.QC_method_id and store the result in self.row_id2NA_mismatch_rate.

    Outline:
        - open a Stock_250kDB session (transaction) and a raw MySQLdb cursor
          (the cursor is also stored as self.curs)
        - QC_method_id 0: only cal_independent_NA_rate() is run and the
          result is None
        - otherwise the call data comes either from the call-info files
          returned by get_call_info_id2fname() or, if self.input_dir is an
          existing file, from that SNP-by-strain matrix file; it is compared
          with the reference through qcDataMatrixVSsnpData()
        - run_type 1 results may be written to self.output_fname and, if
          self.commit is set and no self.input_dir was given, submitted via
          submit_to_call_QC()
        - the transaction is committed if self.commit is set, rolled back
          otherwise

    Exits with status 5 on an unsupported run_type (checked only when the
    call data is taken from the database).

    2008-04-25
        return None if QC_method_id==0
    2008-04-20
        for plone to call it just to get row_id2NA_mismatch_rate
    """
    #database connection and etc
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                   username=self.user,
                                   password=self.passwd,
                                   hostname=self.hostname,
                                   database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()
    #transaction = session.create_transaction()

    self.cmp_data_filename = self.findOutCmpDataFilename(
        self.cmp_data_filename, self.QC_method_id, self.QCMethod_class)
    qm = self.QCMethod_class.query.get(self.QC_method_id)  #2009-5-20

    # second, raw connection next to the session above
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname,
                           host=self.hostname,
                           user=self.user,
                           passwd=self.passwd)
    curs = conn.cursor()
    self.curs = curs
    if self.debug:
        import pdb
        pdb.set_trace()

    # record this invocation
    readme = formReadmeObj(sys.argv, self.ad, Stock_250kDB.README)
    session.add(readme)

    QC_method_id2snps_table = self.QC_method_id2snps_table

    if self.QC_method_id == 0:
        # method 0: no comparison dataset involved
        self.cal_independent_NA_rate(db, self.min_probability, readme)
        row_id2NA_mismatch_rate = None
    else:
        #from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.cmp_data_filename, ignore_het=qm.ignore_het)
        strain_acc_list = map(
            int, strain_acc_list
        )  #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
            data_matrix=data_matrix, snps_table=QC_method_id2snps_table.get(self.QC_method_id),\
            ignore_het=qm.ignore_het) #category_list is not used. 05/20/09 ignore_het is useless cuz data_matrix is provided.
        # the string below is disabled code kept by the original author
        """
        if self.input_dir and os.path.isdir(self.input_dir):
            #04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #no submission to db
            call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
        """
        if self.input_dir and os.path.isfile(self.input_dir):  #it's file
            # the matrix file is read further below
            call_info_id2fname = None
        else:
            if self.run_type == 2:  #no filtering on call_info entries that have been QCed.
                filter_calls_QCed = 0
            elif self.run_type == 1:
                filter_calls_QCed = 1
                self.max_call_info_mismatch_rate = 1  #don't use this when doing accession-wise QC
            else:
                sys.stderr.write("run_type=%s is not supported.\n" %
                                 self.run_type)
                sys.exit(5)
            call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
                filter_calls_QCed, self.max_call_info_mismatch_rate, self.debug,\
                min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs, input_dir=self.input_dir)
            call_info_id2fname = call_data.call_info_id2fname
            # not used further in this method
            call_info_ls_to_return = call_data.call_info_ls_to_return
        if self.run_type == 2:
            snps_name2snps_id = self.get_snps_name2snps_id(db)
        else:
            snps_name2snps_id = None

        # NOTE(review): an empty mapping returned from the database is
        # falsy as well, in which case the else-branch below reads
        # self.input_dir as a matrix file -- confirm that is intended.
        if call_info_id2fname:
            db_id2chr_pos = db.getSNPID2ChrPos()  #2011-22
            from DB_250k2data import DB_250k2Data
            # the SNP index is derived from element [1] of the first
            # mapping value
            db_id2index = DB_250k2Data.getSNPID2index(
                call_info_id2fname.values()[0][1], db_id2chr_pos)
            if self.one_by_one and self.run_type == 1:  #one_by_one only for QC by accession
                # process one call-info at a time and merge the results
                row_id2NA_mismatch_rate = {}
                row_id12row_id2 = {}
                counter = 0
                for call_info_id, value in call_info_id2fname.iteritems():
                    counter += 1
                    print "No", counter
                    tmp_dict = {}
                    tmp_dict[call_info_id] = value
                    pdata = self.read_call_matrix(
                        tmp_dict,
                        self.min_probability,
                        db_id2chr_pos=db_id2chr_pos,
                        db_id2index=db_id2index
                    )  #05/20/09 no need for qm.ignore_het because 250k is all homozygous
                    passingdata = self.qcDataMatrixVSsnpData(
                        pdata, snps_name2snps_id, snpData2, curs, session,
                        readme)
                    row_id2NA_mismatch_rate.update(
                        passingdata.row_id2NA_mismatch_rate)
                    row_id12row_id2.update(passingdata.row_id12row_id2)
                    del pdata

                    # in debug mode stop after 10 call-infos
                    if self.debug and counter == 10:
                        break
            else:
                pdata = self.read_call_matrix(
                    call_info_id2fname,
                    self.min_probability,
                    db_id2chr_pos=db_id2chr_pos,
                    db_id2index=db_id2index
                )  #05/20/09 no need for qm.ignore_het because 250k is all homozygous
                passingdata = self.qcDataMatrixVSsnpData(
                    pdata, snps_name2snps_id, snpData2, curs, session,
                    readme)
                row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                row_id12row_id2 = passingdata.row_id12row_id2
                del pdata
        else:
            #input file is SNP by strain format. double header (1st two lines)
            header, snps_name_ls, category_list, data_matrix = read_data(
                self.input_dir, double_header=1, ignore_het=qm.ignore_het)
            pdata = PassingData()
            # the first two columns of both header lines are skipped
            pdata.ecotype_id_ls = header[0][2:]
            pdata.call_info_id_ls = header[1][2:]
            data_matrix = numpy.array(data_matrix)
            # transposed so that it is strain by SNP
            pdata.data_matrix = data_matrix.transpose()
            pdata.header = ['', ''
                            ] + snps_name_ls  #fake a header for SNPData
            passingdata = self.qcDataMatrixVSsnpData(
                pdata, snps_name2snps_id, snpData2, curs, session, readme)
            row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
            row_id12row_id2 = passingdata.row_id12row_id2
            del pdata

    if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
        self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate,
                                            self.output_fname)

    if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
        #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
        #row_id2NA_mismatch_rate might be None if it's method 0.
        self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.user, self.min_probability, \
            row_id12row_id2, self.call_method_id, readme)
    if self.commit:
        # commit on both the raw cursor and the session
        curs.execute("commit")
        session.commit()
    else:
        session.rollback()

    self.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate  #for plone to get the data structure
def run(self):
    """
    MPI entry point; each process picks its role from its rank.

    - rank 0 (input node): reads self.input_fname into an SNPData, sends
      the pickled object to every computing node, builds the parameter
      list with self.generate_params() and dispatches it via
      self.inputNode().
    - ranks 1 .. size-2 (computing nodes): receive the pickled SNPData
      and run self.computing_node() with self.computing_node_handler.
    - rank size-1 (output node): runs self.output_node() with
      self.output_node_handler, writing tab-delimited rows to
      self.output_fname when that attribute is set and non-empty.

    The output file is now closed explicitly once the output node
    finishes (or fails), instead of relying on garbage collection.
    """
    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    output_node_rank = self.communicator.size - 1
    #exclude the 1st and last node
    free_computing_nodes = range(1, output_node_rank)
    free_computing_node_set = Set(free_computing_nodes)

    if node_rank == 0:
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname)
        #category_list is not used to facilitate row-id matching
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                          data_matrix=data_matrix)
        snpData_pickle = cPickle.dumps(snpData, -1)
        for node in free_computing_nodes:
            #send it to the computing_node
            sys.stderr.write(
                "passing initial data to nodes from %s to %s ... " %
                (node_rank, node))
            self.communicator.send(snpData_pickle, node, 0)
            sys.stderr.write(".\n")
        del snpData_pickle
        params_ls = self.generate_params(len(snpData.col_id_ls),
                                         self.block_size)
        del snpData
    elif node_rank in free_computing_node_set:
        data, source, tag = self.communicator.receiveString(0, 0)
        snpData = cPickle.loads(data)
        del data
    #the output node has nothing to prepare before the barrier

    self.synchronize()

    if node_rank == 0:
        param_obj = PassingData(params_ls=params_ls,
                                output_node_rank=output_node_rank,
                                report=self.report, counter=0)
        self.inputNode(param_obj, free_computing_nodes,
                       param_generator=params_ls)
    elif node_rank in free_computing_node_set:
        computing_parameter_obj = PassingData(
            snpData=snpData, min_LD_to_output=self.min_LD_to_output,
            min_MAF=self.min_MAF, discard_perc=self.discard_perc)
        self.computing_node(computing_parameter_obj,
                            self.computing_node_handler)
    else:
        output_fname = getattr(self, 'output_fname', None)
        output_file = None
        writer = None
        if output_fname:
            #keep the handle so that it can be closed deterministically
            output_file = open(output_fname, 'w')
            writer = csv.writer(output_file, delimiter='\t')
        try:
            param_obj = PassingData(writer=writer, is_header_written=False)
            self.output_node(free_computing_nodes, param_obj,
                             self.output_node_handler)
        finally:
            #param_obj keeps the writer alive, so dropping local names
            #would not flush the file; close it explicitly.
            if output_file is not None:
                output_file.close()
    self.synchronize()  #to avoid some node early exits
def plone_run(self, min_call_info_mismatch_rate=0.1):
    """
    Cross-match the selected 250k calls against the comparison dataset.

    Builds snpData2 from self.cmp_data_filename (looked up from the
    QC_method table's data_description when not given), builds snpData1
    from the calls listed in self.call_info_id_ls (or from the file
    self.input_dir when no matching call is found), and runs
    TwoSNPData.cal_row_id2pairwise_dist() on the pair.

    min_call_info_mismatch_rate: forwarded to self.get_call_info_id2fname()
        when self.input_dir is not set.

    Returns None on every path visible in this function:
    row_id2NA_mismatch_rate is never assigned anything but None.
    Exits the process with status 3 if no comparison file can be found
    for a non-zero QC_method_id.

    2009-6-9
        pass self.max_mismatch_rate, self.min_no_of_non_NA_pairs to TwoSNPData to filter entries stored in db.
    2009-4-13
        add min_call_info_mismatch_rate
    2009-2-5
        add "create_tables=False" to db.setup()
    2008-07-02
        fix a bug which causes the program to continue read data even while call_info_id2fname is empty and input_dir is null.
    2008-07-01
        adjust to the newest functions in QC_250k.py
    2008-04-25
        return None if QC_method_id==0
    2008-04-20
        for plone to call it just to get row_id2NA_mismatch_rate
    """
    import MySQLdb
    #conn is kept as a local for the whole call; curs is created from it.
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                           user=self.user, passwd=self.passwd)
    curs = conn.cursor()
    self.curs = curs

    #database connection and etc
    db = Stock_250kDB.Stock_250kDB(username=self.user, password=self.passwd,
                                   hostname=self.hostname,
                                   database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    # if cmp_data_filename not specified, try to find in the data_description column in table QC_method.
    qm = QCMethod.query.get(self.QC_method_id)
    if not self.cmp_data_filename and self.QC_method_id != 0:
        if qm.data_description:
            data_description_ls = qm.data_description.split('=')
            if len(data_description_ls) > 1:
                #reuse the split result instead of splitting a second time
                self.cmp_data_filename = data_description_ls[1].strip()
    #after db query, cmp_data_filename is still nothing, exit program.
    if not self.cmp_data_filename and self.QC_method_id != 0:
        sys.stderr.write(
            "cmp_data_filename is still nothing even after db query. please specify it on the commandline.\n"
        )
        sys.exit(3)

    header, strain_acc_list, category_list, data_matrix = read_data(
        self.cmp_data_filename)
    #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
    strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
    #category_list is not used.
    snpData2 = SNPData(
        header=header, strain_acc_list=strain_acc_list,
        data_matrix=data_matrix,
        snps_table=self.QC_method_id2snps_table.get(self.QC_method_id),
        ignore_het=qm.ignore_het)

    if self.input_dir:
        #04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
        #no submission to db
        call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
    else:
        call_data = self.get_call_info_id2fname(
            db, self.QC_method_id, self.call_method_id,
            filter_calls_QCed=0, max_call_info_mismatch_rate=1,
            min_call_info_mismatch_rate=min_call_info_mismatch_rate,
            debug=self.debug)
        call_info_id2fname = call_data.call_info_id2fname

    #2008-07-01 pick the call_info_ids to be handled
    new_call_info_id2fname = {}
    for call_info_id_wanted in self.call_info_id_ls:
        if call_info_id_wanted in call_info_id2fname:
            new_call_info_id2fname[call_info_id_wanted] = \
                call_info_id2fname[call_info_id_wanted]
        elif self.report:
            #wrap in a 1-tuple: a tuple-valued id would otherwise be
            #unpacked as several format arguments and raise TypeError.
            sys.stderr.write("%s not in call_info_id2fname.\n" %
                             (call_info_id_wanted,))
    call_info_id2fname = new_call_info_id2fname

    if call_info_id2fname:
        pdata = self.read_call_matrix(call_info_id2fname,
                                      self.min_probability)
        header = pdata.header
        call_info_id_ls = pdata.call_info_id_ls
        ecotype_id_ls = pdata.ecotype_id_ls
        data_matrix = pdata.data_matrix
    elif self.input_dir:  #2008-07-02
        #input file is SNP by strain format. double header (1st two lines)
        header, snps_name_ls, category_list, data_matrix = \
            FilterStrainSNPMatrix.read_data(self.input_dir, double_header=1)
        ecotype_id_ls = header[0][2:]
        call_info_id_ls = header[1][2:]
        data_matrix = numpy.array(data_matrix).transpose()
        header = ['', ''] + snps_name_ls  #fake a header for SNPData
    else:  #2008-07-02
        sys.stderr.write("No good arrays.\n")
        return None

    snps_name2snps_id = None
    #swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData. now strain_acc_list=ecotype_id_ls
    #snps_table is set to the stock_250k snps_table
    snpData1 = SNPData(
        header=header, strain_acc_list=ecotype_id_ls,
        category_list=call_info_id_ls, data_matrix=data_matrix,
        min_probability=self.min_probability,
        call_method_id=self.call_method_id, col_id2id=snps_name2snps_id,
        max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
        snps_table='stock_250k.snps')

    #2009-6-9 cross-matching results whose mismatch_rates are below max_mismatch_rate would be put into db.
    twoSNPData = TwoSNPData(
        SNPData1=snpData1, SNPData2=snpData2, curs=curs,
        QC_method_id=self.QC_method_id, user=self.user,
        row_matching_by_which_value=0, debug=self.debug,
        max_mismatch_rate=self.max_mismatch_rate,
        min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs)

    #nothing in this function fills this in; it stays None.
    row_id2NA_mismatch_rate = None

    #2008-05-01 create a cross match table temporarily
    twoSNPData.qc_cross_match_table = 'qc_cross_match'
    twoSNPData.new_QC_cross_match_table = self.new_QC_cross_match_table
    twoSNPData.cal_row_id2pairwise_dist()  #database submission is done along.
    return row_id2NA_mismatch_rate