def run(self):
    """
    Aggregate read/base counts per individual_sequence (isq) and per
    individual_sequence_file (isqf) from the input files and update the db.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()

    isq_id2data = {}
    no_of_total_lines = 0
    no_of_isqf_lines = 0
    no_of_isqf_in_db = 0
    for inputFname in self.inputFnameLs:
        reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
        header = reader.next()
        colName2Index = utils.getColName2IndexFromHeader(header, skipEmptyColumn=True)
        isq_id_index = colName2Index.get('isq_id')
        isqf_id_index = colName2Index.get('isqf_id')
        read_count_index = colName2Index.get("read_count")
        base_count_index = colName2Index.get("base_count")
        for row in reader:
            isq_id = int(row[isq_id_index])
            isqf_id = row[isqf_id_index]
            read_count = int(row[read_count_index])
            base_count = int(row[base_count_index])
            if isq_id not in isq_id2data:
                isq_id2data[isq_id] = PassingData(read_count=0, base_count=0)
            isq_id2data[isq_id].read_count += read_count
            isq_id2data[isq_id].base_count += base_count
            if isqf_id and isqf_id != '0':
                isqf_id = int(isqf_id)
                no_of_isqf_lines += 1
                no_of_isqf_in_db += self.updateIndividualSequenceFileReadBaseCount(self.db_vervet, isqf_id=isqf_id, \
                    read_count=read_count, base_count=base_count)
            no_of_total_lines += 1
        del reader
    logMsg1 = "%s isqf out of %s were put into db. %s lines in total.\n" % (no_of_isqf_in_db, no_of_isqf_lines, no_of_total_lines)
    sys.stderr.write(logMsg1)

    counter = 0
    real_counter = 0
    for isq_id, data in isq_id2data.iteritems():
        real_counter += self.updateIndividualSequenceReadBaseCount(self.db_vervet, isq_id=isq_id, \
            read_count=data.read_count, base_count=data.base_count, genomeSize=self.genomeSize)
        counter += 1
    logMsg2 = "%s isq out of %s were put into db.\n" % (real_counter, counter)
    sys.stderr.write(logMsg2)

    if self.logFilename:
        logF = open(self.logFilename, 'w')
        logF.write(logMsg1)
        logF.write(logMsg2)
        logF.close()

    if self.commit:
        self.db_vervet.session.flush()
        self.db_vervet.session.commit()
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    curs = conn.cursor()
    if self.ecotype_duplicate2tg_ecotypeid_table:
        ecotype_duplicate2tg_ecotypeid = self.get_ecotype_duplicate2tg_ecotypeid(curs, self.ecotype_duplicate2tg_ecotypeid_table)
    else:
        ecotype_duplicate2tg_ecotypeid = None
    from pymodule import figureOutDelimiter
    delimiter = figureOutDelimiter(self.input_fname)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    tg_ecotypeid2ecotypeid_duplicate_index_ls = self.get_tg_ecotypeid2ecotypeid_duplicate_index_ls(strain_acc_list, \
        category_list, ecotype_duplicate2tg_ecotypeid)
    ecotypeid2nativename = get_ecotypeid2nativename(curs, ecotype_table=self.ecotype_table)
    tg_ecotypeid_ls, merge_matrix = self.get_merged_matrix(tg_ecotypeid2ecotypeid_duplicate_index_ls, data_matrix, \
        ecotypeid2nativename, self.stat_output_fname)

    tg_nativename_ls = []
    for ecotypeid in tg_ecotypeid_ls:
        tg_nativename_ls.append(ecotypeid2nativename[ecotypeid])
    header[1] = 'nativename'
    write_data_matrix(merge_matrix, self.output_fname, header, tg_ecotypeid_ls, tg_nativename_ls, delimiter=delimiter)
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()

    inconsistent_rate_ls = []
    for inputFname in self.inputFnameLs:
        if os.path.isfile(inputFname):
            try:
                reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
                header = reader.next()
                col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
                inconsistent_rate_index = col_name2index.get("inconsistency")
                for row in reader:
                    inconsistency = float(row[inconsistent_rate_index])
                    inconsistent_rate_ls.append(inconsistency)
                del reader
            except:
                sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
                import traceback
                traceback.print_exc()

    if self.title is None:
        title = "histogram of inconsistent rate from %s refs" % (len(inconsistent_rate_ls))
    else:
        title = self.title
    if len(inconsistent_rate_ls) > 10:
        medianInconsistentRate = numpy.median(inconsistent_rate_ls)
        title += " median %.4f" % (medianInconsistentRate)
    yh_matplotlib.drawHist(inconsistent_rate_ls, title=title, \
        xlabel_1D="Inconsistent Rate", xticks=None, outputFname=self.outputFname, min_no_of_data_points=20, needLog=False, \
        dpi=200)
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()

    writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
    writer.writerow(['#sampleID', 'chromosome', 'meanDepth', 'medianDepth'])
    for inputFname in self.inputFnameLs:
        inputFile = utils.openGzipFile(inputFname)
        delimiter = figureOutDelimiter(inputFile)
        reader = csv.reader(inputFile, delimiter=delimiter)
        header = reader.next()
        col_name2index = getColName2IndexFromHeader(header)
        intervalIDIndex = col_name2index.get("Target")
        #only the first read group among the output (so don't run the DepthOfCoverageWalker over multi-read-group bam files)
        avgCoverageIndex = 4
        sampleID = header[avgCoverageIndex][:-9]	#this column header is like $sampleID_mean_cvg. so get rid of _mean_cvg
        medianCoverageIndex = 6
        for row in reader:
            intervalID = row[intervalIDIndex]
            writer.writerow([sampleID, intervalID, row[avgCoverageIndex], row[medianCoverageIndex]])
    del writer
    sys.stderr.write("Done.\n")
def readDataMatrix(self, inputFname, minExprSumPerGene=180):
    """
    2012.5.8
    """
    sys.stderr.write("Reading the gene expression matrix from %s ..." % (inputFname))
    suffix = os.path.splitext(inputFname)[1]
    if suffix == '.gz':
        import gzip
        inf = gzip.open(inputFname, 'r')
    else:
        inf = open(inputFname, 'r')
    reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
    header = reader.next()	#first line is taken as header
    colName2Index = getColName2IndexFromHeader(header)
    data_matrix = []
    row_id_ls = []
    counter = 0
    real_counter = 0
    for row in reader:
        data_row = row[1:]
        data_row = map(float, data_row)
        exprSumPerGene = sum(data_row)
        counter += 1
        if exprSumPerGene >= minExprSumPerGene:
            real_counter += 1
            row_id_ls.append(row[0])
            data_matrix.append(data_row)
    data_matrix = numpy.array(data_matrix)
    sys.stderr.write("%s rows out of %s selected. %s rows, %s columns.\n" % (real_counter, counter, \
        len(row_id_ls), len(header) - 1))
    return PassingData(row_id_ls=row_id_ls, header=header, data_matrix=data_matrix)
def putQCIntoDB(self, session, input_fname, no_of_lines_to_skip, data_source_obj, cnv_type_obj, cnv_method_obj=None, \
        run_type=1, original_id=None):
    """
    2009-10-28
    """
    sys.stderr.write("Putting QC data into database ... \n")
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    for i in range(no_of_lines_to_skip):
        reader.next()
    counter = 0
    for row in reader:
        if run_type == 1:
            cnv_qc_call = self.generateCNVQCCallObjType1(session, row, data_source_obj, cnv_type_obj, cnv_method_obj)
        elif run_type == 2:
            cnv_qc_call = self.generateCNVQCCallObjType2(session, row, data_source_obj, cnv_type_obj, cnv_method_obj)
        elif run_type == 3:
            cnv_qc_call = self.generateCNVQCCallObjFromBobSchmitzData(session, row, data_source_obj, cnv_type_obj, \
                cnv_method_obj, original_id=original_id)
        else:
            sys.stderr.write("Run type %s not supported.\n" % run_type)
            continue	#skip the row: cnv_qc_call would be undefined (or stale) here
        session.save(cnv_qc_call)
        counter += 1
        if counter % 5000 == 0:
            sys.stderr.write("%s%s" % ('\x08' * 40, counter))
            session.flush()
    sys.stderr.write("%s records. Done.\n" % counter)
def getMonkeyIDPair2Correlation(self, smartpcaCorrelationFname=None):
    """
    2012.3.1
        smartpcaCorrelationFname is output from PCAOnVCFWorkflow.py (with modified smartpca). tab-delimited.
            553_2_VRC_ref_GA_vs_524	555_15_1987079_GA_vs_524	Case	Case	0.025
            553_2_VRC_ref_GA_vs_524	556_16_1985088_GA_vs_524	Case	Case	-0.020
            553_2_VRC_ref_GA_vs_524	557_17_1986014_GA_vs_524	Case	Case	-0.106
            553_2_VRC_ref_GA_vs_524	558_18_1988009_GA_vs_524	Case	Case	-0.059
    """
    sys.stderr.write("Reading correlation from %s ... " % (smartpcaCorrelationFname))
    monkey_id_pair2genotype_correlation = {}
    import csv
    reader = csv.reader(open(smartpcaCorrelationFname), delimiter=figureOutDelimiter(smartpcaCorrelationFname))
    monkey_id_extract = lambda x: x.split('_')[2]
    for row in reader:
        monkey1 = row[0]
        monkey2 = row[1]
        cor = float(row[4])
        pair_in_ls = [monkey_id_extract(monkey1), monkey_id_extract(monkey2)]
        pair_in_ls.sort()
        pair_key = tuple(pair_in_ls)
        monkey_id_pair2genotype_correlation[pair_key] = cor
    sys.stderr.write("%s pairs.\n" % (len(monkey_id_pair2genotype_correlation)))
    return monkey_id_pair2genotype_correlation
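# Illustrative note (not part of the original module): monkey_id_extract() takes the 3rd
# '_'-separated token of a sample label, so for the docstring sample rows above:
#	monkey_id_extract('555_15_1987079_GA_vs_524') -> '1987079'
# and because pair keys are sorted tuples, lookups are order-independent:
#	the key ('1985088', '1987079') covers both (monkey1, monkey2) orderings.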
def readInput(self, inputFnameLs):
    sys.stderr.write("Reading distance data from %s files ..." % (len(inputFnameLs)))
    sampleId2index = {}
    samplePair2data = {}	#value is [no_of_mismatches, no_of_total_non_NA]
    for inputFname in inputFnameLs:	#read every file passed in (so the count above matches what's read)
        reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
        matrixStart = False
        for row in reader:
            if row[0] == '':
                matrixStart = True
                break
            sample1Id = row[0]
            if sample1Id not in sampleId2index:
                sampleId2index[sample1Id] = len(sampleId2index)
            sample2Id = row[1]
            if sample2Id not in sampleId2index:
                sampleId2index[sample2Id] = len(sampleId2index)
            no_of_mismatches = float(row[-2])
            no_of_total_non_NA = float(row[-1])
            samplePair = (sample1Id, sample2Id)
            if samplePair not in samplePair2data:
                samplePair2data[samplePair] = [0, 0]
            samplePair2data[samplePair][0] += no_of_mismatches
            samplePair2data[samplePair][1] += no_of_total_non_NA
        del reader
    sys.stderr.write("Done.\n")
    return sampleId2index, samplePair2data
def trioInconsistentRateFileWalker(self, inputFname, processFunc=None, minNoOfTotal=100, run_type=1):
    """
    2011-11-2
        remove the maxDepth filter. apply afterwards through filterDataByDepth().
    2011-9-30
    """
    reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
    header = reader.next()
    col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
    isInconsistent_index = col_name2index.get("isInconsistent")
    index_of_fa_depth = col_name2index.get("depthOfFather")
    index_of_mo_depth = col_name2index.get('depthOfMother')
    index_of_child_depth = col_name2index.get('depthOfChild')
    for row in reader:
        fa_depth = int(float(row[index_of_fa_depth]))
        mo_depth = int(float(row[index_of_mo_depth]))
        child_depth = int(float(row[index_of_child_depth]))
        isInconsistent = float(row[isInconsistent_index])
        #if fa_depth<=self.maxDepth and mo_depth<=self.maxDepth and child_depth<=self.maxDepth:
        self.fa_depth_ls.append(fa_depth)
        self.mo_depth_ls.append(mo_depth)
        self.child_depth_ls.append(child_depth)
        self.inconsistent_ls.append(isInconsistent)
    del reader
def run(self):
    """
    2012.4.3
        each input has this as its header:
        ['alignmentID', 'total_no_of_reads', 'perc_reads_mapped', 'perc_duplicates', 'perc_paired', \
         'perc_properly_paired', 'perc_both_mates_mapped', 'perc_singletons', 'perc_mapped_to_diff_chrs']
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()

    no_of_total_lines = 0
    for inputFname in self.inputFnameLs:
        reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
        header = reader.next()
        colName2Index = utils.getColName2IndexFromHeader(header, skipEmptyColumn=True)
        alignment_id_index = colName2Index.get('alignmentID')
        total_no_of_reads_index = colName2Index.get('total_no_of_reads')
        perc_reads_mapped_index = colName2Index.get("perc_reads_mapped")
        perc_duplicates_index = colName2Index.get("perc_duplicates")
        perc_paired_index = colName2Index.get("perc_paired")
        perc_properly_paired_index = colName2Index.get("perc_properly_paired")
        perc_both_mates_mapped_index = colName2Index.get("perc_both_mates_mapped")
        perc_singletons_index = colName2Index.get("perc_singletons")
        perc_mapped_to_diff_chrs_index = colName2Index.get("perc_mapped_to_diff_chrs")
        perc_mapq5_mapped_to_diff_chrs_index = colName2Index.get("perc_mapq5_mapped_to_diff_chrs")
        for row in reader:
            alignmentID = int(row[alignment_id_index])
            alignment = VervetDB.IndividualAlignment.get(alignmentID)
            alignment.perc_reads_mapped = float(row[perc_reads_mapped_index])
            alignment.perc_duplicates = float(row[perc_duplicates_index])
            alignment.perc_paired = float(row[perc_paired_index])
            alignment.perc_properly_paired = float(row[perc_properly_paired_index])
            alignment.perc_both_mates_mapped = float(row[perc_both_mates_mapped_index])
            alignment.perc_singletons = float(row[perc_singletons_index])
            alignment.perc_mapped_to_diff_chrs = float(row[perc_mapped_to_diff_chrs_index])
            alignment.perc_mapq5_mapped_to_diff_chrs = float(row[perc_mapq5_mapped_to_diff_chrs_index])
            alignment.total_no_of_reads = int(float(row[total_no_of_reads_index]))
            session.add(alignment)
            no_of_total_lines += 1
        del reader
    sys.stderr.write("%s alignments in total.\n" % (no_of_total_lines))

    if self.logFilename:
        logF = open(self.logFilename, 'w')
        logF.write("%s alignments in total.\n" % (no_of_total_lines))
        logF.close()

    if self.commit:
        self.db_vervet.session.flush()
        self.db_vervet.session.commit()
def run(self):
    """
    2007-02-27
    2007-09-14
        filtering_bits
        -read_data()
        -remove_rows_with_too_many_NAs()
        -remove_cols_with_too_many_NAs()
        -remove_identity_strains()
        -write_data_matrix()
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, int(self.nt_alphabet_bits[0]), delimiter=delimiter)
    data_matrix = num.array(data_matrix)

    if self.filtering_bits[0] == '1':
        remove_rows_data = self.remove_rows_with_too_many_NAs(data_matrix, self.row_cutoff)
        rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
        strain_index2no_of_NAs = remove_rows_data.row_index2no_of_NAs
    else:
        rows_with_too_many_NAs_set = Set()
    if self.filtering_bits[1] == '1':
        remove_cols_data = self.remove_cols_with_too_many_NAs(data_matrix, self.col_cutoff, rows_with_too_many_NAs_set)	#col_cutoff is an instance attribute; the bare name is undefined here
        cols_with_too_many_NAs_set = remove_cols_data.cols_with_too_many_NAs_set
    else:
        cols_with_too_many_NAs_set = Set()
    if self.filtering_bits[2] == '1':
        no_of_rows, no_of_cols = data_matrix.shape
        total_rows_set = Set(range(no_of_rows))
        rows_to_be_checked = total_rows_set - rows_with_too_many_NAs_set
        total_cols_set = Set(range(no_of_cols))
        cols_to_be_checked = total_cols_set - cols_with_too_many_NAs_set
        identity_strains_to_be_removed = self.remove_identity_strains(data_matrix, rows_to_be_checked, cols_to_be_checked)
    else:
        identity_strains_to_be_removed = Set()
    rows_to_be_tossed_out = rows_with_too_many_NAs_set | identity_strains_to_be_removed
    #self.write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, rows_to_be_tossed_out, cols_with_too_many_NAs_set, int(self.nt_alphabet_bits[1]))
    write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, \
        rows_to_be_tossed_out, cols_with_too_many_NAs_set, nt_alphabet=int(self.nt_alphabet_bits[1]), \
        delimiter=delimiter)
def read_input_fname(self, input_fname):
    sys.stderr.write("Getting gene pairs from %s ..." % input_fname)
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    gene_id_pair_ls = []
    for row in reader:
        gene1_id = int(row[0])
        gene2_id = int(row[1])
        gene_id_pair_ls.append((gene1_id, gene2_id))
    sys.stderr.write("Done.\n")
    return gene_id_pair_ls
def putHaplotypeGroupIntoDB(self, session, input_fname, tg_ecotypeid2row, max_snp_typing_error_rate, snp_id_ls):
    """
    2009-4-4
        add argument tg_ecotypeid2row
    2009-3-31
    """
    sys.stderr.write("Constructing haplotype groups ...\n")
    pattern_ecotypeid = re.compile(r'(?<=\))\d+')
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    col_name2col_index = getColName2IndexFromHeader(reader.next())
    ecotypeid_idx = col_name2col_index['ecotypeid']
    haplo_name_idx = col_name2col_index['haplogroup']
    geographic_integrity_idx = col_name2col_index['geographic_integrity']
    filtered_SNPs_idx = col_name2col_index['filtered_SNPs']
    counter = 0
    for tg_ecotypeid, row in tg_ecotypeid2row.iteritems():
        ecotypeid = int(row[ecotypeid_idx])
        ecotypeid = tg_ecotypeid	#2009-4-4 use tg_ecotypeid instead
        haplo_name = row[haplo_name_idx]
        geographic_integrity_name = row[geographic_integrity_idx]
        filtered_SNPs = row[filtered_SNPs_idx]
        ref_ecotypeid = int(pattern_ecotypeid.search(haplo_name).group(0))
        haplo_group = StockDB.HaploGroup.query.filter_by(short_name=haplo_name).first()
        if not haplo_group:
            haplo_group = StockDB.HaploGroup(short_name=haplo_name, ref_ecotypeid=ref_ecotypeid, \
                max_snp_typing_error_rate=max_snp_typing_error_rate)
            session.save(haplo_group)
            session.flush()
        ecotype = StockDB.Ecotype.get(ecotypeid)
        haplo_group.ecotypes.append(ecotype)
        geographic_integrity = StockDB.GeographicIntegrity.query.filter_by(short_name=geographic_integrity_name).first()
        if not geographic_integrity:
            geographic_integrity = StockDB.GeographicIntegrity(short_name=geographic_integrity_name)
            session.save(geographic_integrity)
            session.flush()
        ecotype.geographic_integrity = geographic_integrity
        session.save_or_update(ecotype)	#one bit of ecotype: link the ecotypeid to tg_ecotype_id
        #deal with filtered SNPs
        for i in range(len(filtered_SNPs)):
            allele = filtered_SNPs[i]
            if allele == '_':
                continue
            fc = StockDB.FilteredCalls(ecotypeid=ecotypeid, snpid=snp_id_ls[i], allele=allele)
            session.save(fc)
            session.flush()
        counter += 1
        if counter % 500 == 0 and self.report:
            sys.stderr.write('%s%s' % ('\x08' * 80, counter))
    session.flush()
    sys.stderr.write("Done.\n")
def run(self):
    """
    2008-5-12
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    #database connection and etc
    db = self.db_250k
    session = db.session
    session.begin()

    delimiter = figureOutDelimiter(self.inputFname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.inputFname, delimiter=delimiter)

    if self.snp_id_type == 1:
        #2011-2-27 translate the db_id into chr_pos because the new StrainXSNP dataset uses db_id to identify SNPs.
        # but if col-id is already chr_pos, it's fine.
        new_header = header[:2]
        data_matrix_col_index_to_be_kept = []
        for i in xrange(2, len(header)):
            snp_id = header[i]
            chr_pos = db.get_chr_pos_given_db_id2chr_pos(snp_id)
            if chr_pos is not None:
                data_matrix_col_index_to_be_kept.append(i - 2)
                new_header.append(chr_pos)
        #to remove no-db_id columns from data matrix
        data_matrix = numpy.array(data_matrix)
        data_matrix = data_matrix[:, data_matrix_col_index_to_be_kept]
        header = new_header

    if self.array_id_2nd_column:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, \
            data_matrix=data_matrix)
    else:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
            data_matrix=data_matrix)	#ignore category_list

    rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1, report=self.report)
    chromosomes = [rawSnpsData.chromosome for rawSnpsData in rawSnpsData_ls]
    snpsdata.writeRawSnpsDatasToFile(self.outputFname, rawSnpsData_ls, chromosomes=chromosomes, deliminator=',', \
        withArrayIds=self.array_id_2nd_column)
def read_input_fname(self, input_fname):
    sys.stderr.write("Getting gene pairs from %s ..." % input_fname)
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    gene_id_pair_ls = []
    gene_id_set = Set()
    for row in reader:
        gene1_id = int(row[0])
        gene2_id = int(row[1])
        gene_id_pair_ls.append((gene1_id, gene2_id))
        gene_id_set.add(gene1_id)
        gene_id_set.add(gene2_id)
    sys.stderr.write("Done.\n")
    return gene_id_pair_ls, gene_id_set
def generate_params(self, gene_id_fname, pdata, block_size=1000, **keywords):
    """
    2009-2-12
        use yield to become a generator
    2008-11-25
        read gene ids from gene_id_fname and generate pairs among them.
        each node handles a number of pairs, depending on how many SNP pairs it incurs.
    """
    #sys.stderr.write("Generating parameters ...")
    #params_ls = []
    no_of_phenotypes = len(pdata.phenotype_index_ls)
    start_index = 0	#for each computing node: the index of gene >= start_index
    #no_of_genes = len(pdata.gene_id2snps_id_ls)
    no_of_tests_per_node = 0
    reader = csv.reader(open(gene_id_fname), delimiter=figureOutDelimiter(gene_id_fname))
    gene_id_ls = []
    for row in reader:
        gene_id = int(row[0])
        gene_id_ls.append(gene_id)
    del reader

    no_of_genes = len(gene_id_ls)
    gene_id_pairs_for_each_node = []
    for i in range(no_of_genes):
        gene1_id = gene_id_ls[i]
        n1 = len(pdata.gene_id2snps_id_ls[gene1_id])	#no_of_snps_of_this_gene
        #for gene2_id in pdata.gene_id2snps_id_ls:	#2009-2-8 another setting: gene_id_fname vs all genes
        for j in range(i + 1, no_of_genes):
            gene2_id = gene_id_ls[j]
            n2 = len(pdata.gene_id2snps_id_ls[gene2_id])
            est_no_of_tests = (n1 * n2) * no_of_phenotypes	#upper bound for the number of tests for each gene on a computing node. data missing would make the number smaller.
            no_of_tests_per_node += est_no_of_tests
            gene_id_pairs_for_each_node.append((gene1_id, gene2_id))
            if no_of_tests_per_node >= block_size:
                yield gene_id_pairs_for_each_node
                #reset gene_id_pairs_for_each_node
                gene_id_pairs_for_each_node = []
                no_of_tests_per_node = 0	#reset this to 0
    #pick up the last gene_id_pairs_for_each_node if it's not empty
    if gene_id_pairs_for_each_node:
        yield gene_id_pairs_for_each_node
def get_snp_pair2value_type(self, boolean_pair_fname):
    """
    2008-11-25
    """
    sys.stderr.write("Getting snp_pair2value_type ...")
    snp_pair2value_type = {}
    reader = csv.reader(open(boolean_pair_fname), delimiter=figureOutDelimiter(boolean_pair_fname))
    reader.next()
    min_value = None
    max_value = None
    for row in reader:
        snp1_id, gene1_id, snp2_id, gene2_id, bool_type, pvalue, count1, count2 = row
        if not snp2_id:
            snp2_id = snp1_id
            continue	#2008-11-26 skip a row if its pvalue is from a single SNP.
        if not gene2_id:
            gene2_id = gene1_id
        snp1_id = snp1_id.split('_')
        snp1_id = map(int, snp1_id)
        snp2_id = snp2_id.split('_')
        snp2_id = map(int, snp2_id)
        snp_pair = [tuple(snp1_id), tuple(snp2_id)]
        snp_pair.sort()
        snp_pair = tuple(snp_pair)
        pvalue = -math.log10(float(pvalue))
        value = pvalue
        if min_value is None:
            min_value = value
        elif value < min_value:
            min_value = value
        if max_value is None:
            max_value = value
        elif value > max_value:
            max_value = value
        bool_type = int(bool_type)
        if snp_pair not in snp_pair2value_type:
            snp_pair2value_type[snp_pair] = (pvalue, bool_type)
        else:
            if pvalue > snp_pair2value_type[snp_pair][0]:	#only take the maximum
                snp_pair2value_type[snp_pair] = (pvalue, bool_type)
    del reader
    sys.stderr.write("Done.\n")
    return snp_pair2value_type, min_value, max_value
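# Illustrative note (not part of the original module): the value stored per SNP pair is the
# -log10-transformed p-value, e.g. a p-value of 0.001 becomes -math.log10(0.001) = 3.0.
# Larger stored values therefore mean more significant pairs, and the "only take the maximum"
# branch keeps the most significant (pvalue, bool_type) record seen for each pair.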
def getNoOfLociFromSNPData(self, inputFname):
    """
    2012.3.2
    """
    sys.stderr.write("Getting no of loci from %s ..." % (os.path.basename(inputFname)))
    reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
    header = reader.next()
    first_data_row = reader.next()
    no_of_cols = len(first_data_row) - 2
    del reader
    sys.stderr.write("%s columns.\n" % (no_of_cols))
    return no_of_cols
def generate_params(self, gene_id_fname, pdata, block_size=1000, **keywords):
    """
    2009-2-12
        use yield to make this function a generator
    2009-2-9
        add argument gene_id_fname to restrict analysis to genes from it
    2008-09-09
        estimate the number of tests each gene would encompass, and decide how many genes
        should be included in a set to send out
    2008-09-06
        each node handles a certain number of genes, identified by the index of the 1st gene
        and the index of the last gene.
    """
    no_of_phenotypes = len(pdata.phenotype_index_ls)
    start_index = 0	#for each computing node: the index of gene >= start_index
    no_of_tests_per_node = 0
    #2009-2-9
    if gene_id_fname and os.path.isfile(gene_id_fname):
        reader = csv.reader(open(gene_id_fname), delimiter=figureOutDelimiter(gene_id_fname))
        gene_id_ls = []
        for row in reader:
            gene_id = int(row[0])
            gene_id_ls.append(gene_id)
        del reader
        pdata.gene_id_ls = gene_id_ls	#replace pdata's gene_id_ls
    no_of_genes = len(pdata.gene_id_ls)
    for i in range(no_of_genes):
        gene_id = pdata.gene_id_ls[i]
        n = len(pdata.gene_id2snps_id_ls[gene_id])	#no_of_snps_of_this_gene
        est_no_of_tests = (n * (n - 1) * 5 / 2.0 + n) * no_of_phenotypes	#upper bound for the number of tests for each gene on a computing node. data missing would make the number smaller.
        no_of_tests_per_node += est_no_of_tests
        if no_of_tests_per_node >= block_size:
            yield (start_index, i + 1)	#the computing node handles genes from pdata.gene_id_ls[start_index] to pdata.gene_id_ls[i]
            #reset the starting pointer to the index of the next gene
            start_index = i + 1
            no_of_tests_per_node = 0	#reset this to 0
        elif i == no_of_genes - 1:	#this is the last gene, have to include it
            yield (start_index, i + 1)
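# Illustrative note (not part of the original module): for a gene with n SNPs, the estimate
# (n*(n-1)*5/2.0 + n)*no_of_phenotypes counts all within-gene SNP pairs (n*(n-1)/2, times 5,
# presumably one per interaction model) plus n single-SNP tests, per phenotype. For example,
# n=10 SNPs and 2 phenotypes gives (10*9*5/2.0 + 10)*2 = 470 estimated tests, so with
# block_size=1000 roughly two such genes would be batched onto one computing node.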
def getSampleID2FamilyCount(self, inputFname):
    """
    2012.3.29
    """
    sys.stderr.write("Getting sampleID2FamilyCount from %s ..." % (inputFname))
    reader = csv.reader(open(inputFname, 'r'), delimiter=figureOutDelimiter(inputFname))
    header = reader.next()
    colName2Index = getColName2IndexFromHeader(header)
    sampleID2FamilyCount = {}
    for row in reader:
        individualID = row[colName2Index.get("individualID")]
        familyCount = int(row[colName2Index.get("familyCount")])
        sampleID2FamilyCount[individualID] = familyCount
    sys.stderr.write("%s individuals.\n" % (len(sampleID2FamilyCount)))
    return sampleID2FamilyCount
def get_isqID2coverage(self, seqCoverageFname, defaultCoverage=None):
    """
    2011-9-2
    """
    sys.stderr.write("Fetching sequence coverage info from %s ..." % (seqCoverageFname))
    reader = csv.reader(open(seqCoverageFname, 'r'), delimiter=figureOutDelimiter(seqCoverageFname))
    isqID2coverage = {}
    header = reader.next()
    for row in reader:
        isqID = int(row[0])
        coverage = float(row[1])
        isqID2coverage[isqID] = coverage
    sys.stderr.write("%s entries.\n" % len(isqID2coverage))
    return isqID2coverage
def dropRedundantEcotypes(self, input_fname, ecotypeid2tg_ecotypeid):
    """
    2009-4-10
        not used. decided to keep all of them.
    2009-4-4
        retain only one row out of duplicated ecotype rows, based on ecotypeid2tg_ecotypeid.
        it's not random: usually the one whose ecotype id equals tg_ecotypeid, unless tg_ecotypeid doesn't appear.
        if duplicated ecotypes belong to different haplotype groups, choose the one with tg_ecotypeid, otherwise random.
    """
    sys.stderr.write("Dropping redundant ecotypes ...\n")
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    col_name2col_index = getColName2IndexFromHeader(reader.next())
    ecotypeid_idx = col_name2col_index['ecotypeid']
    haplo_name_idx = col_name2col_index['haplogroup']
    nativename_idx = col_name2col_index['nativename']
    tg_ecotypeid2row = {}
    no_of_duplicates = 0
    no_of_duplicates_with_different_haplogroups = 0
    counter = 0
    for row in reader:
        ecotypeid = int(row[ecotypeid_idx])
        haplo_name = row[haplo_name_idx]
        nativename = row[nativename_idx]
        if ecotypeid in ecotypeid2tg_ecotypeid:
            tg_ecotypeid = ecotypeid2tg_ecotypeid[ecotypeid]
            if tg_ecotypeid not in tg_ecotypeid2row:
                tg_ecotypeid2row[tg_ecotypeid] = row
            else:
                no_of_duplicates += 1
                old_row = tg_ecotypeid2row[tg_ecotypeid]
                old_ecotypeid = int(old_row[ecotypeid_idx])
                old_haplo_name = old_row[haplo_name_idx]
                old_nativename = old_row[nativename_idx]	#read from the previously-kept row, not the current one
                if old_haplo_name != haplo_name:
                    sys.stderr.write("ecotype %s(%s) in haplotype group %s, while duplicate %s(%s) in haplotype group %s.\n" % \
                        (ecotypeid, nativename, haplo_name, old_ecotypeid, old_nativename, old_haplo_name))
                    no_of_duplicates_with_different_haplogroups += 1
                if ecotypeid == tg_ecotypeid:	#replace if the new ecotypeid matches the tg_ecotypeid, whether the haplotype group is the same or not.
                    tg_ecotypeid2row[tg_ecotypeid] = row
        else:
            sys.stderr.write("Warning: ecotype %s not in ecotypeid2tg_ecotypeid.\n" % (ecotypeid))
        counter += 1
    sys.stderr.write("no_of_duplicates: %s, out of which %s encompass different haplotype groups. %s accessions in total. Done.\n" % \
        (no_of_duplicates, no_of_duplicates_with_different_haplogroups, counter))
    return tg_ecotypeid2row
def generate_params(self, gene_id_fname, pdata, block_size=1000, **keywords):
    """
    2009-2-18
        if gene_id_fname is given and is a file:
            yield (gene1_id, snp_start_index, snp_stop_index)
        else:
            yield (phenotype_index, snp_start_index1, snp_stop_index1, snp_start_index2, snp_stop_index2)
    2009-2-12
        use yield to become a generator
        called by inputNodePrepare()
    2009-1-22
    """
    no_of_phenotypes = len(pdata.phenotype_index_ls)
    start_index = 0	#for each computing node: the index of gene >= start_index
    #no_of_genes = len(pdata.gene_id2snps_id_ls)
    no_of_tests_per_node = 0
    if gene_id_fname and os.path.isfile(gene_id_fname):
        reader = csv.reader(open(gene_id_fname), delimiter=figureOutDelimiter(gene_id_fname))
        gene_id_ls = []
        for row in reader:
            gene_id = int(row[0])
            gene_id_ls.append(gene_id)
        del reader

        no_of_genes = len(gene_id_ls)
        no_of_total_snps = len(pdata.snp_info.chr_pos_ls)
        for i in range(no_of_genes):
            gene1_id = gene_id_ls[i]
            n1 = len(pdata.gene_id2snps_id_ls[gene1_id])	#no_of_snps_of_this_gene
            snp_start_index = 0
            while snp_start_index < no_of_total_snps:
                no_of_snps_to_consider = max(1, block_size / (n1 * no_of_phenotypes))	#at least 1 so the loop always advances, even when n1*no_of_phenotypes exceeds block_size
                snp_stop_index = snp_start_index + no_of_snps_to_consider
                if snp_stop_index > no_of_total_snps:
                    snp_stop_index = no_of_total_snps
                yield (gene1_id, snp_start_index, snp_stop_index)
                snp_start_index += no_of_snps_to_consider
    else:	#no gene_id_fname. pairwise among all SNPs.
        no_of_snps_to_consider = int(math.sqrt(block_size))
        no_of_total_snps = len(pdata.snp_info.chr_pos_ls)
        for phenotype_index in pdata.phenotype_index_ls:
            for snp_start_index1 in range(0, no_of_total_snps, no_of_snps_to_consider):
                snp_stop_index1 = min(no_of_total_snps, snp_start_index1 + no_of_snps_to_consider)
                for snp_start_index2 in range(snp_start_index1, no_of_total_snps, no_of_snps_to_consider):
                    snp_stop_index2 = min(no_of_total_snps, snp_start_index2 + no_of_snps_to_consider)
                    yield (phenotype_index, snp_start_index1, snp_stop_index1, snp_start_index2, snp_stop_index2)
def run(self):
    """
    2008-09-10
        in case the whole figure is chopped into blocks, swap col_block_index and row_block_index
        to make row first, column 2nd
    """
    from pymodule.yhio.SNP import read_data
    from pymodule.utils import figureOutDelimiter, PassingData
    delimiter = figureOutDelimiter(self.input_fname)
    print delimiter
    header, row_label_ls1, row_label_ls2, data_matrix = read_data(self.input_fname, matrix_data_type=float, delimiter='\t')

    import numpy
    data_matrix = numpy.array(data_matrix)
    min_value = numpy.min(data_matrix)
    if self.min_value_non_negative and min_value < 0:
        min_value = 0
    max_value = numpy.max(data_matrix)
    font = get_font(self.font_path, font_size=self.font_size)
    Value2Color.special_value2color[-2] = self.super_value_color
    value2color_func = lambda x: Value2Color.value2HSLcolor(x, min_value, max_value)
    im_legend = drawContinousLegend(min_value, max_value, self.no_of_ticks, value2color_func, font)

    fig_fname_prefix = os.path.splitext(self.fig_fname)[0]
    if self.split_legend_and_matrix:
        im_legend.save('%s_legend.png' % fig_fname_prefix)

    no_of_rows, no_of_cols = data_matrix.shape
    passParam = PassingData(value2color_func=value2color_func, im_legend=im_legend, font=font, \
        split_legend_and_matrix=self.split_legend_and_matrix, no_grid=self.no_grid)

    if no_of_cols <= self.blockColUnit:
        self._drawMatrix(data_matrix, row_label_ls1, header[2:], self.fig_fname, passParam)
    else:	#split into blocks
        no_of_col_blocks = no_of_cols / self.blockColUnit + 1
        no_of_row_blocks = no_of_rows / self.blockRowUnit + 1
        for i in range(no_of_col_blocks):
            col_start_index = i * self.blockColUnit
            col_end_index = (i + 1) * self.blockColUnit
            if col_start_index < no_of_cols:
                for j in range(no_of_row_blocks):
                    row_start_index = j * self.blockRowUnit
                    row_end_index = (j + 1) * self.blockRowUnit
                    if row_start_index < no_of_rows:
                        fig_fname = '%s_%s_%s.png' % (fig_fname_prefix, j, i)	#row first, column 2nd
                        self._drawMatrix(data_matrix[row_start_index:row_end_index, col_start_index:col_end_index], \
                            row_label_ls1[row_start_index:row_end_index], \
                            header[2 + col_start_index:2 + col_end_index], fig_fname, passParam)
def generate_params(self, gene_id_fname, pdata, block_size=1000, **keywords):
    """
    2009-2-12
        use yield to become a generator
    2008-11-25
        read gene ids from gene_id_fname and generate pairs among them.
        each node handles a number of pairs, depending on how many SNP pairs it incurs.
    """
    #sys.stderr.write("Generating parameters ...")
    #params_ls = []
    no_of_phenotypes = len(pdata.phenotype_index_ls)
    start_index = 0	#for each computing node: the index of gene >= start_index
    #no_of_genes = len(pdata.gene_id2snps_id_ls)
    no_of_tests_per_node = 0
    reader = csv.reader(open(gene_id_fname), delimiter=figureOutDelimiter(gene_id_fname))
    gene_id_ls = []
    for row in reader:
        gene_id = int(row[0])
        gene_id_ls.append(gene_id)
    del reader

    no_of_genes = len(gene_id_ls)
    gene_id_pairs_for_each_node = []
    for i in range(no_of_genes):
        gene1_id = gene_id_ls[i]
        n1 = len(pdata.gene_id2snps_id_ls[gene1_id])	#no_of_snps_of_this_gene
        #for gene2_id in pdata.gene_id2snps_id_ls:	#2009-2-8 another setting: gene_id_fname vs all genes
        for j in range(i + 1, no_of_genes):
            gene2_id = gene_id_ls[j]
            n2 = len(pdata.gene_id2snps_id_ls[gene2_id])
            est_no_of_tests = (n1 * n2) * no_of_phenotypes	#upper bound for the number of tests for each gene on a computing node. data missing would make the number smaller.
            no_of_tests_per_node += est_no_of_tests
            gene_id_pairs_for_each_node.append((gene1_id, gene2_id))
            if no_of_tests_per_node >= block_size:
                yield gene_id_pairs_for_each_node
                #reset gene_id_pairs_for_each_node
                gene_id_pairs_for_each_node = []
                no_of_tests_per_node = 0	#reset this to 0
    #pick up the last gene_id_pairs_for_each_node if it's not empty
    if gene_id_pairs_for_each_node:
        yield gene_id_pairs_for_each_node
def dropRedundantEcotypes(self, input_fname, ecotypeid2tg_ecotypeid):
    """
    2009-4-4
        retain only one row out of duplicated ecotype rows, based on ecotypeid2tg_ecotypeid.
        it's not random: usually the one whose ecotype id equals tg_ecotypeid, unless tg_ecotypeid doesn't appear.
        if duplicated ecotypes belong to different haplotype groups, choose the one with tg_ecotypeid, otherwise random.
    """
    sys.stderr.write("Dropping redundant ecotypes ...\n")
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    col_name2col_index = getColName2IndexFromHeader(reader.next())
    ecotypeid_idx = col_name2col_index['ecotypeid']
    haplo_name_idx = col_name2col_index['haplogroup']
    nativename_idx = col_name2col_index['nativename']
    tg_ecotypeid2row = {}
    no_of_duplicates = 0
    no_of_duplicates_with_different_haplogroups = 0
    counter = 0
    for row in reader:
        ecotypeid = int(row[ecotypeid_idx])
        haplo_name = row[haplo_name_idx]
        nativename = row[nativename_idx]
        if ecotypeid in ecotypeid2tg_ecotypeid:
            tg_ecotypeid = ecotypeid2tg_ecotypeid[ecotypeid]
            if tg_ecotypeid not in tg_ecotypeid2row:
                tg_ecotypeid2row[tg_ecotypeid] = row
            else:
                no_of_duplicates += 1
                old_row = tg_ecotypeid2row[tg_ecotypeid]
                old_ecotypeid = int(old_row[ecotypeid_idx])
                old_haplo_name = old_row[haplo_name_idx]
                old_nativename = old_row[nativename_idx]	#read from the previously-kept row, not the current one
                if old_haplo_name != haplo_name:
                    sys.stderr.write("ecotype %s(%s) in haplotype group %s, while duplicate %s(%s) in haplotype group %s.\n" % \
                        (ecotypeid, nativename, haplo_name, old_ecotypeid, old_nativename, old_haplo_name))
                    no_of_duplicates_with_different_haplogroups += 1
                if ecotypeid == tg_ecotypeid:	#replace if the new ecotypeid matches the tg_ecotypeid, whether the haplotype group is the same or not.
                    tg_ecotypeid2row[tg_ecotypeid] = row
        else:
            sys.stderr.write("Warning: ecotype %s not in ecotypeid2tg_ecotypeid.\n" % (ecotypeid))
        counter += 1
    sys.stderr.write("no_of_duplicates: %s, out of which %s encompass different haplotype groups. %s accessions in total. Done.\n" % \
        (no_of_duplicates, no_of_duplicates_with_different_haplogroups, counter))
    return tg_ecotypeid2row
def run(self):
    """
    2008-9-7
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, \
        data_matrix=data_matrix)
    newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)
    if self.mapping_fname:	#output allele_index2allele_ls
        self.output_allele2index_ls(snpData, allele_index2allele_ls, self.mapping_fname)
    newSnpData.tofile(self.output_fname)
def readInRefArrayData(self, input_fname, ref_array_id_set=None):
    """
    2010-5-25
    """
    sys.stderr.write("Getting data matrix for reference arrays.\n")
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    for i in xrange(3):	#skip the first 3 rows
        reader.next()
    data_matrix = []
    for row in reader:
        array_id = int(row[0])
        if array_id in ref_array_id_set:
            data_matrix.append(map(float, row[2:]))
    del reader
    data_matrix = numpy.array(data_matrix)
    sys.stderr.write("%s arrays, %s probes. Done.\n" % (data_matrix.shape[0], data_matrix.shape[1]))
    return data_matrix
def run(self):
    """
    Sample values from one column of the input file and output the mean/median/mode
    depth statistics for one alignment.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    inf = utils.openGzipFile(self.inputFname, openMode='r')
    reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
    header = None
    for i in xrange(self.noOfLinesInHeader):
        if i == 0:
            header = reader.next()	#first line is taken as header
        else:
            reader.next()
    if header is not None:
        colName2Index = getColName2IndexFromHeader(header)

    newHeader = ['alignmentID', 'total_base_count', 'sampled_base_count', 'meanDepth', 'medianDepth', 'modeDepth']
    inputStatLs = []

    writer = csv.writer(utils.openGzipFile(self.outputFname, openMode='w'), delimiter='\t')
    writer.writerow(newHeader)
    counter = 0
    real_counter = 0
    for row in reader:
        counter += 1
        if real_counter <= self.maxNumberOfSamplings:
            r = random.random()
            if r <= self.fractionToSample and real_counter <= self.maxNumberOfSamplings:
                inputStatLs.append(float(row[self.whichColumn]))
                real_counter += 1

    meanDepth = numpy.mean(inputStatLs)
    medianDepth = numpy.median(inputStatLs)
    modeDepth = scipy.stats.mode(inputStatLs)[0][0]
    outputRow = [self.alignmentID, counter, real_counter, meanDepth, medianDepth, modeDepth]
    writer.writerow(outputRow)
    del writer
def get_snp_region_ls(self, ft_region_fname, snp_info, min_distance=5000):
    sys.stderr.write("Get all snp regions ...")
    delimiter = figureOutDelimiter(ft_region_fname)
    ft_region_reader = csv.reader(open(ft_region_fname, 'r'), delimiter=delimiter)
    snp_region_ls = []
    for row in ft_region_reader:
        row = map(int, row)
        chr1, pos1, chr2, pos2 = row
        if pos2 < pos1:
            pos1, pos2 = pos2, pos1
        span = abs(pos2 - pos1)
        if span < min_distance * 2:	#pad the region symmetrically up to 2*min_distance
            extra_span = (min_distance * 2 - span) / 2
            pos1 = max(pos1 - extra_span, 1)
            pos2 = pos2 + extra_span
        snp_region = self.findSNPsInRegion(snp_info, chr1, pos1, pos2, center_snp_position=None)
        snp_region_ls.append(snp_region)
    del ft_region_reader
    sys.stderr.write("Done.\n")
    return snp_region_ls
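# Illustrative note (not part of the original module): regions narrower than 2*min_distance
# are padded symmetrically up to that width. For example, with min_distance=5000, a region
# spanning 100000-101000 (span 1000) gets extra_span = (10000 - 1000) / 2 = 4500 on each side,
# becoming 95500-105500 (span 10000); the max(..., 1) keeps the start from going below position 1.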
def run(self):
    """
    2008-5-12
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    if self.array_id_2nd_column:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, \
            data_matrix=data_matrix)
    else:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
            data_matrix=data_matrix)	#ignore category_list
    rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1, report=self.report)
    chromosomes = [rawSnpsData.chromosome for rawSnpsData in rawSnpsData_ls]
    snpsdata.writeRawSnpsDatasToFile(self.output_fname, rawSnpsData_ls, chromosomes=chromosomes, deliminator=',', \
        withArrayIds=self.array_id_2nd_column)
def run(self):
    """
    2007-02-27
    2007-09-14
        filtering_bits
        -read_data()
        -remove_rows_with_too_many_NAs()
        -remove_cols_with_too_many_NAs()
        -remove_identity_strains()
        -write_data_matrix()
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, int(self.nt_alphabet_bits[0]), delimiter=delimiter)
    data_matrix = numpy.array(data_matrix)

    if self.filtering_bits[0] == '1':
        remove_rows_data = self.remove_rows_with_too_many_NAs(data_matrix, self.row_cutoff)
        rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
        strain_index2no_of_NAs = remove_rows_data.row_index2no_of_NAs
    else:
        rows_with_too_many_NAs_set = set()
    if self.filtering_bits[1] == '1':
        remove_cols_data = self.remove_cols_with_too_many_NAs(data_matrix, self.col_cutoff, rows_with_too_many_NAs_set)	#col_cutoff is an instance attribute; the bare name is undefined here
        cols_with_too_many_NAs_set = remove_cols_data.cols_with_too_many_NAs_set
    else:
        cols_with_too_many_NAs_set = set()
    if self.filtering_bits[2] == '1':
        no_of_rows, no_of_cols = data_matrix.shape
        total_rows_set = set(range(no_of_rows))
        rows_to_be_checked = total_rows_set - rows_with_too_many_NAs_set
        total_cols_set = set(range(no_of_cols))
        cols_to_be_checked = total_cols_set - cols_with_too_many_NAs_set
        identity_strains_to_be_removed = self.remove_identity_strains(data_matrix, rows_to_be_checked, cols_to_be_checked)
    else:
        identity_strains_to_be_removed = set()
    rows_to_be_tossed_out = rows_with_too_many_NAs_set | identity_strains_to_be_removed
    #self.write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, rows_to_be_tossed_out, cols_with_too_many_NAs_set, int(self.nt_alphabet_bits[1]))
    write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, \
        rows_to_be_tossed_out, cols_with_too_many_NAs_set, nt_alphabet=int(self.nt_alphabet_bits[1]), \
        delimiter=delimiter)
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    curs = conn.cursor()
    if self.ecotype_duplicate2tg_ecotypeid_table:
        ecotype_duplicate2tg_ecotypeid = self.get_ecotype_duplicate2tg_ecotypeid(curs, self.ecotype_duplicate2tg_ecotypeid_table)
    else:
        ecotype_duplicate2tg_ecotypeid = None
    from pymodule import figureOutDelimiter
    delimiter = figureOutDelimiter(self.input_fname)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    tg_ecotypeid2ecotypeid_duplicate_index_ls = self.get_tg_ecotypeid2ecotypeid_duplicate_index_ls(strain_acc_list, \
        category_list, ecotype_duplicate2tg_ecotypeid)
    ecotypeid2nativename = get_ecotypeid2nativename(curs, ecotype_table=self.ecotype_table)
    tg_ecotypeid_ls, merge_matrix = self.get_merged_matrix(tg_ecotypeid2ecotypeid_duplicate_index_ls, data_matrix, \
        ecotypeid2nativename, self.stat_output_fname)

    tg_nativename_ls = []
    for ecotypeid in tg_ecotypeid_ls:
        tg_nativename_ls.append(ecotypeid2nativename[ecotypeid])
    header[1] = 'nativename'
    write_data_matrix(merge_matrix, self.output_fname, header, tg_ecotypeid_ls, tg_nativename_ls, delimiter=delimiter)
def trioInconsistentRateFileWalker(cls, inputFname, processFunc=None, minNoOfTotal=100, run_type=1):
    """
    2012.10.25
        only skip except during file opening, not file reading
    2011-9-30
    """
    try:
        reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
        header = reader.next()
        col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
    except:
        sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
        import traceback
        traceback.print_exc()
        return
    inconsistent_rate_index = col_name2index.get("inconsistency")
    if run_type == 1:
        index_of_x_data = col_name2index.get("stopFrequency")
    elif run_type == 2:
        index_of_x_data = col_name2index.get("stop")
    else:
        sys.stderr.write("Unsupported run_type %s in trioInconsistentRateFileWalker().\n" % (run_type))
        sys.exit(3)
    index_of_no_of_total = col_name2index.get("no_of_total")
    inconsistent_rate_ls = []
    x_ls = []
    for row in reader:
        if cls.samplingRate < 1 and cls.samplingRate >= 0:	#samplingRate is a class attribute; this is a classmethod, so use cls
            r = random.random()
            if r > cls.samplingRate:
                continue
        no_of_total = int(float(row[index_of_no_of_total]))
        if no_of_total <= minNoOfTotal:
            continue
        inconsistency = float(row[inconsistent_rate_index])
        inconsistent_rate_ls.append(inconsistency)
        x_data = float(row[index_of_x_data])
        x_ls.append(x_data)
    if processFunc is not None:	#guard against the default processFunc=None
        processFunc(x_ls, inconsistent_rate_ls)
    del reader
def generate_params(self, gene_id_fname, pdata, block_size=1000, **keywords):
    """
    2009-2-12
        use yield to make this function a generator
    2009-2-9
        add argument gene_id_fname to restrict analysis to genes from it
    2008-09-09
        estimate the number of tests each gene would encompass, and decide how many genes
        should be included in a set to send out
    2008-09-06
        each node handles a certain number of genes, identified by the index of the 1st gene
        and the index of the last gene.
    """
    no_of_phenotypes = len(pdata.phenotype_index_ls)
    start_index = 0	#for each computing node: the index of gene >= start_index
    no_of_tests_per_node = 0
    #2009-2-9
    if gene_id_fname and os.path.isfile(gene_id_fname):
        reader = csv.reader(open(gene_id_fname), delimiter=figureOutDelimiter(gene_id_fname))
        gene_id_ls = []
        for row in reader:
            gene_id = int(row[0])
            gene_id_ls.append(gene_id)
        del reader
        pdata.gene_id_ls = gene_id_ls	#replace pdata's gene_id_ls
    no_of_genes = len(pdata.gene_id_ls)
    for i in range(no_of_genes):
        gene_id = pdata.gene_id_ls[i]
        n = len(pdata.gene_id2snps_id_ls[gene_id])	#no_of_snps_of_this_gene
        est_no_of_tests = (n * (n - 1) * 5 / 2.0 + n) * no_of_phenotypes	#upper bound for the number of tests for each gene on a computing node. data missing would make the number smaller.
        no_of_tests_per_node += est_no_of_tests
        if no_of_tests_per_node >= block_size:
            yield (start_index, i + 1)	#the computing node handles genes from pdata.gene_id_ls[start_index] to pdata.gene_id_ls[i]
            #reset the starting pointer to the index of the next gene
            start_index = i + 1
            no_of_tests_per_node = 0	#reset this to 0
        elif i == no_of_genes - 1:	#this is the last gene, have to include it
            yield (start_index, i + 1)
def run(self):
    """
    2008-9-7
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, \
        data_matrix=data_matrix)
    newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)
    if self.mapping_fname:	#output allele_index2allele_ls
        self.output_allele2index_ls(snpData, allele_index2allele_ls, self.mapping_fname)
    newSnpData.tofile(self.output_fname)
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()

    try:
        inf = utils.openGzipFile(self.inputFname)
        delimiter = figureOutDelimiter(inf)
        if not delimiter:
            delimiter = '\t'
        reader = csv.reader(inf, delimiter=delimiter)
        writer = csv.writer(open(self.outputFname, 'w'), delimiter=delimiter)
        extendHeader = []
        if self.addChrName:
            extendHeader.append(self.chrHeader)
        extendHeader.append(self.chrLengthHeader)
    except:
        sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
        import traceback
        traceback.print_exc()
        print sys.exc_info()
        sys.exit(0)

    try:
        header = self.processHeader(reader=reader, extendHeader=extendHeader, chrLengthHeader=self.chrLengthHeader)
        writer.writerow(header)
        for row in reader:
            new_data_row = self.processRow(row)
            writer.writerow(new_data_row)
        del reader
        del writer
    except:	#in case something is wrong (i.e. file is empty)
        sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
        import traceback
        traceback.print_exc()
        print sys.exc_info()
        sys.exit(0)
def get_input(cls, input_fname, data_type=numpy.float32):
    """
    2009-10-28
        switch the default data_type to numpy.float32 to save memory on 64-bit machines
    2009-9-28
        add argument data_type to specify the data type of data_matrix. default is numpy.float
        (numpy.float could be float32, float64, float128 depending on the architecture).
        numpy.double is also fine.
    2009-5-18
        become classmethod
    """
    sys.stderr.write("Getting input from %s ..." % input_fname)
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    commandline = 'wc -l %s' % input_fname
    command_handler = subprocess.Popen(commandline, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout_content, stderr_content = command_handler.communicate()
    if stderr_content:
        sys.stderr.write('stderr of %s: %s \n' % (commandline, stderr_content))
    no_of_rows = int(stdout_content.split()[0]) - 1	#minus the header line
    header = reader.next()
    no_of_cols = len(header) - 3
    data_matrix = numpy.zeros([no_of_rows, no_of_cols], data_type)
    probe_id_ls = []
    chr_pos_ls = []
    i = 0
    for row in reader:
        probe_id = row[0]
        probe_id_ls.append(probe_id)
        chr_pos_ls.append(row[-2:])
        for j in range(1, 1 + no_of_cols):
            data_matrix[i][j - 1] = float(row[j])
        i += 1
    sys.stderr.write("Done.\n")
    return data_matrix, probe_id_ls, chr_pos_ls, header
def __init__(self, inputFname=None, **keywords):
    self.ad = ProcessOptions.process_function_arguments(keywords, self.option_default_dict, error_doc=self.__doc__, \
        class_to_have_attr=self)
    if not self.inputFname:
        self.inputFname = inputFname
    if self.inputFname and self.inputFile is None:
        self.inputFile = utils.openGzipFile(self.inputFname, openMode=self.openMode)
    self.filename = self.inputFname	#2013.05.03 for easy access

    self.csvFile = None
    self.isRealCSV = False
    if self.openMode == 'r':	#reading mode
        if self.delimiter is None:
            self.delimiter = figureOutDelimiter(self.inputFile)
        if self.delimiter == '\t' or self.delimiter == ',':
            self.csvFile = csv.reader(self.inputFile, delimiter=self.delimiter)
            self.isRealCSV = True
        else:
            self.csvFile = self.inputFile
            self.isRealCSV = False
    else:	#writing mode
        if not self.delimiter:
            self.delimiter = '\t'
        self.csvFile = csv.writer(self.inputFile, delimiter=self.delimiter)
        self.isRealCSV = True
        #else:
        #	self.csvFile = self.inputFile
        #	self.isRealCSV = False
    self.col_name2index = None
    self._row = None	#2013.08.30 to store the current row being read
    self.headerPattern = re.compile(r'^[a-zA-Z]')	#default header pattern: line begins with a letter
    self.commentPattern = re.compile(r'^#')	#default comment pattern: line begins with #
    self.comment_row_list = []
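# Illustrative usage sketch (not part of the original module; the enclosing class name is
# assumed here, MatrixFile-style, purely for illustration). In reading mode the constructor
# sniffs the delimiter and wraps the file in a csv.reader only for tab/comma files, so
# iteration yields a list per row for real CSVs and a raw line string otherwise:
#
#	reader = MatrixFile(inputFname='input.tsv.gz', openMode='r')	#hypothetical instantiation
#	for row in reader.csvFile:
#		pass	#row is a list if reader.isRealCSV, else the raw line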
def putGeneListIntoDb(self, input_fname, list_type_id, list_type_name, gene_symbol2gene_id_set, db, skip_1st_line=False):
    """
    2009-10-18
        If the first column (gene symbol) is full of numbers, it's taken as a legitimate Gene ID.
    2009-2-4
        use getGeneIDSetGivenAccVer() from pymodule.utils to find gene_id_set
    2008-01-08
        add option skip_1st_line.
        stop using csv.reader, use raw file handler instead.
        figureOutDelimiter() is modified not to use csv.Sniffer() by default; it'll return
        delimiter None if the file is single-column.
    2008-12-11
        more filtering:
        1. strip the original_name
        2. pick alphanumeric characters out of original_name
        if GeneListType is already in db, check whether GeneList already has this gene or not.
    2008-11-20
        use figureOutDelimiter() to get the delimiter automatically
    2008-07-15
        if list_type_name is given, forget about list_type_id. the program first searches db for
        the given list_type_name; if the search fails, it creates a new entry.
    2008-07-15
        use gene_id2original_name to avoid redundancy in the gene list
    """
    import csv, sys, os
    session = db.session
    delimiter = figureOutDelimiter(input_fname)	#2008-11-20
    inf = open(input_fname)
    if skip_1st_line:
        inf.next()	#skips the 1st line
    counter = 0
    success_counter = 0
    gene_id2original_name = {}	#to avoid redundancy in gene list
    for line in inf:
        if line == '\n':	#skip empty lines
            continue
        row = line.split(delimiter)
        original_name = row[0].strip()	#2008-12-11 remove spaces/tabs in the beginning/end
        all_number_p_search_result = self.all_number_p.search(original_name)
        if all_number_p_search_result:	#2009-10-18 original_name is full of numbers. a legitimate Gene ID.
            ecotypeid = int(all_number_p_search_result.group(0))
            gene_id_set = set([ecotypeid])
        else:
            gene_id_set = getGeneIDSetGivenAccVer(original_name, gene_symbol2gene_id_set)
        if gene_id_set is None:
            sys.stderr.write("Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n" % (original_name))
        elif len(gene_id_set) == 1:
            gene_id = list(gene_id_set)[0]
            if gene_id not in gene_id2original_name:
                gene_id2original_name[gene_id] = original_name
            success_counter += 1
        elif len(gene_id_set) > 1:
            sys.stderr.write("Too many gene_ids for %s: %s.\n" % (original_name, gene_id_set))
        elif len(gene_id_set) == 0:
            sys.stderr.write("Linking to gene id failed for %s. gene_id_set is empty.\n" % (original_name))
        else:
            sys.stderr.write("not supposed to happen: original_name=%s, gene_id_set=%s.\n" % (original_name, gene_id_set))
        counter += 1
    del inf

    if list_type_name:	#if the short name is given, forget about list_type_id
        glt = GeneListType.query.filter_by(short_name=list_type_name).first()	#try searching the db first.
        if not glt:
            glt = GeneListType(short_name=list_type_name)
            session.save(glt)
            session.flush()
    else:	#use the list_type_id to get it
        glt = GeneListType.get(list_type_id)
    glt.original_filename = input_fname	#save the filename
    session.save_or_update(glt)

    for gene_id, original_name in gene_id2original_name.iteritems():
        if glt.id:	#2008-12-11 GeneListType is already in db. check whether GeneList already has this gene or not.
            rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(list_type_id=glt.id)
            if rows.count() > 0:
                sys.stderr.write("Gene %s (%s) already with list type %s.\n" % (gene_id, original_name, glt.short_name))
                continue
        gl = GeneList(gene_id=gene_id, list_type=glt, original_name=original_name)
        session.save(gl)
    sys.stderr.write("%s/%s linked successfully.\n" % (success_counter, counter))
def outputSNPDataInNewCoordinate(self, querySNPDataFname=None, querySNPID2NewReferenceCoordinateLs=None, \
        newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
    """
    2013.07.03
        added argument newSNPDataOutputFormat
    2012.10.14
        split out of findSNPPositionOnNewRef()
    """
    sys.stderr.write("Converting querySNPDataFname %s into individual X SNP format, format=%s ... " % \
        (querySNPDataFname, newSNPDataOutputFormat))
    """
    Sample	Geno	SNP
    1999010	CC	cs_primer1082_247
    1999068	CC	cs_primer1082_247
    2000022	CT	cs_primer1082_247
    2000064	CT	cs_primer1082_247
    2000117	CC	cs_primer1082_247
    """
    inf = utils.openGzipFile(querySNPDataFname)
    reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
    col_name2index = getColName2IndexFromHeader(reader.next())
    sampleIndex = col_name2index.get("Sample")
    genotypeIndex = col_name2index.get("Geno")
    SNPIDIndex = col_name2index.get("SNP")
    row_id2index = {}
    row_id_ls = []
    col_id_ls = []
    col_id2index = {}
    row_col_index2genotype = {}
    for row in reader:
        sampleID = row[sampleIndex]
        genotype = row[genotypeIndex]
        querySNPID = row[SNPIDIndex]
        if querySNPID in querySNPID2NewReferenceCoordinateLs:
            newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs.get(querySNPID)
            if len(newRefCoordinateLs) == 1:
                newRefCoordinate = newRefCoordinateLs[0]
                if newSNPDataOutputFormat == 2:
                    col_id = '%s_%s' % (newRefCoordinate.newChr, newRefCoordinate.newRefStart)
                else:
                    col_id = '%s_%s_%s' % (newRefCoordinate.newChr, newRefCoordinate.newRefStart, newRefCoordinate.newRefStop)
                queryStrand = newRefCoordinate.queryStrand
                if col_id not in col_id2index:
                    col_id2index[col_id] = len(col_id2index)
                    col_id_ls.append(col_id)
                if sampleID not in row_id2index:
                    row_id2index[sampleID] = len(row_id2index)
                    row_id_ls.append(sampleID)
                if queryStrand == "-":
                    genotype = SNP.reverseComplement(genotype)
                row_index = row_id2index[sampleID]
                col_index = col_id2index[col_id]
                row_col_index2genotype[(row_index, col_index)] = genotype
            else:
                continue
    data_matrix = numpy.zeros([len(row_id_ls), len(col_id2index)], dtype=numpy.int8)

    for row_col_index, genotype in row_col_index2genotype.iteritems():
        row_index, col_index = row_col_index[:2]
        data_matrix[row_index, col_index] = SNP.nt2number[genotype]
    sys.stderr.write("\n")
    snpData = SNP.SNPData(row_id_ls=row_id_ls, col_id_ls=col_id_ls, data_matrix=data_matrix)
    snpData.tofile(newSNPDataOutputFname)
def putHaplotypeGroupIntoDB(self, session, input_fname, max_snp_typing_error_rate, snp_id_ls):
    """
    2009-4-10
        remove tg_ecotypeid2row
    2009-4-4
        add argument tg_ecotypeid2row
    2009-3-31
    """
    sys.stderr.write("Constructing haplotype groups ...\n")
    pattern_ecotypeid = re.compile(r'(?<=\))\d+')
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    col_name2col_index = getColName2IndexFromHeader(reader.next())
    ecotypeid_idx = col_name2col_index['ecotypeid']
    haplo_name_idx = col_name2col_index['haplogroup']
    geographic_integrity_idx = col_name2col_index['geographic_integrity']
    filtered_SNPs_idx = col_name2col_index['filtered_SNPs']
    counter = 0
    #for tg_ecotypeid, row in tg_ecotypeid2row.iteritems():
    for row in reader:
        ecotypeid = int(row[ecotypeid_idx])
        #ecotypeid = tg_ecotypeid	#2009-4-4 use tg_ecotypeid instead
        haplo_name = row[haplo_name_idx]
        geographic_integrity_name = row[geographic_integrity_idx]
        filtered_SNPs = row[filtered_SNPs_idx]
        ref_ecotypeid = int(pattern_ecotypeid.search(haplo_name).group(0))
        haplo_group = StockDB.HaploGroup.query.filter_by(short_name=haplo_name).first()
        if not haplo_group:
            haplo_group = StockDB.HaploGroup(short_name=haplo_name, ref_ecotypeid=ref_ecotypeid, \
                max_snp_typing_error_rate=max_snp_typing_error_rate)
            session.save(haplo_group)
            session.flush()
        ecotype = StockDB.Ecotype.get(ecotypeid)
        haplo_group.ecotypes.append(ecotype)
        geographic_integrity = StockDB.GeographicIntegrity.query.filter_by(short_name=geographic_integrity_name).first()
        if not geographic_integrity:
            geographic_integrity = StockDB.GeographicIntegrity(short_name=geographic_integrity_name)
            session.save(geographic_integrity)
            session.flush()
        ecotype.geographic_integrity = geographic_integrity
        session.save_or_update(ecotype)	#one bit of ecotype: link the ecotypeid to tg_ecotype_id
        #deal with filtered SNPs
        for i in range(len(filtered_SNPs)):
            allele = filtered_SNPs[i]
            if allele == '_':
                continue
            fc = StockDB.FilteredCalls(ecotypeid=ecotypeid, snpid=snp_id_ls[i], allele=allele)
            session.save(fc)
            session.flush()
        counter += 1
        if counter % 500 == 0 and self.report:
            sys.stderr.write('%s%s' % ('\x08' * 80, counter))
    session.flush()
    sys.stderr.write("Done.\n")
def copyAndReformatResultFile(self, db, inputFname=None, db_entry=None, user=None, output_fname=None):
    """
    2011-2-22
        Loci are now identified as Snps.id / CNV.id in association result files, (chr, pos) before.
    2009-1-7
        insert float() in the middle below: column_5th=int(float(row[4])). int('89.0') would raise an exception.
    2008-11-12
        parse lines with column_6 (genotype_var_perc) and more (comment)
    2008-09-30
        deal with 5-column file. The 5th column is minor allele count.
        also return True in the end. return False if error in the middle.
    2008-08-19
        add original_filename to ResultsMethod
    2008-07-16
        if inputFname is neither a file name nor a file object, exit the program.
        better handling of the column_4th and its header.
    2008-07-16
        if it's 4-column, the last one is MAF. can't deal with segment score anymore.
    2008-05-30
        merged with store_file(): dump the file onto file-system storage if output_fname
        is given. db submission is too slow.
    2008-05-26
        inputFname from plone is not a file object although it has a file-object interface.
    2008-05-26
        csv.Sniffer() can't figure out the delimiter if '\n' is in the string; use own dumb
        function figureOutDelimiter().
    2008-05-25
        save marker (snps) in database if it's not there. use marker id in results table.
    2008-05-24
        figure out the delimiter automatically. inputFname could be a file object (from plone).
        phenotype method doesn't go with results anymore; it goes with results_method.
    2008-04-28
        changed to use Stock_250kDatabase (SQLAlchemy) to do db submission
    """
    if isinstance(inputFname, (str, unicode)) and os.path.isfile(inputFname):
        sys.stderr.write("Submitting results from %s ..."%(os.path.basename(inputFname)))
        delimiter = figureOutDelimiter(inputFname)
        reader = csv.reader(open(inputFname), delimiter=delimiter)
        db_entry.original_filename = inputFname
    elif hasattr(inputFname, 'readline') or hasattr(inputFname, 'read'):
        #inputFname is not a file name but a file object; it could also be
        #<ZPublisher.HTTPRequest.FileUpload instance at 0xa1774f4c>
        sys.stderr.write("Submitting results from %s on plone ..."%inputFname.filename)
        cs = csv.Sniffer()
        inputFname.seek(0)	#it's already read by plone to put into data['inputFname'], check results2db_250k.py
        if getattr(inputFname, 'readline', None) is not None:
            test_line = inputFname.readline()
            delimiter = cs.sniff(test_line).delimiter
        else:
            test_line = inputFname.read(200)
            #counting is a safer solution. if test_line includes '\n', cs.sniff() won't figure it out.
            delimiter = figureOutDelimiter(test_line)
        inputFname.seek(0)
        reader = csv.reader(inputFname, delimiter=delimiter)
        if getattr(inputFname, 'filename', None):
            db_entry.original_filename = getattr(inputFname, 'filename', None)
        else:
            db_entry.original_filename = getattr(inputFname, 'name', None)
    else:
        sys.stderr.write("Error: %s is neither a file name nor a file object.\n"%inputFname)
        sys.exit(4)

    if output_fname:
        if os.path.isfile(output_fname):
            sys.stderr.write("Error: file %s already exists. Skip.\n"%output_fname)
            return False
        writer = csv.writer(open(output_fname, 'w'), delimiter='\t')
    elif self.marker_pos2snp_id is None:
        self.marker_pos2snp_id = self.get_marker_pos2snp_id(db)

    header_outputted = 0
    no_of_lines = 0
    session = db.session
    for row in reader:
        #check if the 1st line is a header: the 2nd column should be strictly
        #digits, while the 1st column (chromosome) could be 'X' or something.
        if no_of_lines == 0 and self.pa_has_characters.search(row[1]):
            continue
        snp_id = int(row[0])
        if row[1] and row[1] != '0':	#2011-2-22 something in the 2nd column => old (chr, pos) format
            chr = int(row[0])
            start_pos = int(row[1])
            sys.stderr.write("Error: current version doesn't take chr,pos as marker ID anymore. Has to be one id (either Snps.id or CNV.id).\n")
            sys.exit(4)
        score = row[2]
        stop_pos = None
        column_4th = None
        column_5th = None
        column_6 = None
        rest_of_row = []
        rest_of_header = []
        if len(row) >= 4:
            column_4th = row[3]
        if len(row) >= 5:
            column_5th = int(float(row[4]))	#2009-1-7 int('89.0') would raise an exception
        if len(row) >= 6:
            column_6 = row[5]
        if len(row) >= 7:
            rest_of_row = row[6:]
            rest_of_header = ['beta%s'%i for i in range(len(rest_of_row))]
        if output_fname:	#go to file system
            if not header_outputted:
                header = ['snp_id', 'none', 'score']	#2011-2-22
                if column_4th is not None:
                    header.append('MAF')
                if column_5th is not None:
                    header.append('MAC')	#Minor Allele Count
                if column_6 is not None:
                    header.append('genotype_var_perc')	#genotype variance percentage
                if rest_of_row:
                    header += rest_of_header
                writer.writerow(header)
                header_outputted = 1
            data_row = [snp_id, '']	#2011-2-22 2nd column is a placeholder
            data_row.append(score)
            if column_4th is not None:
                data_row.append(column_4th)
            if column_5th is not None:
                data_row.append(column_5th)
                if db_entry.no_of_accessions is None:
                    #calculate no_of_accessions based on MAC/MAF
                    db_entry.no_of_accessions = int(round(float(column_5th)/float(column_4th)))
            if column_6 is not None:
                data_row.append(column_6)
            if rest_of_row:
                data_row += rest_of_row
            writer.writerow(data_row)
        """
        # 2011-2-22 store the results directly into db. only for old SNP association results.
        else:
            key = (chr, start_pos, stop_pos)
            if key in self.marker_pos2snp_id:
                snps_id = self.marker_pos2snp_id[key]
                if isinstance(snps_id, SNPs):	#it's a new marker object
                    r = Results(score=score)
                    r.snps = snps_id
                else:	#others are all integer ids
                    r = Results(snps_id=snps_id, score=score)
            else:
                #construct a new marker and save it in database to get an id
                marker = SNPs(name=marker_name, chromosome=chr, position=start_pos, end_position=stop_pos, created_by=user)
                session.add(marker)
                self.marker_pos2snp_id[key] = marker	#for the next time the same marker is encountered
                self.is_new_marker_added = True	#flag that a new marker went into the dict
                r = Results(score=score)
                r.snps = marker
                del marker
            r.results_method = db_entry
            session.add(r)
            del r
        """
        no_of_lines += 1
    del reader
    if output_fname:
        del writer
    sys.stderr.write("Done.\n")
    return 0
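# Hedged worked example of the MAC/MAF arithmetic above (numbers hypothetical):
# with MAC=89 (column_5th) and MAF=0.297 (column_4th),
#   no_of_accessions = int(round(89 / 0.297)) = int(round(299.66...)) = 300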
def traverse(self):
    """
    2012.1.9
    """
    newHeader = []
    key2dataLs = {}	#key is the keyColumn; dataLs corresponds to the sum of each column from valueColumnLs
    delimiter = None
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            if self.exitNonZeroIfAnyInputFileInexistent:
                sys.exit(3)
            else:
                continue
        reader = None
        try:
            inputFile = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inputFile)
            reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
        try:
            header = reader.next()
            self.handleNewHeader(header, newHeader, self.keyColumnLs, self.valueColumnLs, keyColumnSet=self.keyColumnSet)
            if self.noHeader:	#2012.8.10 the first line was data, not a header; rewind
                inputFile.seek(0)
                reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
        except:	#in case something goes wrong (i.e. the file is empty)
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
        if reader is not None:
            for row in reader:
                try:
                    self.handleValueColumns(row, key2dataLs=key2dataLs, keyColumnLs=self.keyColumnLs,
                        valueColumnLs=self.valueColumnLs)
                except:	#in case something goes wrong with this row
                    sys.stderr.write('Ignore this row: %s.\n'%repr(row))
                    sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
                    import traceback
                    traceback.print_exc()
            del reader
    if self.noHeader:	#2012.8.10
        newHeader = None
    returnData = PassingData(key2dataLs=key2dataLs, delimiter=delimiter, header=newHeader)
    return returnData
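# Hedged sketch of what this traverse() accumulates (column indices and file
# contents are hypothetical). With keyColumnLs=[0] and valueColumnLs=[1], rows
# ('Chr1', 10), ('Chr1', 5) and ('Chr2', 7) would leave
#   key2dataLs == {('Chr1',): [15], ('Chr2',): [7]}
# assuming handleValueColumns() sums the value columns per key, as the
# key2dataLs comment above suggests.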
def submit_results(cls, db, input_fname, rm, user, output_fname=None):
    """
    2009-1-7
        insert float() in the middle below: column_5th=int(float(row[4])). int('89.0') would raise an exception.
    2008-11-12
        parse lines with column_6 (genotype_var_perc) and more (comment)
    2008-09-30
        deal with 5-column file. The 5th column is minor allele count.
        also return True in the end. return False if error in the middle.
    2008-08-19
        add original_filename to ResultsMethod
    2008-07-16
        if input_fname is neither a file name nor a file object, exit the program.
        better handling of the column_4th and its header.
    2008-07-16
        if it's 4-column, the last one is MAF. can't deal with segment score anymore.
    2008-05-30
        merged with store_file(): dump the file onto file-system storage if output_fname
        is given. db submission is too slow.
    2008-05-26
        input_fname from plone is not a file object although it has a file-object interface.
    2008-05-26
        csv.Sniffer() can't figure out the delimiter if '\n' is in the string; use own dumb
        function figureOutDelimiter().
    2008-05-25
        save marker (snps) in database if it's not there. use marker id in results table.
    2008-05-24
        figure out the delimiter automatically. input_fname could be a file object (from plone).
        phenotype method doesn't go with results anymore; it goes with results_method.
    2008-04-28
        changed to use Stock_250kDatabase (SQLAlchemy) to do db submission
    """
    if isinstance(input_fname, str) and os.path.isfile(input_fname):
        sys.stderr.write("Submitting results from %s ..."%(os.path.basename(input_fname)))
        delimiter = figureOutDelimiter(input_fname)
        reader = csv.reader(open(input_fname), delimiter=delimiter)
        rm.original_filename = input_fname
    elif hasattr(input_fname, 'readline') or hasattr(input_fname, 'read'):
        #input_fname is not a file name but a file object; it could also be
        #<ZPublisher.HTTPRequest.FileUpload instance at 0xa1774f4c>
        sys.stderr.write("Submitting results from %s on plone ..."%input_fname.filename)
        cs = csv.Sniffer()
        input_fname.seek(0)	#it's already read by plone to put into data['input_fname'], check results2db_250k.py
        if getattr(input_fname, 'readline', None) is not None:
            test_line = input_fname.readline()
            delimiter = cs.sniff(test_line).delimiter
        else:
            test_line = input_fname.read(200)
            #counting is a safer solution. if test_line includes '\n', cs.sniff() won't figure it out.
            delimiter = figureOutDelimiter(test_line)
        input_fname.seek(0)
        reader = csv.reader(input_fname, delimiter=delimiter)
        if getattr(input_fname, 'filename', None):
            rm.original_filename = getattr(input_fname, 'filename', None)
        else:
            rm.original_filename = getattr(input_fname, 'name', None)
    else:
        sys.stderr.write("Error: %s is neither a file name nor a file object.\n"%input_fname)
        sys.exit(4)

    if output_fname:
        writer = csv.writer(open(output_fname, 'w'), delimiter='\t')
    elif cls.marker_pos2snp_id is None:
        cls.marker_pos2snp_id = cls.get_marker_pos2snp_id(db)

    header_outputted = 0
    no_of_lines = 0
    session = db.session
    for row in reader:
        #check if the 1st line is a header: the 2nd column should be strictly
        #digits, while the 1st column (chromosome) could be 'X' or something.
        if no_of_lines == 0 and cls.pa_has_characters.search(row[1]):
            continue
        chr = int(row[0])
        start_pos = int(row[1])
        score = row[2]
        stop_pos = None
        column_4th = None
        column_5th = None
        column_6 = None
        rest_of_row = []
        rest_of_header = []
        marker_name = '%s_%s'%(chr, start_pos)
        if len(row) >= 4:
            column_4th = row[3]
        if len(row) >= 5:
            column_5th = int(float(row[4]))	#2009-1-7 int('89.0') would raise an exception
        if len(row) >= 6:
            column_6 = row[5]
        if len(row) >= 7:
            rest_of_row = row[6:]
            rest_of_header = ['beta%s'%i for i in range(len(rest_of_row))]
        if output_fname:	#go to file system
            if not header_outputted:	#3-column or 4-column header
                if stop_pos is not None:
                    position_header = ['start_position', 'stop_position']
                else:
                    position_header = ['position']
                header = ['chromosome'] + position_header + ['score']
                if column_4th is not None:
                    header.append('MAF')
                if column_5th is not None:
                    header.append('MAC')	#Minor Allele Count
                if column_6 is not None:
                    header.append('genotype_var_perc')	#genotype variance percentage
                if rest_of_row:
                    header += rest_of_header
                writer.writerow(header)
                header_outputted = 1
            data_row = [chr, start_pos]
            if stop_pos is not None:
                data_row.append(stop_pos)
            data_row.append(score)
            if column_4th is not None:
                data_row.append(column_4th)
            if column_5th is not None:
                data_row.append(column_5th)
            if column_6 is not None:
                data_row.append(column_6)
            if rest_of_row:
                data_row += rest_of_row
            writer.writerow(data_row)
        else:
            key = (chr, start_pos, stop_pos)
            if key in cls.marker_pos2snp_id:
                snps_id = cls.marker_pos2snp_id[key]
                if isinstance(snps_id, SNPs):	#it's a new marker object
                    r = Results(score=score)
                    r.snps = snps_id
                else:	#others are all integer ids
                    r = Results(snps_id=snps_id, score=score)
            else:
                #construct a new marker and save it in database to get an id
                marker = SNPs(name=marker_name, chromosome=chr, position=start_pos, end_position=stop_pos, created_by=user)
                session.save(marker)
                cls.marker_pos2snp_id[key] = marker	#for the next time the same marker is encountered
                cls.is_new_marker_added = True	#flag that a new marker went into the dict
                r = Results(score=score)
                r.snps = marker
                del marker
            r.results_method = rm
            session.save(r)
            del r
        no_of_lines += 1
    del reader
    if output_fname:
        del writer
    sys.stderr.write("Done.\n")
    return True
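# Hedged aside on the two delimiter strategies above: csv.Sniffer() infers the
# dialect from one sample line, while figureOutDelimiter() counts candidate
# delimiters, which tolerates samples that embed '\n'. Illustrative only:
#
#   >>> import csv
#   >>> csv.Sniffer().sniff('chr\tpos\tscore\n').delimiter
#   '\t'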
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    no_of_result1_peaks_ls = []
    no_of_result2_peaks_ls = []
    fraction_of_result1_peaks_in_result2_ls = []
    fraction_of_result2_peaks_in_result1_ls = []
    no_of_combined_peaks_ls = []
    fraction_of_overlap_in_combined_peaks_ls = []
    for inputFname in self.inputFnameLs:
        reader = csv.reader(open(inputFname), delimiter=figureOutDelimiter(inputFname))
        header = reader.next()
        col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
        no_of_result1_peaks_index = col_name2index.get("no_of_result1_peaks")
        no_of_result2_peaks_index = col_name2index.get("no_of_result2_peaks")
        no_of_result1_peaks_in_result2_index = col_name2index.get("no_of_result1_peaks_in_result2")
        no_of_result2_peaks_in_result1_index = col_name2index.get("no_of_result2_peaks_in_result1")
        for row in reader:
            no_of_result1_peaks = float(row[no_of_result1_peaks_index])
            no_of_result2_peaks = float(row[no_of_result2_peaks_index])
            no_of_result1_peaks_in_result2 = float(row[no_of_result1_peaks_in_result2_index])
            no_of_result2_peaks_in_result1 = float(row[no_of_result2_peaks_in_result1_index])
            no_of_result1_peaks_ls.append(no_of_result1_peaks)
            no_of_result2_peaks_ls.append(no_of_result2_peaks)
            fraction_of_result1_peaks_in_result2_ls.append(no_of_result1_peaks_in_result2/no_of_result1_peaks)
            fraction_of_result2_peaks_in_result1_ls.append(no_of_result2_peaks_in_result1/no_of_result2_peaks)
            no_of_combined_peaks_ls.append(no_of_result1_peaks + no_of_result2_peaks)
            fraction_of_overlap_in_combined_peaks_ls.append((no_of_result1_peaks_in_result2 + no_of_result2_peaks_in_result1)/
                (no_of_result1_peaks + no_of_result2_peaks))
        del reader

    title = "%s pairs"%(len(fraction_of_result1_peaks_in_result2_ls))
    if len(fraction_of_result1_peaks_in_result2_ls) > 10:
        medianFraction = numpy.median(fraction_of_result1_peaks_in_result2_ls)
        title += " median %.3f"%(medianFraction)
    yh_matplotlib.drawHist(fraction_of_result1_peaks_in_result2_ls, title=title,
        xlabel_1D="fraction of result1 peaks in result2", xticks=None,
        outputFname="%s_hist_of_fraction_of_result1_peaks_in_result2.png"%self.outputFnamePrefix,
        min_no_of_data_points=20, needLog=False, dpi=200)

    title = "%s pairs"%(len(fraction_of_result2_peaks_in_result1_ls))
    if len(fraction_of_result2_peaks_in_result1_ls) > 10:
        medianFraction = numpy.median(fraction_of_result2_peaks_in_result1_ls)
        title += " median %.3f"%(medianFraction)
    yh_matplotlib.drawHist(fraction_of_result2_peaks_in_result1_ls, title=title,
        xlabel_1D="fraction of result2 peaks in result1", xticks=None,
        outputFname="%s_hist_of_fraction_of_result2_peaks_in_result1.png"%self.outputFnamePrefix,
        min_no_of_data_points=20, needLog=False, dpi=200)

    title = "%s pairs"%(len(fraction_of_overlap_in_combined_peaks_ls))
    if len(fraction_of_overlap_in_combined_peaks_ls) > 10:
        medianFraction = numpy.median(fraction_of_overlap_in_combined_peaks_ls)
        title += " median %.3f"%(medianFraction)
    yh_matplotlib.drawHist(fraction_of_overlap_in_combined_peaks_ls, title=title,
        xlabel_1D="fraction of recurrent peaks in combined", xticks=None,
        outputFname="%s_hist_of_fraction_of_recurrent_peaks_in_combined.png"%self.outputFnamePrefix,
        min_no_of_data_points=20, needLog=False, dpi=200)

    title = "%s results"%(len(no_of_result1_peaks_ls))
    yh_matplotlib.drawScatter(no_of_result1_peaks_ls, no_of_result2_peaks_ls,
        fig_fname="%s_no_of_peaks_result1_vs_result2.png"%self.outputFnamePrefix,
        title=title, xlabel='No. of peaks in result1',
        ylabel='No. of peaks in result2', dpi=300)

    title = "%s results"%(len(no_of_result1_peaks_ls))
    yh_matplotlib.drawScatter(no_of_result1_peaks_ls, fraction_of_result1_peaks_in_result2_ls,
        fig_fname="%s_result1_no_of_peak_vs_fraction.png"%self.outputFnamePrefix,
        title=title, xlabel='No. of peaks in result1',
        ylabel='Fraction found in result2', dpi=300)

    title = "%s results"%(len(no_of_result2_peaks_ls))
    yh_matplotlib.drawScatter(no_of_result2_peaks_ls, fraction_of_result2_peaks_in_result1_ls,
        fig_fname="%s_result2_no_of_peak_vs_fraction.png"%self.outputFnamePrefix,
        title=title, xlabel='No. of peaks in result2',
        ylabel='Fraction found in result1', dpi=300)

    title = "%s pairs"%(len(fraction_of_result1_peaks_in_result2_ls))
    yh_matplotlib.drawScatter(fraction_of_result1_peaks_in_result2_ls, fraction_of_result2_peaks_in_result1_ls,
        fig_fname="%s_1_fraction_in2_vs_2_fraction_in1.png"%self.outputFnamePrefix,
        title=title, xlabel='result1 fraction found in result2',
        ylabel='result2 fraction found in result1', dpi=300)

    title = "%s pairs"%(len(no_of_combined_peaks_ls))
    yh_matplotlib.drawScatter(no_of_combined_peaks_ls, fraction_of_overlap_in_combined_peaks_ls,
        fig_fname="%s_combined_no_of_peak_vs_fraction.png"%self.outputFnamePrefix,
        title=title, xlabel='No. of peaks combined',
        ylabel='Fraction recurrent', dpi=300)
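# Worked example of the recurrence fraction computed above (counts hypothetical):
# result1 has 100 peaks and result2 has 80; 60 of result1's peaks lie in result2
# and 55 of result2's lie in result1, so the combined recurrent fraction is
#   (60 + 55) / (100 + 80) = 115 / 180 = 0.639 (approximately)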
def traverse(self):
    """
    2012.8.10 handle self.noHeader
    2012.1.9
    """
    newHeader = []
    key2dataLs = {}	#key is the keyColumn; dataLs corresponds to the sum of each column from valueColumnLs
    delimiter = None
    noOfDataColumnsFromPriorFiles = 0
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            if self.exitNonZeroIfAnyInputFileInexistent:
                sys.exit(3)
            else:
                continue
        reader = None
        try:
            inputFile = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inputFile)
            reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
        valueColumnLs = []
        try:
            header = reader.next()
            self.handleNewHeader(header, newHeader, self.keyColumnLs, valueColumnLs, keyColumnSet=self.keyColumnSet)
            if self.noHeader:	#2012.8.10 the first line was data, not a header; rewind
                inputFile.seek(0)
                reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
        except:	#in case something goes wrong (i.e. the file is empty)
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
        if reader is not None and valueColumnLs:
            visitedKeySet = set()
            for row in reader:
                try:
                    self.handleValueColumns(row, key2dataLs=key2dataLs, keyColumnLs=self.keyColumnLs,
                        valueColumnLs=valueColumnLs, noOfDataColumnsFromPriorFiles=noOfDataColumnsFromPriorFiles,
                        visitedKeySet=visitedKeySet)
                except:	#in case something goes wrong with this row
                    sys.stderr.write('Ignore this row: %s.\n'%repr(row))
                    sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
                    import traceback
                    traceback.print_exc()
            del reader
            #append empty data for keys that are absent from the current file,
            #so every key keeps one slot per value column per input file
            totalKeySet = set(key2dataLs.keys())
            unvisitedKeySet = totalKeySet - visitedKeySet
            for key in unvisitedKeySet:
                for i in valueColumnLs:
                    key2dataLs[key].append('')
            noOfDataColumnsFromPriorFiles += len(valueColumnLs)
    if self.noHeader:	#2012.8.10
        newHeader = None
    returnData = PassingData(key2dataLs=key2dataLs, delimiter=delimiter, header=newHeader)
    return returnData
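# Hedged illustration of the padding step above (a hypothetical two-file run,
# one value column per file): after file1, key2dataLs == {('Chr1',): [10],
# ('Chr2',): [7]}. If file2 only contains Chr1, its value is appended for that
# key while Chr2 is padded with '':
#   {('Chr1',): [10, 4], ('Chr2',): [7, '']}
# so the data columns stay aligned across files.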
def putGeneListIntoDb(self, input_fname, list_type_id, list_type_name, gene_symbol2gene_id_set, db, skip_1st_line=False):
    """
    2009-10-18
        If the first column (gene symbol) is full of numbers, it's taken as a legitimate Gene ID.
    2009-2-4
        use getGeneIDSetGivenAccVer() from pymodule.utils to find gene_id_set
    2009-1-8
        add option skip_1st_line
        stop using csv.reader, use a raw file handler instead.
        figureOutDelimiter() is modified not to use csv.Sniffer() by default;
        it'll return delimiter None if the file is single-column.
    2008-12-11
        more filtering:
            1. strip the original_name
            2. pick alphanumeric characters out of original_name
        if GeneListType is already in db, check whether GeneList already has this gene.
    2008-11-20
        use figureOutDelimiter() to get the delimiter automatically
    2008-07-15
        if list_type_name is given, forget about list_type_id. the program first searches
        the db for the given list_type_name; if the search fails, it creates a new entry.
    2008-07-15
        use gene_id2original_name to avoid redundancy in the gene list
    """
    import csv, sys, os
    session = db.session
    delimiter = figureOutDelimiter(input_fname)	#2008-11-20
    inf = open(input_fname)
    if skip_1st_line:
        inf.next()	#skip the 1st line
    counter = 0
    success_counter = 0
    gene_id2original_name = {}	#to avoid redundancy in the gene list
    for line in inf:
        if line == '\n':	#skip empty lines
            continue
        row = line.split(delimiter)
        original_name = row[0].strip()	#2008-12-11 remove spaces/tabs at the beginning/end
        all_number_p_search_result = self.all_number_p.search(original_name)
        if all_number_p_search_result:
            #2009-10-18 original_name is full of numbers: a legitimate Gene ID
            gene_id = int(all_number_p_search_result.group(0))
            gene_id_set = set([gene_id])
        else:
            gene_id_set = getGeneIDSetGivenAccVer(original_name, gene_symbol2gene_id_set)
        if gene_id_set is None:
            sys.stderr.write("Linking to gene id failed for %s. No such in gene_symbol2gene_id_set.\n"%(original_name))
        elif len(gene_id_set) == 1:
            gene_id = list(gene_id_set)[0]
            if gene_id not in gene_id2original_name:
                gene_id2original_name[gene_id] = original_name
            success_counter += 1
        elif len(gene_id_set) > 1:
            sys.stderr.write("Too many gene_ids for %s: %s.\n"%(original_name, gene_id_set))
        elif len(gene_id_set) == 0:
            sys.stderr.write("Linking to gene id failed for %s. gene_id_set is empty.\n"%(original_name))
        else:
            sys.stderr.write("Not supposed to happen: original_name=%s, gene_id_set=%s.\n"%(original_name, gene_id_set))
        counter += 1
    del inf
    if list_type_name:	#if the short name is given, forget about list_type_id
        glt = GeneListType.query.filter_by(short_name=list_type_name).first()	#search the db first
        if not glt:
            glt = GeneListType(short_name=list_type_name)
            session.save(glt)
            session.flush()
    else:	#use list_type_id to fetch it
        glt = GeneListType.get(list_type_id)
    glt.original_filename = input_fname	#save the filename
    session.save_or_update(glt)
    for gene_id, original_name in gene_id2original_name.iteritems():
        if glt.id:
            #2008-12-11 GeneListType is already in db; check whether GeneList already has this gene
            rows = GeneList.query.filter_by(gene_id=gene_id).filter_by(list_type_id=glt.id)
            if rows.count() > 0:
                sys.stderr.write("Gene: %s (%s) already with list type %s.\n"%(gene_id, original_name, glt.short_name))
                continue
        gl = GeneList(gene_id=gene_id, list_type=glt, original_name=original_name)
        session.save(gl)
    sys.stderr.write("%s/%s linked successfully.\n"%(success_counter, counter))
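# Hedged example of the first-column handling above (the exact self.all_number_p
# pattern and the lookup-table contents are assumptions): an all-digit symbol is
# taken as a raw Gene ID, anything else goes through symbol lookup.
#
#   '839371'    -> gene_id_set == set([839371])
#   'AT1G01010' -> gene_id_set == getGeneIDSetGivenAccVer('AT1G01010', gene_symbol2gene_id_set)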
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()

    #input columns: ['trio_set', 'chromosome', 'pos', 'depthOfFather', 'depthOfMother', 'depthOfChild', 'isInconsistent']
    #key is (chr, pos); value is [noOfInconsistencyInTrio, noOfTotalInTrio, noOfInconsistencyInDuo, noOfTotalInDuo]
    chr_pos2inconsistentData = {}
    sys.stderr.write("Reading from %s files ...\n"%(len(self.inputFnameLs)))
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            continue
        reader = None
        trioSetStrIndex = None
        chromosomeIndex = None
        posIndex = None
        isInconsistentIndex = None
        try:
            inputFile = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inputFile)
            reader = csv.reader(inputFile, delimiter=delimiter)
            header = reader.next()
            col_name2index = getColName2IndexFromHeader(header)
            trioSetStrIndex = col_name2index.get("#trio_set")
            chromosomeIndex = col_name2index.get("chromosome")
            posIndex = col_name2index.get("pos")
            isInconsistentIndex = col_name2index.get("isInconsistent")
        except:
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            import traceback
            traceback.print_exc()
        if reader is not None and isInconsistentIndex is not None:
            for row in reader:
                trio_set_str = row[trioSetStrIndex]
                chromosome = row[chromosomeIndex]
                pos = int(row[posIndex])
                isInconsistent = int(row[isInconsistentIndex])
                chr_pos = (chromosome, pos)
                if chr_pos not in chr_pos2inconsistentData:
                    chr_pos2inconsistentData[chr_pos] = [0, 0, 0, 0]
                if trio_set_str.find("0") == 0 or trio_set_str.find(",0") != -1:
                    #one member is coded as "0" => one parent is missing => it's a duo
                    chr_pos2inconsistentData[chr_pos][2] += isInconsistent
                    chr_pos2inconsistentData[chr_pos][3] += 1
                else:	#it's a trio
                    chr_pos2inconsistentData[chr_pos][0] += isInconsistent
                    chr_pos2inconsistentData[chr_pos][1] += 1
    sys.stderr.write("Done.\n")

    sys.stderr.write("Outputting ...")
    writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
    writer.writerow(['#chromosome', 'pos', 'noOfInconsistencyInTrio', 'noOfTotalInTrio', 'inconsistencyRateInTrio',
        'noOfInconsistencyInDuo', 'noOfTotalInDuo', 'inconsistencyRateInDuo'])
    chr_pos_ls = chr_pos2inconsistentData.keys()
    chr_pos_ls.sort()
    for chr_pos in chr_pos_ls:
        chromosome, pos = chr_pos
        noOfInconsistencyInTrio, noOfTotalInTrio, noOfInconsistencyInDuo, noOfTotalInDuo = chr_pos2inconsistentData.get(chr_pos)
        if noOfTotalInTrio > 0:
            inconsistencyRateInTrio = noOfInconsistencyInTrio/float(noOfTotalInTrio)
        else:
            inconsistencyRateInTrio = -1	#-1 marks "no data"
        if noOfTotalInDuo > 0:
            inconsistencyRateInDuo = noOfInconsistencyInDuo/float(noOfTotalInDuo)
        else:
            inconsistencyRateInDuo = -1
        writer.writerow([chromosome, pos, noOfInconsistencyInTrio, noOfTotalInTrio, inconsistencyRateInTrio,
            noOfInconsistencyInDuo, noOfTotalInDuo, inconsistencyRateInDuo])
    del writer
    sys.stderr.write("Done.\n")
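# Hedged examples of the duo test above (assumes a missing parent is coded as a
# literal "0" in the comma-separated trio string; member IDs are hypothetical):
#   "0,1995025,2001039"       -> duo  (starts with "0")
#   "1995025,0,2001039"       -> duo  (contains ",0")
#   "1995025,1997002,2001039" -> trio
# Note the test would also fire on any member ID that happens to begin with 0.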
def get_snp_pair2value_type(self, boolean_pair_fname, gene_id_set=None):
    """
    2009-1-20
        add argument gene_id_set, to limit input to only those genes
        report the progress
    2008-11-25
    """
    sys.stderr.write("Getting snp_pair2value_type ...\n")
    snp_pair2value_type = {}
    reader = csv.reader(open(boolean_pair_fname), delimiter=figureOutDelimiter(boolean_pair_fname))
    reader.next()	#skip the header
    min_value = None
    max_value = None
    counter = 0
    real_counter = 0
    for row in reader:
        snp1_id, gene1_id, snp2_id, gene2_id, bool_type, pvalue, count1, count2 = row[:8]
        counter += 1
        if not snp2_id:
            snp2_id = snp1_id
            continue	#2008-11-26 skip the row if it's a pvalue from a single SNP
        gene1_id = int(gene1_id)
        if not gene2_id:
            gene2_id = gene1_id
        else:
            gene2_id = int(gene2_id)
        pvalue = float(pvalue)
        if pvalue == 0:	#cap -log10(0) at 15 and flag it via bool_type
            pvalue = 15
            bool_type = -1
        else:
            pvalue = -math.log10(pvalue)
        value = pvalue
        if min_value is None:
            min_value = value
        elif value < min_value:
            min_value = value
        if max_value is None:
            max_value = value
        elif value > max_value:
            max_value = value
        if gene_id_set is not None and (gene1_id not in gene_id_set or gene2_id not in gene_id_set):
            continue
        real_counter += 1
        snp1_id = snp1_id.split('_')
        snp1_id = map(int, snp1_id)
        if len(snp1_id) == 2:	#pad a (chr, pos) id with a trailing 0
            snp1_id.append(0)
        snp2_id = snp2_id.split('_')
        snp2_id = map(int, snp2_id)
        if len(snp2_id) == 2:
            snp2_id.append(0)
        snp_pair = [tuple(snp1_id), tuple(snp2_id)]
        snp_pair.sort()	#order the two SNPs so (a, b) and (b, a) share one key
        snp_pair = tuple(snp_pair)
        if bool_type:
            bool_type = int(bool_type)
        else:
            bool_type = 0
        if snp_pair not in snp_pair2value_type:
            snp_pair2value_type[snp_pair] = (pvalue, bool_type)
        elif pvalue > snp_pair2value_type[snp_pair][0]:	#only keep the maximum
            snp_pair2value_type[snp_pair] = (pvalue, bool_type)
        sys.stderr.write("%s%s\t%s"%('\x08'*40, counter, real_counter))
    del reader
    sys.stderr.write("Done.\n")
    return snp_pair2value_type, min_value, max_value
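# Hedged example of the key normalization above ('chr_pos'-style IDs assumed):
# '4_268809' and '1_657' are split into int tuples, padded to 3 elements, and
# sorted, so either input order yields the same dictionary key:
#   ((1, 657, 0), (4, 268809, 0))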
def run(self):
    """
    2010-5-25
    """
    if self.debug:	#for one-node testing purpose
        import pdb
        pdb.set_trace()
    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    free_computing_nodes = range(1, self.communicator.size - 1)	#exclude the 1st and the last node
    free_computing_node_set = Set(free_computing_nodes)
    output_node_rank = self.communicator.size - 1

    # 2010-5-25 to hold the final data
    array_id2no_of_blocks_returned = {}
    array_id2col_index2intensity_ls = {}

    if node_rank == 0:
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
            hostname=self.hostname, database=self.dbname, schema=self.schema)
        db.setup(create_tables=False)
        reader = csv.reader(open(self.input_fname), delimiter=figureOutDelimiter(self.input_fname))
        probe_id_ls = reader.next()[2:]
        probe_id_ls = map(int, probe_id_ls)
        chr_ls = reader.next()[2:]
        pos_ls = reader.next()[2:]
        chr_pos_ls = zip(chr_ls, pos_ls)
        commonData = self.prepareCommonData(db, self.blockSize, self.jumpStep,
            self.x_range, self.y_range, self.minNoOfProbesPerBlock,
            array_file_directory=self.array_file_directory, probe_id_ls=probe_id_ls,
            chr_pos_ls=chr_pos_ls, probeType=2,
            probes_blockData_picklef=self.probes_blockData_picklef)
        #must come after prepareCommonData()
        param_ls = self.generate_params(reader, blockDataCodedIndex_ls=commonData.blockDataCodedIndex_ls,
            ref_array_id_set=self.ref_array_id_set)
        refDataMatrix = self.readInRefArrayData(self.input_fname, ref_array_id_set=self.ref_array_id_set)
        if self.communicator.size == 1:	#single-node serial run
            blockDataCodedIndex_ls = commonData.blockDataCodedIndex_ls
            output_dir = os.path.split(self.output_fname)[0]
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t')
            probe_id_ls = commonData.probe_id_ls
            chr_pos_ls = commonData.chr_pos_ls
            self.writeHeader(writer, probe_id_ls, chr_pos_ls)
            for array_id, ecotype_id, blockIndex, blockIntensity_ls in param_ls:
                result_ls = self.ltsOneBlockAgainstAllRef(array_id, ecotype_id, blockIndex, blockIntensity_ls,
                    refDataMatrix, blockDataCodedIndex_ls)
                self.handleComputingOutputData(result_ls, blockDataCodedIndex_ls, array_id2no_of_blocks_returned,
                    array_id2col_index2intensity_ls, writer)
            sys.exit(0)
        commonData_pickle = cPickle.dumps(commonData, protocol=-1)
        sys.stderr.write("Passing data to output node %s from %s ... "%(output_node_rank, node_rank))
        self.communicator.send(commonData_pickle, output_node_rank, 0)
        sys.stderr.write(".\n")
        refDataMatrix_pickle = cPickle.dumps(refDataMatrix, protocol=-1)
        for node in free_computing_nodes:	#send it to each computing node
            sys.stderr.write("Passing initial data from node %s to %s ... "%(node_rank, node))
            self.communicator.send(commonData_pickle, node, 0)
            self.communicator.send(refDataMatrix_pickle, node, 0)
            sys.stderr.write(".\n")
        if len(commonData.blockDataCodedIndex_ls) == 0:
            sys.stderr.write("Not a single block is formed. Exit!")
            sys.exit(0)
        del commonData, commonData_pickle, refDataMatrix, refDataMatrix_pickle
    elif node_rank in free_computing_node_set:
        data, source, tag = self.communicator.receiveString(0, 0)
        commonData = cPickle.loads(data)
        if len(commonData.blockDataCodedIndex_ls) == 0:
            sys.stderr.write("Not a single block is formed. Exit!")
            sys.exit(0)
        blockDataCodedIndex_ls = commonData.blockDataCodedIndex_ls
        del data, commonData
        data, source, tag = self.communicator.receiveString(0, 0)
        refDataMatrix = cPickle.loads(data)
        del data
    else:
        data, source, tag = self.communicator.receiveString(0, 0)
        commonData = cPickle.loads(data)
        if len(commonData.blockDataCodedIndex_ls) == 0:
            sys.stderr.write("Not a single block is formed. Exit!")
            sys.exit(0)
        probe_id_ls = commonData.probe_id_ls
        chr_pos_ls = commonData.chr_pos_ls
        blockDataCodedIndex_ls = commonData.blockDataCodedIndex_ls
        del data, commonData

    self.synchronize()
    if node_rank == 0:
        param_obj = PassingData(param_ls=param_ls, output_node_rank=output_node_rank, report=self.report, counter=0)
        self.inputNode(param_obj, free_computing_nodes, param_generator=param_ls, message_size=self.message_size)
    elif node_rank in free_computing_node_set:
        computing_parameter_obj = PassingData(refDataMatrix=refDataMatrix,
            blockDataCodedIndex_ls=blockDataCodedIndex_ls)
        self.computing_node(computing_parameter_obj, self.computing_node_handler, output_node_rank=output_node_rank)
    else:
        output_dir = os.path.split(self.output_fname)[0]
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t')
        self.writeHeader(writer, probe_id_ls, chr_pos_ls)
        output_param_obj = PassingData(writer=writer,
            array_id2no_of_blocks_returned=array_id2no_of_blocks_returned,
            array_id2col_index2intensity_ls=array_id2col_index2intensity_ls,
            blockDataCodedIndex_ls=blockDataCodedIndex_ls)
        self.output_node(free_computing_nodes, output_param_obj, self.output_node_handler)
        del writer
    self.synchronize()	#to avoid some nodes exiting early
def putQCIntoDB(self, session, input_fname, no_of_lines_to_skip, data_source_obj, cnv_type_obj, cnv_method_obj=None,
        run_type=1, original_id=None, version=1):
    """
    2009-10-28
    """
    sys.stderr.write("Putting QC data into database ... \n")
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    input_file_basename = os.path.basename(input_fname)
    if run_type == 7:
        #2010-6-14 skip the first 4 lines (3 comment lines + 1 header) for a nucmer coords file
        no_of_lines_to_skip = 4
    elif run_type == 8:
        #skip 2 lines (1 comment line + 1 header) for breakdancer output from Quan Long
        no_of_lines_to_skip = 2
    col_name2index = None
    header = None
    for i in range(no_of_lines_to_skip):
        header = reader.next()	#the last skipped line will be the header
    if col_name2index is None and header is not None:
        col_name2index = getColName2IndexFromHeader(header)
    counter = 0
    for row in reader:
        if run_type == 1:
            db_obj = self.generateCNVQCCallObjFromClark2007(session, row, data_source_obj, cnv_type_obj, cnv_method_obj)
        elif run_type == 2:
            db_obj = self.generateCNVQCCallObjFromSchneebergerOssowski(session, row, data_source_obj, cnv_type_obj, cnv_method_obj)
        elif run_type == 3:
            db_obj = self.generateCNVQCCallObjFromBobSchmitzData(session, row, data_source_obj, cnv_type_obj,
                cnv_method_obj, original_id=original_id)
        elif run_type == 4:
            db_obj = self.generateCNVQCCallObjFromLerContigDerivedCNVs(session, row, data_source_obj, cnv_type_obj,
                cnv_method_obj=cnv_method_obj, original_id=original_id, col_name2index=col_name2index)
        elif run_type == 5:
            db_obj = self.generateSequenceFragmentRefPosObjFromLerContigSpansOverCol(session, row, data_source_obj, cnv_type_obj,
                cnv_method_obj=cnv_method_obj, original_id=original_id, col_name2index=col_name2index, version=version)
        elif run_type == 6:
            db_obj = self.generateSequenceFragment2ProbeObj(session, row, data_source_obj, cnv_type_obj,
                cnv_method_obj=cnv_method_obj, original_id=original_id, col_name2index=col_name2index)
        elif run_type == 7:
            db_obj = self.generateSequenceFragmentRefPosObjFromNucmerLerContigSpansOverCol(session, row, data_source_obj,
                cnv_type_obj, cnv_method_obj=cnv_method_obj, original_id=original_id, col_name2index=col_name2index,
                version=version, comment=input_file_basename)
        elif run_type == 8:
            db_obj = self.generateCNVQCCallObjFromQuanLongBreakDancerOutput(session, row, data_source_obj,
                cnv_type_obj, cnv_method_obj=cnv_method_obj, original_id=original_id, col_name2index=col_name2index)
        elif run_type == 9:
            db_obj = self.generateCNVQCCallObjFromQuanLongCoverageDerived(session, row, data_source_obj,
                cnv_type_obj=cnv_type_obj, cnv_method_obj=cnv_method_obj, col_name2index=col_name2index)
        else:
            sys.stderr.write("Run type %s not supported.\n"%run_type)
            db_obj = None	#avoid a NameError below on unsupported run types
        if db_obj:
            session.add(db_obj)
            session.flush()
        counter += 1
        if counter % 5000 == 0:
            sys.stderr.write("%s%s"%('\x08'*40, counter))
    session.flush()
    sys.stderr.write("%s records. Done.\n"%counter)
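# Hedged usage sketch (the filename and original_id are hypothetical): loading
# a nucmer coords file, where run_type=7 overrides no_of_lines_to_skip to 4
# internally, so the value passed in does not matter:
#
#   self.putQCIntoDB(session, 'ler_vs_col.coords', 0, data_source_obj,
#       cnv_type_obj, cnv_method_obj=cnv_method_obj, run_type=7,
#       original_id='Ler-1', version=1)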
def run(self):
    """
    2008-12-02
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    if self.array_id_2nd_column:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
            data_matrix=data_matrix)
    else:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
            data_matrix=data_matrix)	#ignore category_list
    newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

    if self.phenotype_fname and self.phenotype_method_id:
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
        #row labels come from the SNP matrix because the phenotype matrix is going to be re-ordered that way
        phenData = SNPData(header=header_phen, strain_acc_list=newSnpData.strain_acc_list, data_matrix=data_matrix_phen)
        #tricky: using strain_acc_list_phen
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(newSnpData.row_id_ls,
            strain_acc_list_phen, phenData.data_matrix)
        phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id]))[0]
        phenotype_label = phenData.col_id_ls[phenotype_col_index]
        phenotype_f = open('%s_%s.pheno'%(self.output_fname_prefix, phenotype_label.replace('/', '_')), 'w')
        for phenotype_value in phenData.data_matrix[:, phenotype_col_index]:
            if self.phenotype_is_binary:	#binary and non-binary have different NA designators
                if numpy.isnan(phenotype_value):
                    phenotype_value = 9
                else:
                    phenotype_value = int(phenotype_value)
            else:
                if numpy.isnan(phenotype_value):
                    phenotype_value = -100.0
            phenotype_f.write('%s\n'%phenotype_value)
        del phenotype_f

    genotype_f = open('%s.geno'%self.output_fname_prefix, 'w')
    ind_writer = csv.writer(open('%s.ind'%self.output_fname_prefix, 'w'), delimiter='\t')
    snp_writer = csv.writer(open('%s.snp'%self.output_fname_prefix, 'w'), delimiter='\t')

    #transpose it so rows are SNPs and columns are accessions
    newSnpData = transposeSNPData(newSnpData)

    no_of_rows = len(newSnpData.data_matrix)
    no_of_cols = len(newSnpData.data_matrix[0])
    for i in range(no_of_rows):
        snp_id = newSnpData.row_id_ls[i]
        chr, pos = snp_id.split('_')
        allele1 = allele_index2allele_ls[i][0]	#major allele
        allele2 = allele_index2allele_ls[i][1]	#minor allele
        snp_writer.writerow([snp_id, chr, 0.0, pos, allele1, allele2])
        geno_line = ''
        for j in range(no_of_cols):
            if i == 0:	#write out the accessions
                ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])
            allele = newSnpData.data_matrix[i][j]
            if allele == 0:
                geno_line += '0'
            elif allele == 1:
                geno_line += '2'
            else:
                geno_line += '9'
        geno_line += '\n'
        genotype_f.write(geno_line)
    del genotype_f, ind_writer, snp_writer
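# Hedged note on the encoding above: the .geno/.ind/.snp trio matches the
# EIGENSTRAT file layout, and, assuming convert2Binary() codes the major allele
# as 0 and the minor as 1, the per-character genotype output is:
#   binary allele 0 -> '0'  (homozygous major)
#   binary allele 1 -> '2'  (homozygous minor)
#   anything else   -> '9'  (missing)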
def predictALLSegments(self, input_fname, array_id2model_array_id_ls, array_id2model,
        max_amplitude=-0.1, param_obj=None):
    """
    2010-7-25 handle the situation that an array has >=3 model arrays
    2010-7-1
    """
    sys.stderr.write('Predicting for all segments from %s ... \n'%(input_fname))
    reader = csv.reader(open(input_fname), delimiter=figureOutDelimiter(input_fname))
    header = reader.next()
    col_name2index = getColName2IndexFromHeader(header)
    median_col_index = col_name2index.get('median')
    ecotype_id_idx = col_name2index.get('ecotype_id', col_name2index.get('array_id'))
    counter = 0
    no_of_segments_in_model = 0
    no_of_predicted_deletions = 0
    for row in reader:
        counter += 1
        amplitude = float(row[col_name2index['amplitude']])
        if amplitude > max_amplitude:	#not low enough to be a deletion candidate
            continue
        cnv_ecotype_id = int(row[ecotype_id_idx])
        array_id = int(row[col_name2index.get('array_id')])
        if array_id not in array_id2model_array_id_ls:
            continue
        no_of_probes = int(row[col_name2index['length']])
        start_probe = row[col_name2index['start_probe']].split('_')	#split chr_pos
        start_probe = map(int, start_probe)
        start_probe_id = row[col_name2index['start_probe_id']]
        stop_probe = row[col_name2index['end_probe']].split('_')
        stop_probe = map(int, stop_probe)
        stop_probe_id = row[col_name2index['end_probe_id']]
        segment_chromosome = start_probe[0]
        if start_probe[0] != stop_probe[0]:	#spurious: start/stop probes on different chromosomes
            continue
        #extend 12bp on each side of the probe centers (presumably half of a 25bp tiling probe)
        segment_start_pos = start_probe[1] - 12
        segment_stop_pos = stop_probe[1] + 12
        segment_length = abs(segment_stop_pos - segment_start_pos + 1)
        if median_col_index is not None:
            median_intensity = float(row[median_col_index])
        else:
            median_intensity = None
        cnv_segment_obj = PassingData(ecotype_id=cnv_ecotype_id, start_probe=start_probe, stop_probe=stop_probe,
            no_of_probes=no_of_probes, amplitude=amplitude, segment_length=segment_length,
            segment_chromosome=segment_chromosome, array_id=array_id,
            start_probe_id=start_probe_id, stop_probe_id=stop_probe_id,
            segment_start_pos=segment_start_pos, segment_stop_pos=segment_stop_pos,
            median_intensity=median_intensity)
        model_array_id_ls = array_id2model_array_id_ls.get(array_id)
        no_of_segments_in_model += 1
        label_predicted, label_predicted2probability = self.predictOneSegmentByMultipleModels(cnv_segment_obj,
            model_array_id_ls, array_id2model)
        if label_predicted == -1:	#predicted to be a deletion
            cnv_segment_obj.probability = label_predicted2probability[-1]
            cnv_segment_obj.comment = 'model arrays: %s'%(repr(model_array_id_ls)[1:-1])
            self.saveSegmentObj(param_obj, cnv_segment_obj)
            no_of_predicted_deletions += 1
            if no_of_predicted_deletions % 5000 == 0:
                sys.stderr.write('%s%s\t%s\t%s'%('\x08'*100, counter, no_of_segments_in_model, no_of_predicted_deletions))
    sys.stderr.write('%s%s\t%s\t%s\n'%('\x08'*100, counter, no_of_segments_in_model, no_of_predicted_deletions))
    sys.stderr.write('%s out of %s segments were used in prediction. %s predicted deletions.\n'%
        (no_of_segments_in_model, counter, no_of_predicted_deletions))
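# Worked example of the span arithmetic above (probe positions hypothetical):
# start_probe '2_101' and end_probe '2_301' give chromosome 2 and
#   segment_start_pos = 101 - 12 = 89
#   segment_stop_pos  = 301 + 12 = 313
#   segment_length    = 313 - 89 + 1 = 225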