def buildIntervalTree(bed_file):
    '''
    Build one interval tree per chromosome from a BED file.

    Only lines with at least 4 whitespace-separated columns are used; the 4th
    column is stored as the value of each interval. Lines starting with
    "track", "#", "browser" or "Chrom" are skipped, and so are records with a
    negative coordinate or with start larger than end.

    Returns a dict mapping chromosome name to its Intersecter.
    '''
    skipped_prefixes = ("track", "#", 'browser', 'Chrom')
    trees = {}
    printlog("reading " + bed_file + '...')
    for record in ireader.reader(bed_file):
        if record.startswith(skipped_prefixes):
            continue
        cols = record.rstrip('\n ').split()
        if len(cols) < 4:
            continue
        chrom = cols[0]
        start = int(cols[1])
        end = int(cols[2])
        value = cols[3]
        if start < 0 or end < 0 or start > end:
            continue
        # Create the per-chromosome tree on first sight, then insert.
        if chrom not in trees:
            trees[chrom] = Intersecter()
        trees[chrom].add_interval(Interval(start, end, value=value))
    return trees
def chrom_count(infile):
    '''
    Count chromosome frequencies from a BED file.

    Lines starting with "#", "track" or "browser" are ignored. A record is
    counted only if it has at least 3 columns, integer start/end, and
    start <= end; every rejected line is reported on stderr.

    Returns a collections.defaultdict(int) mapping chromosome -> record count.
    '''
    # Named "counts" so the local does not shadow the function itself.
    counts = collections.defaultdict(int)
    for l in ireader.reader(infile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        if len(f) < 3:
            print("BED has at lesat 3 columns. Skip: " + l, file=sys.stderr)
            continue
        try:
            start = int(f[1])
            end = int(f[2])
        except ValueError:
            # Only the integer conversion can fail here; a bare "except"
            # would also hide KeyboardInterrupt and programming errors.
            print("Not in valid BED format. Skip:" + l, file=sys.stderr)
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l,
                  file=sys.stderr)
            continue
        counts[f[0]] += 1
    return counts
def read_bed_as_list(bedfile):
    '''
    Read a BED file into a list of [chrom, start, end] records.

    The file should have at least 3 columns (Chrom, chromStart, chromEnd);
    any additional column (including strand) is ignored. Lines starting with
    "#", "track" or "browser" are skipped. Malformed lines (fewer than 3
    columns, non-integer coordinates, or start > end) are reported on stderr
    and skipped.
    '''
    lst = []
    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
        except (IndexError, ValueError):
            print("BED has at lesat 3 columns. Skip: " + l, file=sys.stderr)
            # Without this "continue" the values of the previous record (or
            # undefined names on the first line) would be appended below.
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l,
                  file=sys.stderr)
            continue
        lst.append([chrom, start, end])
    return lst
def main():
    """
    Command-line entry point: compute statistics of CpGs over genomic regions.

    Reads the CpG BED file (-i) with read_CpG_bed(), then for every region in
    the BED file (-r) appends the values returned by stats_over_range() to the
    region line and writes it to "<prefix>.txt" (-o). Regions whose
    coordinates are not integers are written with six "NA" columns.

    Exits with status 101/102/103 when -i/-r/-o is missing.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file", help="BED6 file specifying the C position. This BED file should have at least 6 columns (Chrom, ChromStart, ChromeEnd, Name, Beta_value, Strand). Note: the first base in a chromosome is numbered 0. This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
    parser.add_option("-r", "--region", action="store", type="string", dest="region_file", help="BED file specificy genomic regions. This BED file should have at least 3 columns (Chrom, ChromStart, ChromeEnd).")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="Prefix of output file.")
    (options, args) = parser.parse_args()

    print()
    required = (
        (options.input_file, 101),
        (options.region_file, 102),
        (options.out_file, 103),
    )
    for value, exit_code in required:
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    # step1: read CpG file
    printlog("Reading CpG file: \"%s\"" % (options.input_file))
    cpg_ranges = read_CpG_bed(options.input_file)

    # step2: read region file
    printlog("Reading BED file: \"%s\"" % (options.region_file))
    printlog("Writing to: \"%s\"" % (options.out_file + '.txt'))
    # The context manager closes the output even if a region fails midway.
    with open(options.out_file + '.txt', 'w') as FOUT:
        for l in ireader.reader(options.region_file):
            if l.startswith(('#', 'track', 'browser')):
                continue
            f = l.split()
            if len(f) < 3:
                continue
            try:
                chrom = f[0]
                st = int(f[1])
                end = int(f[2])
            except ValueError:
                # The original passed file=FOUT to str.join() (misplaced
                # parenthesis), which raised TypeError instead of writing NA.
                print(l + '\t' + '\t'.join(['NA'] * 6), file=FOUT)
                continue
            tmp = stats_over_range(cpg_ranges, chrom, st, end)
            print(l + '\t' + '\t'.join([str(i) for i in tmp]), file=FOUT)
def main():
    """
    Command-line entry point: convert between Beta-values and M-values.

    The first line of the input (-i) is copied unchanged; on every other line
    the first column is kept as the probe ID and each remaining cell is
    converted: Beta -> M as log2(beta / (1 - beta)), M -> Beta as
    2^M / (2^M + 1). Cells that cannot be converted become NaN.

    Exits with status 101/103 when a required option is missing and with
    status 1 when the data type (-d) is neither "Beta" nor "M".
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Tab-separated data frame file containing beta or M values with the 1st row containing sample IDs and the 1st column containing CpG IDs. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-d", "--dtype", action="store", type='string', dest="data_type", default="Beta", help="Input data type either \"Beta\" or \"M\". default=%default")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="The output file.")
    (options, args) = parser.parse_args()

    print()
    required = (
        (options.input_file, 101),
        (options.data_type, 101),
        (options.out_file, 103),
    )
    for value, exit_code in required:
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    # Validate the data type BEFORE creating the output file, so a bad -d
    # neither leaves an empty file behind nor leaks an open handle.
    dtype = options.data_type.lower()
    if dtype == "beta":
        printlog("Convert Beta-value file \"%s\" into M-value file \"%s\" ..." % (options.input_file, options.out_file))
    elif dtype == "m":
        printlog("Convert M-value file \"%s\" into Beta-value file \"%s\" ..." % (options.input_file, options.out_file))
    else:
        print("Data type must be \"Beta\" or \"M\"", file=sys.stderr)
        # This is an error path: report it with a non-zero status.
        sys.exit(1)

    with open(options.out_file, 'w') as FOUT:
        for line_num, l in enumerate(ireader.reader(options.input_file), start=1):
            if line_num == 1:
                print(l, file=FOUT)
                continue
            f = l.split()
            probe_ID = f[0]
            output_values = []
            for iv in f[1:]:
                try:
                    value = float(iv)
                    if dtype == "beta":
                        ov = np.log2(value / (1.0 - value))
                    else:
                        ov = (2 ** value) / (2 ** value + 1)
                except (ValueError, ArithmeticError):
                    # Non-numerical cell, beta == 1 (division by zero) or
                    # overflow of 2**M.
                    ov = np.nan
                output_values.append(ov)
            print(probe_ID + '\t' + '\t'.join([str(i) for i in output_values]), file=FOUT)
def getBasalDomains(bedfile, up=5000, down=1000, printit=False):
    '''
    Define gene's basal regulatory domain.

    bedfile: one gene one TSS (could use the canonical (longest) isoform, or
        merge all isoforms into a super transcript). At least 6 columns are
        read: chrom, start, end, symbol (4th) and strand (6th).
    up: size of extension to upstream of TSS
    down: size of extension to downstream of TSS
    printit: if True, also print each domain as a BED6 line to stdout.

    For "+" genes the domain is [start + 1 - up, start + 1 + down]; for "-"
    genes it is [end - down, end + up]. The domain start is clipped at 0.
    Malformed lines and lines whose strand is neither "+" nor "-" are
    reported on stderr and skipped.

    Returns a dict mapping chromosome name to an IntervalTree of domains.
    '''
    basal_ranges = {}
    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            strand = f[5]
        except (IndexError, ValueError):
            print("BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l,
                  file=sys.stderr)
            continue

        if chrom not in basal_ranges:
            basal_ranges[chrom] = IntervalTree()

        if strand == '+':
            basal_st = (start + 1) - up
            basal_end = (start + 1) + down
        elif strand == '-':
            basal_st = end - down
            basal_end = end + up
        else:
            # Previously such a line was silently not inserted, yet with
            # printit=True it printed the previous gene's coordinates (or
            # raised NameError on the first record).
            print("Invalid strand. Skip: " + l, file=sys.stderr)
            continue
        basal_st = max(0, basal_st)
        basal_ranges[chrom].insert_interval(
            Interval(basal_st, basal_end, strand=strand, value=symbol))

        if printit:
            print('\t'.join([
                str(i)
                for i in (chrom, basal_st, basal_end, symbol, '0', strand)
            ]), file=sys.stdout)
    return basal_ranges
def main():
    """
    Command-line entry point: append probe annotation to a data file.

    The annotation file (-a) is loaded with read_annotation(). Every line of
    the input file (-i) is written to "<prefix>.anno.txt" (-o) followed by the
    annotation of the probe ID found in column -p (0-based), or by one "NA"
    per annotation column when the probe is unknown. With -l the first input
    line is treated as a header and extended with the annotation header.

    Exit statuses: 101-103 missing option, 104/105 missing file, 1 when the
    probe column index is out of range for a line.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Input data file (Tab-separated) with a certain column containing 450K/850K array CpG IDs. This file can be regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-a", "--annotation", action="store", type="string", dest="anno_file", help="Annotation file. This file can be regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="Prefix of the output file.")
    parser.add_option("-p", "--probe_column", action="store", type='int', dest="probe_col", default=0, help="The number of column that contains probe IDs. Note: the column index starts with 0. default=%default.")
    parser.add_option("-l", "--header", action="store_true", dest="header", default=False, help="Input data file has a header row.")
    (options, args) = parser.parse_args()

    required = (
        (options.input_file, 101),
        (options.anno_file, 102),
        (options.out_file, 103),
    )
    for value, exit_code in required:
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    if not os.path.isfile(options.input_file):
        print("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.anno_file):
        # Report the annotation file here, not the input data file.
        print("Input annotation file \"%s\" does not exist\n" % options.anno_file)
        sys.exit(105)

    printlog("Read annotation file \"%s\" ..." % (options.anno_file))
    (header, data) = read_annotation(options.anno_file)

    printlog("Add annotation information to \"%s\" ..." % (options.input_file))
    missing_anno = '\t'.join(['NA'] * len(header))
    with open(options.out_file + '.anno.txt', 'w') as OUT:
        line_num = 0
        for l in ireader.reader(options.input_file):
            line_num += 1
            f = l.split()
            if line_num == 1 and options.header:
                print(l + '\t' + '\t'.join(header), file=OUT)
                continue
            if options.probe_col >= len(f):
                print("Error: column ID must be smaller than %d!" % len(f), file=sys.stderr)
                # Error path: exit with a non-zero status.
                sys.exit(1)
            cgid = f[options.probe_col]
            # Explicit lookup instead of a bare "except" around the print.
            print(l + '\t' + data.get(cgid, missing_anno), file=OUT)
def read_annotation(infile):
    """
    Load a probe annotation table.

    A line starting with "probeID" is taken as the header: its columns after
    the first become the returned header list. On every other line the first
    column is the probe ID and the remaining columns, re-joined with tabs,
    are its annotation string.

    Returns (header_columns, {probe_id: annotation_string}).
    """
    header_cols = []
    anno_by_probe = {}
    for record in ireader.reader(infile):
        cols = record.split()
        if record.startswith('probeID'):
            header_cols = cols[1:]
            continue
        anno_by_probe[cols[0]] = '\t'.join(cols[1:])
    return (header_cols, anno_by_probe)
def read_grp_file2(gfile):
    '''
    Read group file. Group file defines the biological groups of data matrix
    file.
    (1) It must have a header.
    (2) It must have at least two columns:
        * 1st column: sample names. Sample names should be unique, and they
          must be exactly the same as the first row of beta matrix file.
        * 2nd column: group IDs.
        * additional columns can be included to indicate co-variables.
    (3) Columns must be separated by ",".

    For example:

    sampleID,survival,Sex
    Normal_1,1,1
    Normal_2,1,2
    Normal_3,1,1
    Tumor_1,2,1
    Tumor_2,2,2
    Tumor_3,2,1
    ... ...

    All spaces are removed from each line before it is split. Blank lines
    are ignored. Values are kept as strings.

    Returns (samples, covar_names, covars) where covars[name][sample_id] is
    the value of variable "name" for that sample. Exits with status 1 when a
    line has fewer than 2 columns or when sample names are not unique.
    '''
    samples = []
    covar_names = []
    covars = collections.defaultdict(dict)
    line_num = 0
    for l in ireader.reader(gfile):
        l = l.replace(' ', '')
        if not l.strip():
            # A stray blank line should not abort the whole run.
            continue
        line_num += 1
        f = l.split(',')
        if len(f) < 2:
            print("Group fle has at lesat 2 columns!", file=sys.stderr)
            sys.exit(1)
        if line_num == 1:
            covar_names = f[1:]
            continue
        sample_id = f[0]
        samples.append(sample_id)
        for name, value in zip(covar_names, f[1:]):
            covars[name][sample_id] = value

    # Comparing sizes works for an empty sample list too, where
    # Counter.most_common(1)[0] would raise IndexError.
    if len(set(samples)) != len(samples):
        print("Sample names are not unique!", file=sys.stderr)
        sys.exit(1)
    return (samples, covar_names, covars)
def main():
    """
    Command-line entry point: annotate CpGs with overlapping annotation items.

    Builds an interval tree from the annotation file (-a, -w) and, for each
    input line with at least 3 columns, appends the comma-joined result of
    findIntervals() (or "N/A" when it is empty) and writes the line to
    "<prefix>.anno.txt". With -l the first such line is treated as a header
    and gets the annotation file's basename appended instead.
    """
    parser = OptionParser("%prog [options]" + "\n", version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Input CpG file in BED3+ format.")
    parser.add_option("-a", "--annotation", action="store", type="string", dest="anno_file", help="Input annotation file in BED3+ format.")
    parser.add_option("-w", "--window", action="store", type='int', dest="window_size", default=100, help="Size of window centering on the middle-point of each genomic region defined in the annotation BED file (i.e., window_size*0.5 will be extended to up- and down-stream from the middle point of each genomic region). default=%default")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="The prefix of the output file.")
    parser.add_option("-l", "--header", action="store_true", dest="header", default=False, help="If True, the first row of input CpG file is header. default=%default")
    (options, args) = parser.parse_args()

    # Each missing required option has its own exit status.
    required = (
        (options.input_file, 101),
        (options.out_file, 102),
        (options.anno_file, 103),
    )
    for value, exit_code in required:
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    tree = buildIntervalTree(options.anno_file, window_size=options.window_size)
    OUT = open(options.out_file + '.anno.txt', 'w')
    printlog("Reading CpG file: %s ..." % options.input_file)

    n_records = 0
    for line in ireader.reader(options.input_file):
        cols = line.split()
        if len(cols) < 3:
            continue
        n_records += 1
        if n_records == 1 and options.header:
            print(line + '\t' + basename(options.anno_file), file=OUT)
            continue
        overlaps = findIntervals(cols[0], int(cols[1]), int(cols[2]), tree)
        annotation = ','.join(overlaps) if len(overlaps) > 0 else 'N/A'
        print(line + '\t' + annotation, file=OUT)
    OUT.close()
def buildIntervalTree(bed_file, window_size=0):
    '''
    Build interval tree from annotation BED file.

    bed_file: BED file with at least 3 columns. Lines starting with "track",
        "#" or "browser", lines with fewer than 3 columns, and records with a
        negative coordinate or start > end are skipped.
    window_size: if > 0, each region is replaced by a window of this size
        centred on the region's middle point (window_size * 0.5 on each
        side), clipped so it never extends beyond the region itself. If 0 or
        negative, the whole region is used.

    The interval value is the 4th column when present, otherwise
    "chrom:start-end" built from the original coordinates.

    Returns a dict mapping chromosome name to its Intersecter.
    '''
    ranges = {}
    printlog("Build interval tree from annotation file: %s ..." % bed_file)
    for line in ireader.reader(bed_file):
        if line.startswith(("track", "#", 'browser')):
            continue
        fields = line.split()
        if len(fields) < 3:
            continue
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        if start < 0 or end < 0 or start > end:
            continue

        if window_size > 0:
            # Window around the middle position, clipped to the region.
            mid = int(start + (end - start) / 2.0)
            extension = int(window_size * 0.5)
            iv_start = max(start, mid - extension)
            iv_end = min(end, mid + extension)
        else:
            # The original referenced "mid" unconditionally, which raised
            # NameError for the default window_size=0.
            iv_start, iv_end = start, end

        if len(fields) >= 4:
            name = fields[3]
        else:
            name = fields[0] + ':' + fields[1] + '-' + fields[2]

        if chrom not in ranges:
            ranges[chrom] = Intersecter()
        # Insert the window: the original computed it but then inserted the
        # full region, so the window size never had any effect.
        ranges[chrom].add_interval(Interval(iv_start, iv_end, value=name))
    return ranges
def read_CpG_bed(cpgfile):
    '''
    Load CpG positions from a BED file into per-chromosome interval trees.

    cpgfile: CpG BED file with at least 3 columns (Chrom, chromStart,
        chromEnd).
        * The beta value is read from the 5th column; if that column is
          missing or is not a number, beta is set to 1.
        * The strand is read from the 6th column; if missing, "+" is used.
        * Additional columns are ignored.

    A "+" record is stored as the interval (start, end); a "-" record is
    stored as (end, end + 1). Records with any other strand value are not
    stored. Malformed lines are reported on stderr and skipped.

    Returns a dict mapping chromosome name to an IntervalTree whose interval
    values are the beta values.
    '''
    cpg_ranges = {}
    for l in ireader.reader(cpgfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        if len(f) < 3:
            # The check is for 3 columns, so the message says 3 (it used to
            # claim 6).
            print("BED has at lesat 3 columns. Skip: " + l, file=sys.stderr)
            continue
        chrom = f[0]
        try:
            start = int(f[1])
            end = int(f[2])
        except ValueError:
            # E.g. a column-header line; skip it instead of crashing.
            print("Not in valid BED format. Skip:" + l, file=sys.stderr)
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l,
                  file=sys.stderr)
            continue

        try:
            beta = float(f[4])
        except (IndexError, ValueError):
            beta = 1.0
        strand = f[5] if len(f) > 5 else '+'

        if chrom not in cpg_ranges:
            cpg_ranges[chrom] = IntervalTree()
        if strand == '+':
            cpg_ranges[chrom].insert_interval(Interval(start, end, value=beta))
        elif strand == '-':
            cpg_ranges[chrom].insert_interval(
                Interval(end, end + 1, value=beta))
    return cpg_ranges
def getBasalDomains(bedfile, printit=False):
    '''
    Define gene's basal regulatory domain.

    bedfile: one gene one TSS (could use the canonical (longest) isoform, or
        merge all isoforms into a super transcript). At least 6 columns are
        read: chrom, start, end, symbol (4th) and strand (6th).
    printit: if True, also print each stored interval as a BED6 line to
        stdout.

    In this variant the interval stored for each gene is simply
    (start, end) with the gene's strand and symbol. Malformed lines, lines
    with start > end and lines whose strand is not "+" or "-" are reported on
    stderr and skipped.

    Returns a dict mapping chromosome name to an IntervalTree.
    '''
    basal_ranges = {}
    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            gene_strand = f[5]
        except (IndexError, ValueError):
            print("BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l,
                  file=sys.stderr)
            continue
        if gene_strand not in ['+', '-']:
            print("Invalid strand. Skip: " + l, file=sys.stderr)
            continue

        if chrom not in basal_ranges:
            basal_ranges[chrom] = IntervalTree()
        basal_ranges[chrom].insert_interval(
            Interval(start, end, strand=gene_strand, value=symbol))

        if printit:
            # Print what was actually stored. The original referenced
            # basal_st / basal_end / strand, none of which exist in this
            # function, so printit=True raised NameError.
            print('\t'.join([
                str(i)
                for i in (chrom, start, end, symbol, '0', gene_strand)
            ]), file=sys.stdout)
    return basal_ranges
def getCDSExons(self, uniquify=True, stranded=True):
    '''
    Get only CDS exon regions from BED-12 file. Both 5' and 3' UTR parts are
    removed.

    uniquify: if True, duplicated blocks are removed (the order of the
        returned list is then unspecified because it goes through a set).
    stranded: if True, blocks are (chrom, start, end, strand) tuples,
        otherwise (chrom, start, end).

    Each exon block is clipped to [cdsStart, cdsEnd] (columns 7 and 8).
    Blocks that do not overlap the CDS by at least one base are dropped.
    Exits with status 1 on a line with fewer than 12 columns.
    '''
    reblocks = []
    for l in ireader.reader(self.f):
        l = l.strip()
        if not l or l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        if len(f) < 12:
            print(
                "\nInput error!\nStandard BED format has 12 columns.\n%s" %
                (BED12),
                file=sys.stderr)
            sys.exit(1)
        chrom = f[0]
        chrom_start = int(f[1])
        strand = f[5]
        cdsStart = int(f[6])
        cdsEnd = int(f[7])
        blockSizes = [int(i) for i in f[10].strip(',').split(',')]
        blockStarts = [
            chrom_start + int(i) for i in f[11].strip(',').split(',')
        ]
        for base, size in zip(blockStarts, blockSizes):
            exon_start = max(base, cdsStart)
            exon_end = min(base + size, cdsEnd)
            if exon_start >= exon_end:
                # No coding base in this exon. The original "<" / ">" tests
                # let exons that merely touch the CDS boundary (and every
                # exon spanning cdsStart == cdsEnd of a non-coding
                # transcript) through as zero-length blocks.
                continue
            if stranded:
                reblocks.append((chrom, exon_start, exon_end, strand))
            else:
                reblocks.append((chrom, exon_start, exon_end))
    if uniquify:
        return list(set(reblocks))
    else:
        return reblocks
def read_chromSize(infile):
    '''
    Read a chromosome size file (tab/space separated plain text), e.g.

    chr1 249250621
    chr2 243199373
    chr3 198022430
    chr4 191154276

    Lines starting with "#" and lines that do not have exactly two columns
    are ignored.

    Returns (names, sizes): two parallel lists of chromosome names and
    integer sizes, in file order.
    '''
    names, sizes = [], []
    for record in ireader.reader(infile):
        if record.startswith('#'):
            continue
        cols = record.split()
        if len(cols) == 2:
            chrom, length = cols
            names.append(chrom)
            sizes.append(int(length))
    return (names, sizes)
def load_data(infile):
    """
    Load a beta-value matrix into a column-oriented dict.

    Input file is tab or space separated plain text file.
    * The first row contains sample IDs (must be unique)
    * The first column contains probe IDs (must be unique)
    * Each cell (except for the 1st row and 1st column) contains Beta-value

    Example:

    Probe       sample_1  sample_2  sample_3  ...
    cg09835024  0.0547    0.1187    0.0625    ...
    cg25813447  0.428     0.3746    0.0666    ...
    cg07779434  0.3713    0.4194    0.0493    ...
    ...

    Returns a collections.defaultdict(list) with key 'probeID' holding the
    probe IDs and one key per sample ID holding that sample's beta values, all
    lists in the same row order. Rows whose number of values differs from the
    header are reported on stderr and skipped entirely; blank lines are
    ignored. A cell that is not a number raises ValueError.
    """
    printlog("Reading " + infile + " ...")
    line_num = 0
    n_probes = 0
    sample_IDs = []
    column_num = 0
    beta_values = collections.defaultdict(list)
    for l in ireader.reader(infile):
        line_num += 1
        f = l.split()
        if line_num == 1:
            sample_IDs = f[1:]
            column_num = len(sample_IDs)
            continue
        if not f:
            continue
        tmp = list(map(float, f[1:]))
        if len(tmp) != column_num:
            # Python 3 print calls; the original "print >> sys.stderr, ..."
            # statements raise TypeError at runtime.
            print("The number of columns of this row does NOT match with header row's", file=sys.stderr)
            print(l, file=sys.stderr)
            continue
        # Record the probe ID only once the row is accepted, otherwise the
        # 'probeID' list gets longer than the per-sample lists.
        beta_values['probeID'].append(f[0])
        for sid, beta in zip(sample_IDs, tmp):
            beta_values[sid].append(beta)
        n_probes += 1
    print("\tTotal samples: %d" % len(sample_IDs), file=sys.stderr)
    print("\tTotal probes: %d" % n_probes, file=sys.stderr)
    return beta_values
def read_grp_file1(gfile):
    '''
    Read group file. Group file defines the biological groups of data matrix
    file.
    (1) It must have a header.
    (2) It must have two columns:
        * 1st column: sample names. Sample names should be unique, and they
          must be exactly the same as the first row of beta matrix file.
        * 2nd column: group IDs.
    (3) Columns must be separated by ",".

    For example:

    sampleID,groupID
    Normal_1,1
    Normal_2,1
    Normal_3,1
    Tumor_1,2
    Tumor_2,2
    Tumor_3,2

    All spaces are removed from each line before it is split. Blank lines
    are ignored. Columns after the 2nd are ignored.

    Returns (samples, groups), two parallel lists of strings. Exits with
    status 1 when a line has fewer than 2 columns or when sample names are
    not unique.
    '''
    samples = []
    groups = []
    line_num = 0
    for l in ireader.reader(gfile):
        l = l.replace(' ', '')
        if not l.strip():
            # A stray blank line should not abort the whole run.
            continue
        line_num += 1
        f = l.split(',')
        if len(f) < 2:
            print("Group fle must have 2 columns!", file=sys.stderr)
            sys.exit(1)
        if line_num == 1:
            # header row
            continue
        samples.append(f[0])
        groups.append(f[1])

    # Comparing sizes works for an empty sample list too, where
    # Counter.most_common(1)[0] would raise IndexError.
    if len(set(samples)) != len(samples):
        print("Sample names are not unique!", file=sys.stderr)
        sys.exit(1)
    return (samples, groups)
def getIntergenic(self, direction='up', size=2000, uniquify=True, stranded=True):
    '''
    Get intergenic regions flanking each transcript.

    direction: "up", "down" or "both", relative to the transcript's strand
        (for "-" transcripts upstream is the higher-coordinate side).
    size: length of each flanking region; the lower-coordinate flank is
        clipped at position 0.
    uniquify: if True, duplicated regions are removed (the order of the
        returned list is then unspecified because it goes through a set).
    stranded: if True, regions are (chrom, start, end, strand) tuples,
        otherwise (chrom, start, end).
    '''
    reblocks = []
    for l in ireader.reader(self.f):
        l = l.strip()
        if not l or l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        chrom = f[0]
        tx_start = int(f[1])
        tx_end = int(f[2])
        strand = f[5]

        lower_flank = (max(tx_start - size, 0), tx_start)
        higher_flank = (tx_end, tx_end + size)
        regions = []
        if direction == "up" or direction == "both":
            regions.append(higher_flank if strand == '-' else lower_flank)
        if direction == "down" or direction == "both":
            regions.append(lower_flank if strand == '-' else higher_flank)

        # Apply "stranded" to every region. The original honoured it only
        # for downstream regions and always emitted 4-tuples for upstream.
        for region_st, region_end in regions:
            if stranded:
                reblocks.append((chrom, region_st, region_end, strand))
            else:
                reblocks.append((chrom, region_st, region_end))
    if uniquify:
        return list(set(reblocks))
    else:
        return reblocks
def getExons(self, uniquify=True, stranded=True):
    '''
    Get all exons (including both coding exons and UTR exons) from BED-12
    file.

    uniquify: if True, duplicated blocks are removed (the order of the
        returned list is then unspecified because it goes through a set).
    stranded: if True, blocks are (chrom, start, end, strand) tuples,
        otherwise (chrom, start, end).

    Block coordinates are chromStart + blockStart (column 12) and that value
    plus blockSize (column 11). Exits with status 1 on a line with fewer than
    12 columns.
    '''
    reblocks = []
    for l in ireader.reader(self.f):
        l = l.strip()
        if not l or l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        if len(f) < 12:
            print("Standard BED format has 12 columns.\n%s" % (BED),
                  file=sys.stderr)
            sys.exit(1)
        # Only the columns needed for exon blocks are parsed; the CDS and
        # blockCount columns were read into unused locals before.
        chrom = f[0]
        chrom_start = int(f[1])
        strand = f[5]
        blockSizes = [int(i) for i in f[10].strip(',').split(',')]
        blockStarts = [
            chrom_start + int(i) for i in f[11].strip(',').split(',')
        ]
        for base, size in zip(blockStarts, blockSizes):
            if stranded:
                reblocks.append((chrom, base, base + size, strand))
            else:
                reblocks.append((chrom, base, base + size))
    if uniquify:
        return list(set(reblocks))
    else:
        return reblocks
def main():
    """
    Command-line entry point: per-region CpG summary with outlier filtering.

    Builds an interval tree from the CpG file (-i), then for every region of
    the BED file (-b) writes the first three columns followed by the values
    returned by findIntervals(..., a=alpha), or six "N/A" columns when it
    returns nothing, to the output file (-o).

    Exits with status 101/102/103 when -i/-b/-o is missing.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Input CpG file in BED format. The first 3 columns contain \"Chrom\", \"Start\", and \"End\". The 4th column contains proportion values.")
    parser.add_option("-a", "--alpha", action="store", type='float', dest="alpha_cut", default=0.05, help="The chance of mistakingly assign a particular CpG as an outlier for each genomic region. default=%default")
    parser.add_option("-b", "--bed", action="store", type="string", dest="bed_file", help="BED3+ file specifying the genomic regions.")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    # -b is required too: without this check a missing region file only
    # surfaced later as a crash inside the file reader.
    required = (
        (options.input_file, 101),
        (options.bed_file, 102),
        (options.out_file, 103),
    )
    for value, exit_code in required:
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    # Keep alpha inside [0, 1]; a negative value falls back to the default.
    if options.alpha_cut < 0:
        options.alpha_cut = 0.05
    if options.alpha_cut > 1:
        options.alpha_cut = 1

    tree = buildIntervalTree(options.input_file)

    with open(options.out_file, 'w') as OUT:
        print(
            "#chrom\tstart\tend\tN_CpG_filtered\tN_methyl_filtered\tN_total_filtered\tN_CpG_ori\tN_methy_ori\tN_total_ori",
            file=OUT)
        for line in ireader.reader(options.bed_file):
            line = line.strip()
            if line.startswith(("track", "#", 'browser', 'Chrom')):
                continue
            f = line.split()
            if len(f) < 3:
                continue
            try:
                chrom = f[0]
                start = int(f[1])
                end = int(f[2])
            except ValueError:
                continue
            a = findIntervals(chrom, start, end, tree, a=options.alpha_cut)
            if len(a) == 0:
                print('\t'.join(f[0:3]) + '\t' + '\t'.join(['N/A'] * 6),
                      file=OUT)
            else:
                print('\t'.join(f[0:3]) + '\t' + '\t'.join([str(i) for i in a]),
                      file=OUT)
def main():
    """
    Command-line entry point: differential methylation by logistic regression.

    Steps:
      1. Read the group file (-g) with read_grp_file2().
      2. Write "<prefix>.r": two R functions (lrf1 writes the results header
         plus the first row, lrf2 appends further rows), the covariate
         vectors, and one function call per probe of the data file (-i).
      3. Run the script with Rscript; stderr goes to "<prefix>.warnings.txt",
         results to "<prefix>.results.txt".
      4. Collect the p-values of the first variable, adjust them with
         padjust.multiple_testing_correction() and write the input rows plus
         "pval" and "adj.pval" columns to "<prefix>.pval.txt".

    Exit statuses: 101-103 missing option, 104/105 missing file, 106 invalid
    -f, 3 unknown sample ID, 1 malformed data or Rscript cannot be started.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
    parser.add_option("-g", "--group", action="store", type="string", dest="group_file", help="Group file defining the biological groups of each sample as well as other covariables such as gender, age. The first variable is grouping variable (must be categorical), all the other variables are considered as covariates (can be categorical or continuous). Sample IDs should match to the \"Data file\".")
    parser.add_option("-f", "--family", action="store", type="int", dest="family_func", default=1, help="Error distribution and link function to be used in the GLM model. Can be integer 1 or 2 with 1 = \"quasibinomial\" and 2 = \"binomial\". Default=%default.")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    required = (
        (options.input_file, 101),
        (options.group_file, 102),
        (options.out_file, 103),
    )
    for value, exit_code in required:
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    if not os.path.isfile(options.input_file):
        print("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.group_file):
        # Report the group file here, not the input data file.
        print("Input group file \"%s\" does not exist\n" % options.group_file)
        sys.exit(105)

    # Validate -f before any output file is created.
    family = {1: 'quasibinomial', 2: 'binomial'}
    if options.family_func not in family:
        print("Incorrect value of '-f'!")
        sys.exit(106)

    script_file = options.out_file + '.r'
    results_file = options.out_file + '.results.txt'
    warnings_file = options.out_file + '.warnings.txt'
    pval_file = options.out_file + '.pval.txt'

    printlog("Read group file \"%s\" ..." % (options.group_file))
    (samples, cv_names, cvs, v_types) = read_grp_file2(options.group_file)
    for cv_name in cv_names:
        print("%s: %s" % (cv_name, v_types[cv_name]))
        for sample in samples:
            print('\t' + sample + '\t' + cvs[cv_name][sample])
    primary_variable = cv_names[0]

    # The two R functions differ only in how write.table treats the header:
    # lrf1 creates the file with column names, lrf2 appends without them.
    write_table_head = 'write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", ' % results_file
    write_table_tails = (
        ('lrf1', 'col.names=c("ID",paste(gsub("2","",names(coefs)), "coef",sep="."), paste(gsub("2","",names(pvals)), "pval",sep=".")))'),
        ('lrf2', 'col.names=FALSE, append = TRUE)'),
    )
    count_pattern = re.compile(r'(\d+)\s*\,\s*(\d+)')

    with open(script_file, 'w') as ROUT:
        for func_name, tail in write_table_tails:
            print('%s <- function (cgid, m,t,%s){' % (func_name, ','.join(cv_names)), file=ROUT)
            print('try(fit <- glm(cbind(m,t - m) ~ %s, family=%s))' % ('+'.join(cv_names), family[options.family_func]), file=ROUT)
            print('pvals <- coef(summary(fit))[,4]', file=ROUT)
            print('coefs <- coef(summary(fit))[,1]', file=ROUT)
            print('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
            print('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
            print(write_table_head + tail, file=ROUT)
            print('}', file=ROUT)
            print('\n', file=ROUT)

        printlog("Processing file \"%s\" ..." % (options.input_file))
        line_num = 0
        # The first data row must go through lrf1 so the results file gets
        # its header. Tracking it with a flag (not "line_num == 2") keeps
        # that true when blank lines precede the first data row.
        first_record = True
        for l in ireader.reader(options.input_file):
            line_num += 1
            f = l.split()
            if len(f) == 0:
                continue
            if line_num == 1:
                sample_IDs = f[1:]
                # check if sample ID matches
                for s in samples:
                    if s not in sample_IDs:
                        printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                        sys.exit(3)
                # Covariates are emitted in data-file column order.
                for cv_name in cv_names:
                    values = ','.join([str(cvs[cv_name][s]) for s in sample_IDs])
                    if v_types[cv_name] == 'continuous':
                        print(cv_name + ' <- c(%s)' % values, file=ROUT)
                    elif v_types[cv_name] == 'categorical':
                        print(cv_name + ' <- as.factor(c(%s))' % values, file=ROUT)
                    else:
                        printlog("unknown vaiable type!")
                        sys.exit(1)
                print('\n', file=ROUT)
                continue

            methyl_reads = []  # c
            total_reads = []   # n
            cg_id = f[0]
            for i in f[1:]:
                m = count_pattern.match(i)
                if m is None:
                    # Cell is not "methyl,total": treat as missing.
                    methyl_reads.append("NaN")
                    total_reads.append("NaN")
                    continue
                c = int(m.group(1))
                n = int(m.group(2))
                if n >= c and n > 0:
                    methyl_reads.append(c)
                    total_reads.append(n)
                else:
                    printlog("Incorrect data format!")
                    print(f)
                    sys.exit(1)
            r_func = 'lrf1' if first_record else 'lrf2'
            first_record = False
            print('%s(\"%s\", c(%s), c(%s), %s)' % (
                r_func, cg_id,
                ','.join([str(read) for read in methyl_reads]),
                ','.join([str(read) for read in total_reads]),
                ','.join(cv_names)), file=ROUT)

    try:
        printlog("Runing Rscript file \"%s\" ..." % script_file)
        # Argument list instead of a shell string: paths with spaces or
        # shell metacharacters are passed verbatim, and a missing Rscript
        # executable raises OSError so the handler below can actually fire.
        with open(warnings_file, 'w') as WARN:
            subprocess.call(['Rscript', script_file], stderr=WARN)
    except OSError:
        print("Error: cannot run Rscript: \"%s\"" % script_file, file=sys.stderr)
        sys.exit(1)

    # read
    printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
    p_list = []
    probe_list = []
    with open(results_file, 'r') as RESULTS:
        for line_num, l in enumerate(RESULTS, start=1):
            l = l.strip()
            if line_num == 1:
                headers = l.split()
                primary_v_index = headers.index(primary_variable + '.pval')
                continue
            v = l.split()
            try:
                pv = float(v[primary_v_index])
            except (IndexError, ValueError):
                continue
            if 0 <= pv <= 1:
                p_list.append(pv)
                probe_list.append(v[0])

    # adjust
    q_list = padjust.multiple_testing_correction(p_list)

    # write
    adjusted_p = {}
    for probe_id, p, q in zip(probe_list, p_list, q_list):
        adjusted_p[probe_id] = '\t'.join([str(i) for i in (p, q)])
    printlog("Writing to %s" % pval_file)
    with open(pval_file, 'w') as FOUT:
        for line_num, l in enumerate(ireader.reader(options.input_file), start=1):
            if line_num == 1:
                print(l + '\tpval\tadj.pval', file=FOUT)
                continue
            f = l.split()
            if len(f) == 0:
                # Blank lines were skipped when the R script was generated.
                continue
            print(l + '\t' + adjusted_p.get(f[0], 'NaN\tNaN'), file=FOUT)
def main():
    """
    Command-line entry point: differential methylation by Gaussian GLM.

    Steps:
      1. Read the group file (-g) with read_grp_file2().
      2. Write "<prefix>.r": for each probe of the data file (-i), the beta
         values, the covariate vectors, a glm() fit, and a cat() of the probe
         ID (prefixed with "!"), coefficient names, p-values and coefficients.
      3. Run the script with Rscript; stdout goes to
         "<prefix>.r.results.txt" and stderr to "<prefix>.r.warnings.txt".
      4. Parse the "!"-prefixed result lines and write the input rows plus
         the coefficient and p-value columns to "<prefix>.pval.txt". Probes
         without a usable result get "NaN" in every added column.

    Exit statuses: 101-103 missing option, 104/105 missing file, 3 unknown
    sample ID, 1 Rscript cannot be started.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file", help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be regular text file or compressed file (*.gz, *.bz2) or accessible url.")
    parser.add_option("-g", "--group", action="store", type="string", dest="group_file", help="Group file define the biological groups of each samples as well as other covariables such as gender, age. Sample IDs shoud match to the \"Data file\".")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="Prefix of output file.")
    (options, args) = parser.parse_args()

    print()
    required = (
        (options.input_file, 101),
        (options.group_file, 102),
        (options.out_file, 103),
    )
    for value, exit_code in required:
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    if not os.path.isfile(options.input_file):
        print("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.group_file):
        # Report the group file here, not the input data file.
        print("Input group file \"%s\" does not exist\n" % options.group_file)
        sys.exit(105)

    script_file = options.out_file + '.r'
    results_file = options.out_file + '.r.results.txt'
    warnings_file = options.out_file + '.r.warnings.txt'
    pval_file = options.out_file + '.pval.txt'

    printlog("Read group file \"%s\" ..." % (options.group_file))
    (samples, cv_names, cvs) = read_grp_file2(options.group_file)
    for cv_name in cv_names:
        print(cv_name)
        for sample in samples:
            print('\t' + sample + '\t' + cvs[cv_name][sample])

    printlog("Processing file \"%s\" ..." % (options.input_file))
    with open(script_file, 'w') as ROUT:
        line_num = 0
        for l in ireader.reader(options.input_file):
            line_num += 1
            f = l.split()
            if line_num == 1:
                sample_IDs = f[1:]
                # check if sample ID matches
                for s in samples:
                    if s not in sample_IDs:
                        printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                        sys.exit(3)
                continue
            if not f:
                continue
            cg_id = f[0]
            beta_values = []
            for i in f[1:]:
                try:
                    beta_values.append(float(i))
                except ValueError:
                    beta_values.append("NaN")
            print('', file=ROUT)
            # "!" marks result lines so they can be told apart from any
            # other output of the R script.
            print('cgid <- \"!%s\"' % cg_id, file=ROUT)
            print("y <- c(%s)" % (','.join([str(beta) for beta in beta_values])), file=ROUT)  # response variable
            for cv_name in cv_names:
                print(cv_name + ' <- c(%s)' % (','.join([str(cvs[cv_name][s]) for s in sample_IDs])), file=ROUT)
            print('try(fit <- glm(y ~ %s, family=gaussian))' % ('+'.join(cv_names)), file=ROUT)
            print('pval <- coef(summary(fit))[,4]', file=ROUT)
            print('coef <- coef(summary(fit))[,1]', file=ROUT)
            print('cat(cgid, names(pval),pval,coef, sep="\\t")', file=ROUT)
            print('cat("\\n")', file=ROUT)

    try:
        printlog("Runing Rscript file \"%s\" ..." % script_file)
        # Argument list instead of a shell string: paths with spaces or
        # shell metacharacters are passed verbatim, and a missing Rscript
        # executable raises OSError so the handler below can actually fire.
        with open(results_file, 'w') as RES, open(warnings_file, 'w') as WARN:
            subprocess.call(['Rscript', script_file], stdout=RES, stderr=WARN)
    except OSError:
        print("Error: cannot run Rscript: \"%s\"" % script_file, file=sys.stderr)
        sys.exit(1)

    printlog("Reading file \"%s\" ..." % results_file)
    nan_row = ["NaN"] * len(cv_names)
    glm_results = {}
    # Fallback header names in case no probe produced a parsable result;
    # the original raised NameError on "v_names" in that situation.
    v_names = list(cv_names)
    with open(results_file) as RESULTS:
        for l in RESULTS:
            l = l.strip()
            if not l.startswith('!'):
                continue
            l = l.replace(')', '').replace('(', '')
            f = l.split('\t')
            cgID = f[0].replace('!', '')
            tmp = f[1:]
            # Expected layout: names, p-values, coefficients, equally long.
            # An empty row is rejected explicitly (a chunk size of 0 would
            # make range() fail).
            if tmp and len(tmp) % 3 == 0:
                chunk_size = len(tmp) // 3
                sub_lists = [tmp[i:i + chunk_size] for i in range(0, len(tmp), chunk_size)]
                # Drop the first entry of each chunk (the intercept term).
                v_names = sub_lists[0][1:]
                v_pvals = sub_lists[1][1:]
                v_coefs = sub_lists[2][1:]
                glm_results[cgID] = [v_coefs, v_pvals]
            else:
                glm_results[cgID] = [nan_row, nan_row]

    printlog("Results saved to \"%s\" ..." % pval_file)
    with open(pval_file, 'w') as FOUT:
        line_num = 0
        for l in ireader.reader(options.input_file):
            line_num += 1
            f = l.split()
            if line_num == 1:
                print(l + '\t' + '\t'.join([i + '.coef' for i in v_names]) + '\t' + '\t'.join([i + '.pval' for i in v_names]), file=FOUT)
                continue
            if not f:
                continue
            # A probe can be missing from the R output (e.g. the script
            # stopped early); write NaN instead of raising KeyError.
            coefs, pvals = glm_results.get(f[0], [nan_row, nan_row])
            print(l + '\t' + '\t'.join(coefs) + '\t' + '\t'.join(pvals), file=FOUT)
def main():
    """Compare methylation between two groups with Fisher's exact test.

    The data file holds one CpG per row and one sample per column; each cell
    is a "methyl_count,total_count" pair.  For every CpG the methylated and
    unmethylated read counts are pooled per group, ``stats.fisher_exact`` is
    run on the resulting 2x2 table, the p-values are passed through
    ``padjust.multiple_testing_correction`` and the input rows are written to
    "<prefix>.pval.txt" with OddsRatio, pval and adj.pval columns appended.

    Exit codes: 101/102/103 when -i/-g/-o is missing, 1 when the group file
    does not define exactly two groups or a cell has inconsistent counts,
    3 when a sample of the group file is absent from the data file header.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--input-file", action="store", type="string",
        dest="input_file",
        help="Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
    parser.add_option(
        "-g", "--group", action="store", type="string", dest="group_file",
        help="Group file define the biological groups of each samples. It is a comma-separated 2 columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs. It must have a header row. Sample IDs shoud match to the \"Data file\". Note: automatically switch to use ANOVA if more than 2 groups were defined in this file.")
    parser.add_option(
        "-o", "--output", action="store", type='string', dest="out_file",
        help="Prefix of output file.")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.group_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    # Open the output first so an unwritable path fails before the
    # (potentially long) computation; the context manager also closes the
    # handle on every sys.exit() path below.
    with open(options.out_file + '.pval.txt', 'w') as FOUT:
        printlog("Read group file \"%s\" ..." % (options.group_file))
        (grp_samples, grp_labels) = read_grp_file1(options.group_file)
        s2g = dict(zip(grp_samples, grp_labels))
        g2s = collections.defaultdict(list)
        for sample, group in s2g.items():
            g2s[group].append(sample)

        group_IDs = sorted(g2s.keys())
        for g in group_IDs:
            print("\tGroup %s has %d samples:" % (g, len(g2s[g])))
            print('\t\t' + ','.join(g2s[g]))

        if len(group_IDs) != 2:
            # NOTE(review): this is the only printlog call passing file=;
            # confirm printlog accepts that keyword.
            printlog("You must have two groups!", file=sys.stderr)
            sys.exit(1)

        count_pattern = re.compile(r'(\d+)\s*\,\s*(\d+)')
        sample_IDs = None  # filled from the first non-blank line (header)
        probe_list = []
        p_list = []
        or_list = []
        for l in ireader.reader(options.input_file):
            f = l.split()
            if not f:
                # blank line: nothing to test, and f[0] would raise
                continue
            if sample_IDs is None:
                sample_IDs = f[1:]
                # every sample of the group file must be a data column
                for s in s2g:
                    if s not in sample_IDs:
                        printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                        sys.exit(3)
                continue

            cg_id = f[0]
            probe_list.append(cg_id)
            g2values = {g: {'methyl': 0, 'unmethyl': 0} for g in group_IDs}
            for s, p in zip(sample_IDs, f[1:]):
                # data columns that are not listed in the group file are
                # ignored, as the other test scripts of this package do
                if s not in s2g:
                    continue
                m = count_pattern.match(p)
                if m is None:
                    # cell is not "count,total": treated as missing
                    continue
                c = int(m.group(1))
                n = int(m.group(2))
                if n >= c and n > 0:
                    g2values[s2g[s]]['methyl'] += c
                    g2values[s2g[s]]['unmethyl'] += (n - c)
                else:
                    printlog("Incorrect data format!")
                    print(f)
                    sys.exit(1)

            (odds, pval) = stats.fisher_exact(
                [[g2values[g]['methyl'], g2values[g]['unmethyl']]
                 for g in group_IDs])
            p_list.append(pval)
            or_list.append(odds)

        printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
        q_list = padjust.multiple_testing_correction(p_list)
        adjusted_p = {}
        for probe, o, p, q in zip(probe_list, or_list, p_list, q_list):
            adjusted_p[probe] = '\t'.join([str(i) for i in (o, p, q)])

        printlog("Writing to %s" % (options.out_file + '.pval.txt'))
        header_written = False
        for l in ireader.reader(options.input_file):
            f = l.split()
            if not f:
                continue
            if not header_written:
                print(l + '\tOddsRatio\tpval\tadj.pval', file=FOUT)
                header_written = True
            else:
                print(l + '\t' + adjusted_p[f[0]], file=FOUT)
def _genes_overlapping(domains, chrom, start, end):
    """Return the gene names in *domains* overlapping chrom:start-end, or {'//'} when there are none."""
    if chrom not in domains:
        return {'//'}
    genes = {o.value for o in domains[chrom].find(start, end)}
    return genes if genes else {'//'}


def main():
    """Annotate each CpG of a BED file with the genes regulating it.

    Basal and extended regulatory domains are computed from the gene model
    by ``getBasalDomains`` / ``geteExtendedDomains``.  Every input BED line
    is written to "<prefix>.annotatio.txt" followed by two columns: the genes
    whose basal domain overlaps the CpG, and the genes whose extended domain
    overlaps it (minus those already reported as basal).  '//' marks
    "no gene".  Lines starting with '#' are copied through; 'track' and
    'browser' lines are dropped; lines without three usable BED columns are
    reported on stderr and skipped.

    Exit codes: 101/102/103 when -i/-r/-o is missing.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--input-file", action="store", type="string",
        dest="input_file",
        help="BED3+ file specifying the C position. BED3+ file could be a regular text file or compressed file (*.gz, *.bz2) or accessible url. [required]")
    parser.add_option(
        "-r", "--refgene", action="store", type="string", dest="gene_file",
        help="Reference gene model in BED12 format (https://genome.ucsc.edu/FAQ/FAQformat.html#format1). It is recommended that multiple transcripts of the same gene are collapsed into a single super transcript with one TSS and one name.")
    parser.add_option(
        "-u", "--basal-up", action="store", type="int",
        dest="basal_up_size", default=5000,
        help="Size of extension to upstream of TSS (used to define gene's \"basal regulatory domain\"). default=%default (bp)")
    parser.add_option(
        "-d", "--basal-down", action="store", type="int",
        dest="basal_down_size", default=1000,
        help="Size of extension to downstream of TSS (used to define gene's basal regulatory domain). default=%default (bp)")
    parser.add_option(
        "-e", "--extension", action="store", type="int",
        dest="extension_size", default=1000000,
        help="Size of extension to both up- and down-stream of TSS (used to define gene's \"extended regulatory domain\"). default=%default (bp)")
    parser.add_option(
        "-o", "--output", action="store", type='string', dest="out_file",
        help="Prefix of output file. Two addtional columns will be appended to the orignal BED file with the last column indicating \"genes whose extended regulatory domain are overlapped with the CpG\", the 2nd last column indicating \"genes whose basal regulatory domain are overlapped with the CpG\". [required]")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.gene_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    # NOTE(review): '.annotatio.txt' looks like a truncated 'annotation';
    # kept as-is because downstream tooling may rely on the file name.
    with open(options.out_file + '.annotatio.txt', 'w') as FOUT:
        printlog("Calculate basal regulatory domain from: \"%s\" ..." % (options.gene_file))
        basal_domains = getBasalDomains(bedfile=options.gene_file,
                                        up=options.basal_up_size,
                                        down=options.basal_down_size,
                                        printit=False)

        printlog("Calculate extended regulatory domain from: \"%s\" ..." % (options.gene_file))
        extended_domains = geteExtendedDomains(basal_ranges=basal_domains,
                                               bedfile=options.gene_file,
                                               up=options.basal_up_size,
                                               down=options.basal_down_size,
                                               ext=options.extension_size,
                                               printit=False)
        # The former hard-coded probe lookup on extended_domains['chr1'] was
        # removed: its result was unused and it raised KeyError for any gene
        # model that has no 'chr1' entry.

        printlog("Assigning CpG to gene ...")
        for l in ireader.reader(options.input_file):
            if l.startswith('#'):
                print(l, file=FOUT)
                continue
            if l.startswith('track'):
                continue
            if l.startswith('browser'):
                continue
            try:
                f = l.split()
                chrom = f[0]
                start = int(f[1])
                end = int(f[2])
            except (IndexError, ValueError):
                print("Invalid BED line: %s" % l, file=sys.stderr)
                continue

            # genes whose basal domain is overlapped with CpG
            basal_genes = _genes_overlapping(basal_domains, chrom, start, end)
            # genes whose extended domain is overlapped with CpG, excluding
            # anything already reported in the basal column
            extend_genes = _genes_overlapping(extended_domains, chrom, start, end) - basal_genes
            if not extend_genes:
                extend_genes = {'//'}

            # sorted() makes the column content reproducible between runs;
            # plain set iteration order is arbitrary
            print(l + '\t' + ';'.join(sorted(basal_genes)) + '\t' + ';'.join(sorted(extend_genes)), file=FOUT)
print ("\tGroup %s has %d samples:" % (g, len(g2s[g])), file=sys.stderr) print ('\t\t' + ','.join(g2s[g]), file=sys.stderr) if len(group_IDs) != 2: printlog("You must have two groups!", file=sys.stderr) sys.exit(1) manager = Manager() results = manager.list() #list of list. shared variable between main() and beta_bayes(). #ID, group1.mean, group2.mean, prob printlog("Read data file \"%s\" ..." % (options.input_file)) line_num = 0 p_count = 0 jobs = [] for l in ireader.reader(options.input_file): line_num += 1 f = l.split() if len(f) == 0: continue if line_num == 1: sample_IDs = f[1:] # check if sample ID matches for s in s2g: if s not in sample_IDs: printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file)) sys.exit(3) g_IDs = [s2g[i] for i in sample_IDs] else: probe_ID = f[0] p_count += 1
def main():
    """Extract flanking sequence around each C position and build a motif.

    For every BED line the reference sequence from ``position - extend - 1``
    to ``position + extend`` (position = 3rd BED column) is fetched,
    reverse-complemented when any field of the line is '-', and written to
    "<prefix>.fa".  The external ``weblogo`` program is then asked to draw
    PDF and PNG logos, and a ``PSSM`` built from the FASTA file is exported
    as .pfm, .ppm, .pwm, .jaspar and .meme files.

    Exit codes: 101/102/103 when -i/-r/-o is missing.
    """
    print(__doc__)
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--input_file", action="store", type="string",
        dest="input_file",
        help="BED file specifying the C position. This BED file should have at least six columns (Chrom, ChromStart, ChromeEnd, name, score, strand). Note: Must provide correct *strand* information. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option(
        "-r", "--refgenome", action="store", type="string",
        dest="genome_file",
        help="Reference genome seqeunces in FASTA format. Must be indexed using the samtools \"faidx\" command. ")
    parser.add_option(
        "-e", "--extend", action="store", type="int", dest="extend_size",
        default=5,
        help="Number of bases extended to up- and down-stream. default=%default (bp)")
    parser.add_option(
        "-n", "--name", action="store", type='string', dest="motif_name",
        default='motif', help="Motif name. default=%default")
    parser.add_option(
        "-o", "--output", action="store", type='string', dest="out_file",
        help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        parser.print_help()
        sys.exit(101)
    if not (options.genome_file):
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        parser.print_help()
        sys.exit(103)

    # index refegenome file if it hasn't been done
    if not os.path.exists(options.genome_file + '.fai'):
        printlog("Creating index for %s" % options.genome_file)
        pysam.faidx(options.genome_file)

    fa_file = options.out_file + '.fa'
    refFasta = pysam.Fastafile(options.genome_file)
    printlog("Reading %s ..." % options.input_file)
    with open(fa_file, 'w') as FOUT:
        for l in ireader.reader(options.input_file):
            if l.startswith('#'):
                continue
            if l.startswith('track'):
                continue
            if l.startswith('browser'):
                continue
            f = l.split()
            # any field equal to '-' marks the minus strand
            strand = '-' if '-' in f else '+'
            try:
                chrom = f[0]
                position = int(f[2])
            except (IndexError, ValueError):
                print("BED has at lesat 4 columns. Skip: " + l, file=sys.stderr)
                # really skip: without this the previous line's chrom and
                # position were silently reused (or NameError on line one)
                continue

            start = position - options.extend_size - 1
            end = position + options.extend_size
            if start < 0 or start > end:
                continue
            fa_name = '>' + '_'.join([str(i) for i in (chrom, start, end, strand)])
            fa_seq = refFasta.fetch(chrom, start, end).upper()
            if strand == '-':
                fa_seq = revcomp(fa_seq)
            print(fa_name, file=FOUT)
            print(fa_seq, file=FOUT)
    refFasta.close()

    printlog("Generate motif logo ... ")
    # Argument lists (no shell) keep paths and motif names containing spaces
    # or shell metacharacters intact, and make a missing weblogo binary raise
    # OSError instead of being swallowed by the shell.
    logo_ok = True
    for fmt, suffix in (("PDF", '.logo.pdf'), ("PNG", '.logo.png')):
        cmd = ["weblogo", "--format", fmt, "-D", "fasta", "-c", "classic",
               "-s", "large", "-f", fa_file, "-o", options.out_file + suffix,
               "-t", options.motif_name]
        try:
            if subprocess.call(cmd) != 0:
                logo_ok = False
        except OSError:
            logo_ok = False
            break
    if logo_ok:
        printlog("Motif logo saved to \"%s\" and \"%s\"" % (options.out_file + '.logo.pdf', options.out_file + '.logo.png'))
    else:
        print("Cannot run weblogo. Please install weblogo (https://github.com/WebLogo/weblogo)", file=sys.stderr)

    m = PSSM(sites=fa_file, name=options.motif_name)
    exports = (
        ("position frequency matrix (PFM)", '.pfm', m.toPFM),
        ("position probability matrix (PPM)", '.ppm', m.toPPM),
        ("position weight matrix (PWM)", '.pwm', m.toPWM),
        ("Jaspar format matrix", '.jaspar', m.toJaspar),
        ("MEME format matrix", '.meme', m.toMEME),
    )
    for label, suffix, export in exports:
        printlog("Write %s to \"%s\"" % (label, options.out_file + suffix))
        with open(options.out_file + suffix, 'w') as FF:
            export(FOUT=FF)
def main():
    """Rank-based differential methylation test on a beta-value table.

    Samples are assigned to groups through the group file.  With two groups
    each CpG row is tested with ``mwu_test``; with three or more groups with
    ``kruskal_test``.  Non-numerical cells become NaN.  The p-values are
    passed through ``padjust.multiple_testing_correction`` and the input
    rows are written to "<prefix>.pval.txt" with pval and adj.pval columns
    appended.

    Exit codes: 101/102/103 when -i/-g/-o is missing, 1 when fewer than two
    groups are defined, 3 when a sample of the group file is absent from the
    data file header.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--input_file", action="store", type="string",
        dest="input_file",
        help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). Except for the 1st row and 1st column, any non-numerical values will be considered as \"missing values\" and ignored. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option(
        "-g", "--group", action="store", type="string", dest="group_file",
        help="Group file defining the biological group of each sample. It is a comma-separated two columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs. It must have a header row. Sample IDs should match to the \"Data file\". Note: automatically switch to use Kruskal-Wallis H-test if more than two groups were defined in this file.")
    parser.add_option(
        "-o", "--output", action="store", type='string', dest="out_file",
        help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.group_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    # The context manager closes the output on every sys.exit() path below.
    with open(options.out_file + '.pval.txt', 'w') as FOUT:
        printlog("Read group file \"%s\" ..." % (options.group_file))
        (grp_samples, grp_labels) = read_grp_file1(options.group_file)
        s2g = dict(zip(grp_samples, grp_labels))
        g2s = collections.defaultdict(list)
        for sample, group in s2g.items():
            g2s[group].append(sample)

        group_IDs = sorted(g2s.keys())
        for g in group_IDs:
            print("\tGroup %s has %d samples:" % (g, len(g2s[g])))
            print('\t\t' + ','.join(g2s[g]))

        if len(group_IDs) < 2:
            # NOTE(review): this is the only printlog call passing file=;
            # confirm printlog accepts that keyword.
            printlog("You must have at least two groups!", file=sys.stderr)
            sys.exit(1)
        elif len(group_IDs) == 2:
            printlog("Perfrom Mann-Whitney rank test of two samples ...")
        else:
            printlog("Perfrom Kruskal-Wallis H-test ...")

        sample_IDs = None  # filled from the first non-blank line (header)
        probe_list = []
        p_list = []
        for l in ireader.reader(options.input_file):
            f = l.split()
            if not f:
                continue
            if sample_IDs is None:
                sample_IDs = f[1:]
                # every sample of the group file must be a data column
                for s in s2g:
                    if s not in sample_IDs:
                        printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                        sys.exit(3)
                continue

            g2values = collections.defaultdict(list)
            probe_ID = f[0]
            for s, b in zip(sample_IDs, f[1:]):
                # skip if s not in group file
                if s not in s2g:
                    continue
                # deal with non-numerical values
                try:
                    value = float(b)
                except ValueError:
                    value = np.nan
                g2values[s2g[s]].append(value)

            # Reset per row so that a row providing values for fewer than
            # two groups cannot inherit the previous row's p-value.
            pval = np.nan
            if len(g2values) == 2:
                a = np.array(g2values[group_IDs[0]])
                b = np.array(g2values[group_IDs[1]])
                (pval, tscore) = mwu_test(a, b)
            elif len(g2values) >= 3:
                (pval, tscore) = kruskal_test(
                    *[np.array(g2values[g]) for g in group_IDs])

            probe_list.append(probe_ID)
            p_list.append(pval)

        printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
        q_list = padjust.multiple_testing_correction(p_list)
        adjusted_p = {}
        for probe, p, q in zip(probe_list, p_list, q_list):
            adjusted_p[probe] = '\t'.join([str(i) for i in (p, q)])

        printlog("Writing to %s" % (options.out_file + '.pval.txt'))
        header_written = False
        for l in ireader.reader(options.input_file):
            f = l.split()
            if not f:
                # blank lines were skipped when testing, so they have no
                # entry in adjusted_p; skip them here as well
                continue
            if not header_written:
                print(l + '\tpval\tadj.pval', file=FOUT)
                header_written = True
            else:
                print(l + '\t' + adjusted_p[f[0]], file=FOUT)
def _write_bbr_function(ROUT, func_name, cv_names, write_table_stmt):
    """Write to *ROUT* the R function *func_name* that fits one beta-binomial model and ends with *write_table_stmt*."""
    print('%s <- function (cgid, m,t,%s){' % (func_name, ','.join(cv_names)), file=ROUT)
    print('\tdat <- data.frame(m=m, t=t, %s)' % ','.join(['='.join(i) for i in zip(cv_names, cv_names)]), file=ROUT)
    print('\tfit <- betabin(cbind(m,t - m) ~ %s, ~1, link=c("logit"), data=na.omit(dat))' % '+'.join(cv_names), file=ROUT)
    print('\ttest <- summary(fit)', file=ROUT)
    print('\tcoefs <- test@Coef$Estimate', file=ROUT)
    print('\tpvals = test@Coef$"Pr(> |z|)"', file=ROUT)
    print('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
    print('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
    print('\tnames = row.names(test@Coef)', file=ROUT)
    print('\tnames = gsub("2","",names)', file=ROUT)
    print(write_table_stmt, file=ROUT)
    print('}', file=ROUT)
    print('\n', file=ROUT)


def main():
    """Generate and run an R script doing beta-binomial regression per CpG.

    The group file supplies, for every sample, the grouping variable and any
    covariates (``read_grp_file2``).  The data file holds one CpG per row
    with "methyl_count,total_count" cells.  An R script "<prefix>.r" is
    written that defines two functions, ``bbr1`` (creates
    "<prefix>.results.txt" with a header) and ``bbr2`` (appends to it), then
    calls ``bbr1`` for the first CpG and ``bbr2`` for all others.  Finally
    the script is executed with ``Rscript``; its stderr goes to
    "<prefix>.warnings.txt".

    Exit codes: 101/102/103 when -i/-g/-o is missing, 104/105 when the data
    or group file does not exist, 3 when a sample of the group file is absent
    from the data file header, 1 for an unknown variable type, inconsistent
    counts, or when Rscript cannot be started.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--input_file", action="store", type="string",
        dest="input_file",
        help="Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (.gz, .bz2)")
    parser.add_option(
        "-g", "--group", action="store", type="string", dest="group_file",
        help="Group file defining the biological groups of each sample as well as other covariables such as gender, age. The first variable is grouping variable (must be categorical), all the other variables are considered as covariates (can be categorical or continuous). Sample IDs should match to the \"Data file\".")
    parser.add_option(
        "-o", "--output", action="store", type='string', dest="out_file",
        help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.group_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)
    if not os.path.isfile(options.input_file):
        print("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.group_file):
        # report the group file here (the data file name was printed before)
        print("Input group file \"%s\" does not exist\n" % options.group_file)
        sys.exit(105)

    rscript_file = options.out_file + '.r'
    results_file = options.out_file + '.results.txt'
    count_pattern = re.compile(r'(\d+)\s*\,\s*(\d+)')

    # The context manager flushes and closes the script on every sys.exit()
    # path and before Rscript is started.
    with open(rscript_file, 'w') as ROUT:
        print('library("aod")', file=ROUT)

        printlog("Read group file \"%s\" ..." % (options.group_file))
        (samples, cv_names, cvs, v_types) = read_grp_file2(options.group_file)
        for cv_name in cv_names:
            print("%s: %s" % (cv_name, v_types[cv_name]))
            for sample in samples:
                print('\t%s\t%s' % (sample, cvs[cv_name][sample]))

        # bbr1 writes the result file with a header row; bbr2 appends rows.
        _write_bbr_function(
            ROUT, 'bbr1', cv_names,
            '\twrite.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=c("ID",paste(names, "coef",sep="."), paste(names, "pval",sep=".")))' % results_file)
        _write_bbr_function(
            ROUT, 'bbr2', cv_names,
            '\twrite.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1), quote=FALSE, row.names=FALSE, sep="\\t", col.names=FALSE, append=TRUE)' % results_file)

        printlog("Processing file \"%s\" ..." % (options.input_file))
        sample_IDs = None   # filled from the first non-blank line (header)
        first_row = True    # the first data row must go through bbr1
        for l in ireader.reader(options.input_file):
            f = l.split()
            if not f:
                # Blank lines no longer advance the header / first-row
                # bookkeeping, so a blank line cannot prevent bbr1 (and thus
                # the result header) from being emitted.
                continue
            if sample_IDs is None:
                sample_IDs = f[1:]
                # every sample of the group file must be a data column
                for s in samples:
                    if s not in sample_IDs:
                        printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                        sys.exit(3)
                # one R vector per variable, ordered like the data columns
                for cv_name in cv_names:
                    values = ','.join([str(cvs[cv_name][s]) for s in sample_IDs])
                    if v_types[cv_name] == 'continuous':
                        print(cv_name + ' <- c(%s)' % values, file=ROUT)
                    elif v_types[cv_name] == 'categorical':
                        print(cv_name + ' <- as.factor(c(%s))' % values, file=ROUT)
                    else:
                        printlog("unknown vaiable type!")
                        sys.exit(1)
                print('\n', file=ROUT)
                continue

            methyl_reads = []  # c
            total_reads = []   # n
            cg_id = f[0]
            for cell in f[1:]:
                m = count_pattern.match(cell)
                if m is None:
                    # not "count,total": passed to R as missing
                    methyl_reads.append("NaN")
                    total_reads.append("NaN")
                    continue
                c = int(m.group(1))
                n = int(m.group(2))
                if n >= c and n > 0:
                    methyl_reads.append(c)
                    total_reads.append(n)
                else:
                    printlog("Incorrect data format!")
                    print(f)
                    sys.exit(1)

            r_func = 'bbr1' if first_row else 'bbr2'
            first_row = False
            print('%s(\"%s\", c(%s), c(%s), %s)' % (
                r_func, cg_id,
                ','.join([str(read) for read in methyl_reads]),
                ','.join([str(read) for read in total_reads]),
                ','.join(cv_names)), file=ROUT)

    try:
        printlog("Runing Rscript file \"%s\" ..." % rscript_file)
        # An argument list (no shell) keeps unusual prefixes intact and lets
        # a missing Rscript binary raise OSError, which the shell form hid.
        with open(options.out_file + '.warnings.txt', 'w') as warnings_out:
            subprocess.call(["Rscript", rscript_file], stderr=warnings_out)
    except OSError:
        print("Error: cannot run Rscript: \"%s\"" % rscript_file, file=sys.stderr)
        sys.exit(1)
def main():
    """Differential methylation test (t-test / ANOVA) on a beta-value table.

    Samples are assigned to groups through the group file.  With two groups
    each CpG row is tested with ``paired_ttest`` (-p) or ``standard_ttest``
    (optionally Welch's variant with -w); with three or more groups with
    ``anova``.  Non-numerical cells become NaN.  The p-values are passed
    through ``padjust.multiple_testing_correction`` and the input rows are
    written to "<prefix>.pval.txt" with pval and adj.pval columns appended.

    Exit codes: 101/102/103 when -i/-g/-o is missing, 1 when fewer than two
    groups are defined, 2 when a paired test is requested for groups of
    different size, 3 when a sample of the group file is absent from the
    data file header.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--input-file", action="store", type="string",
        dest="input_file",
        help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). Except for the 1st row and 1st column, any non-numerical values will be considered as \"missing values\" and ignored. This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
    parser.add_option(
        "-g", "--group", action="store", type="string", dest="group_file",
        help="Group file defining the biological group of each sample. It is a comma-separated 2 columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs. It must have a header row. Sample IDs should match to the \"Data file\". Note: automatically switch to use ANOVA if more than 2 groups were defined in this file.")
    parser.add_option(
        "-p", "--paired", action="store_true", default=False, dest="paired",
        help="If '-p/--paired' flag was specified, use paired t-test which requires the equal number of samples in both groups. Paired sampels are matched by the order. This option will be ignored for multiple group analysis.")
    parser.add_option(
        "-w", "--welch", action="store_true", default=False,
        dest="welch_ttest",
        help="If '-w/--welch' flag was specified, using Welch's t-test which does not assume the two samples have equal variance. If omitted, use standard two-sample t-test (i.e. assuming the two samples have equal variance). This option will be ignored for paired t-test and multiple group analysis.")
    parser.add_option(
        "-o", "--output", action="store", type='string', dest="out_file",
        help="Prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.group_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    # The context manager closes the output on every sys.exit() path below.
    with open(options.out_file + '.pval.txt', 'w') as FOUT:
        printlog("Read group file \"%s\" ..." % (options.group_file))
        (ss, gs) = read_grp_file1(options.group_file)
        s2g = {}
        g2s = collections.defaultdict(list)
        for s, g in zip(ss, gs):
            s2g[s] = g
            g2s[g].append(s)

        group_IDs = sorted(g2s.keys())
        for g in group_IDs:
            print("\tGroup %s has %d samples:" % (g, len(g2s[g])))
            print('\t\t' + ','.join(g2s[g]))

        if len(group_IDs) < 2:
            # NOTE(review): this is the only printlog call passing file=;
            # confirm printlog accepts that keyword.
            printlog("You must have at least two groups!", file=sys.stderr)
            sys.exit(1)
        elif len(group_IDs) == 2 and options.paired is True:
            printlog("Perfrom paired t-test of two related samples ...")
            if len(g2s[group_IDs[0]]) != len(g2s[group_IDs[1]]):
                printlog("Unequal sample size. Cannot perform paired t-test.")
                sys.exit(2)
        elif len(group_IDs) == 2 and options.paired is False:
            printlog("Perfrom standard t-test of two independent samples ...")
        elif len(group_IDs) >= 3:
            printlog("Perfrom ANOVA ...")

        sample_IDs = None  # filled from the first non-blank line (header)
        probe_list = []
        p_list = []
        for l in ireader.reader(options.input_file):
            f = l.split()
            if not f:
                # blank line: nothing to test, and f[0] would raise
                continue
            if sample_IDs is None:
                sample_IDs = f[1:]
                # every sample of the group file must be a data column
                for s in s2g:
                    if s not in sample_IDs:
                        printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                        sys.exit(3)
                continue

            g2values = collections.defaultdict(list)
            probe_ID = f[0]
            for s, b in zip(sample_IDs, f[1:]):
                # skip if s not in group file
                if s not in s2g:
                    continue
                # deal with non-numerical values
                try:
                    value = float(b)
                except ValueError:
                    value = np.nan
                g2values[s2g[s]].append(value)

            # Reset per row so that a row providing values for fewer than
            # two groups cannot inherit the previous row's p-value.
            pval = np.nan
            if len(g2values) == 2:
                a = np.array(g2values[group_IDs[0]])
                b = np.array(g2values[group_IDs[1]])
                if options.paired:
                    (pval, tscore) = paired_ttest(a, b)
                else:
                    # Per the -w help text, Welch's test is the one that does
                    # NOT assume equal variances, so the flag is negated.
                    # NOTE(review): assumes standard_ttest's equalVar keyword
                    # means "assume equal variances" - confirm in the helper.
                    (pval, tscore) = standard_ttest(
                        a, b, equalVar=not options.welch_ttest)
            elif len(g2values) >= 3:
                (pval, tscore) = anova(
                    *[np.array(g2values[g]) for g in group_IDs])

            probe_list.append(probe_ID)
            p_list.append(pval)

        printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
        q_list = padjust.multiple_testing_correction(p_list)
        adjusted_p = {}
        for probe, p, q in zip(probe_list, p_list, q_list):
            adjusted_p[probe] = '\t'.join([str(i) for i in (p, q)])

        printlog("Writing to %s" % (options.out_file + '.pval.txt'))
        header_written = False
        for l in ireader.reader(options.input_file):
            f = l.split()
            if not f:
                continue
            if not header_written:
                print(l + '\tpval\tadj.pval', file=FOUT)
                header_written = True
            else:
                print(l + '\t' + adjusted_p[f[0]], file=FOUT)
def geteExtendedDomains(basal_ranges, bedfile, up_ext=2000, down_ext=2000, min_gene = 200, printit = False):
    '''
    Define gene's extended regulatory domain.

    Each gene is extended by up_ext / down_ext relative to its strand. The
    extension on either side is then trimmed so that it stops at any basal
    domain found in basal_ranges, and never reaches into the gene itself.
    Genes shorter than min_gene, or whose trimmed extension on either side is
    shorter than min_gene, are dropped.

    basal_ranges: mapping of chromosome name to an object whose
        find(start, end) returns intervals with .start and .end attributes.
        Chromosomes missing from the mapping are treated as having no basal
        domain.
    bedfile: one gene one TSS (could use the canonical (longest) isoform, or
        merge all isoforms into a super transcript). At least 6 columns;
        malformed lines are reported on stderr and skipped.
    up_ext: Size of extension to upstream. Should be multiples of 100
    down_ext: Size of extension to downstream. Should be multiples of 100
    min_gene: minimum gene size (from TSS to TES). Should be multiples of 100
    printit: if True, also print one BED12-like line per kept gene to stdout.

    Returns a list of tuples
        ([chrom, extension_st, start, symbol],
         [chrom, start, end, symbol],
         [chrom, end, extension_end, symbol],
         strand)
    '''
    return_ranges = []
    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            strand = f[5]
        except (IndexError, ValueError):
            print ("BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
            # really skip: falling through would reuse the previous line's
            # fields (or raise NameError on the first line)
            continue

        if start < 0:
            continue
        if start > end:
            print ("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
            continue
        if (end - start) < min_gene:
            continue
        if strand not in ['+', '-']:
            print ("Unknown strand. Skip: " + l, file=sys.stderr)
            continue

        if strand == '+':
            extension_st = start - up_ext
            extension_end = end + down_ext
        else:
            extension_st = start - down_ext
            extension_end = end + up_ext
        if extension_st < 0:
            extension_st = 0

        basal = basal_ranges[chrom] if chrom in basal_ranges else None

        # Left side: start after the right-most basal domain found in the
        # extension window, but never inside the gene.
        left_hits = basal.find(extension_st, start) if basal is not None else []
        extension_st = max([extension_st] + [o.end for o in left_hits])
        if extension_st > start:
            extension_st = start
        if (start - extension_st) < min_gene:
            continue

        # Right side: stop before the left-most basal domain found in the
        # extension window, but never inside the gene.
        right_hits = basal.find(end, extension_end) if basal is not None else []
        extension_end = min([extension_end] + [o.start for o in right_hits])
        if extension_end < end:
            extension_end = end
        if (extension_end - end) < min_gene:
            continue

        return_ranges.append(([chrom, extension_st, start, symbol],
                              [chrom, start, end, symbol],
                              [chrom, end, extension_end, symbol],
                              strand))
        if printit:
            print('\t'.join([str(i) for i in (chrom, extension_st, extension_end, symbol, '0', strand, start, end, '255,0,0', 1, extension_end - extension_st, 0)]), file = sys.stdout)
    return return_ranges