Exemplo n.º 1
0
def buildIntervalTree(bed_file):
    """Load a BED file into one interval index per chromosome.

    Lines starting with "track", "#", "browser" or "Chrom" are ignored, as
    are lines with fewer than 4 whitespace-separated fields and records with
    a negative coordinate or with start > end.

    Returns a dict mapping chromosome name to an ``Intersecter``; each stored
    interval carries the 4th BED column as its ``value``.
    """
    skip_prefixes = ("track", "#", "browser", "Chrom")
    trees = {}
    printlog("reading " + bed_file + '...')
    for record in ireader.reader(bed_file):
        if record.startswith(skip_prefixes):
            continue
        cols = record.rstrip('\n ').split()
        if len(cols) < 4:
            continue
        chrom, score = cols[0], cols[3]
        start = int(cols[1])
        end = int(cols[2])
        if start < 0 or end < 0 or start > end:
            continue
        # Create the per-chromosome index lazily, then insert in one place.
        if chrom not in trees:
            trees[chrom] = Intersecter()
        trees[chrom].add_interval(Interval(start, end, value=score))
    return trees
Exemplo n.º 2
0
def chrom_count(infile):
    """Count valid BED records per chromosome.

    Comment, "track" and "browser" lines are ignored. A line is reported on
    stderr and skipped when it has fewer than 3 fields, when its start/end
    columns are not integers, or when start > end.

    Returns a ``collections.defaultdict(int)`` mapping chromosome name to the
    number of accepted records.
    """
    counts = collections.defaultdict(int)
    for l in ireader.reader(infile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        if len(f) < 3:
            print("BED has at lesat 3 columns. Skip: " + l, file=sys.stderr)
            continue
        # Only the integer conversions can fail here; catch exactly that
        # instead of a bare except that would also hide KeyboardInterrupt.
        try:
            start = int(f[1])
            end = int(f[2])
        except ValueError:
            print("Not in valid BED format. Skip:" + l, file=sys.stderr)
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l,
                  file=sys.stderr)
            continue

        counts[f[0]] += 1
    return counts
Exemplo n.º 3
0
def read_bed_as_list(bedfile):
    """Read the first three BED columns of every valid record.

    Comment, "track" and "browser" lines are ignored. Lines with fewer than
    3 fields, non-integer coordinates, or start > end are reported on stderr
    and skipped.

    Returns a list of ``[chrom, start, end]`` lists (start/end as ints).
    """
    regions = []
    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()

        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
        except (IndexError, ValueError):
            print("BED has at lesat 3 columns. Skip: " + l, file=sys.stderr)
            # Without this `continue` the malformed line would fall through
            # and append values left over from the previous record.
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l,
                  file=sys.stderr)
            continue
        regions.append([chrom, start, end])
    return regions
Exemplo n.º 4
0
def main():
    """Command-line entry point: summarise CpG values over BED regions.

    Reads the CpG BED file (-i) via ``read_CpG_bed``, then for every record
    of the region BED file (-r) appends the values returned by
    ``stats_over_range`` and writes the line to ``<out_file>.txt``. Region
    lines whose coordinates are not integers are written with six "NA"
    columns. Exits with 101/102/103 when -i/-r/-o is missing.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file", help="BED6 file specifying the C position. This BED file should have at least 6 columns (Chrom, ChromStart, ChromeEnd, Name, Beta_value, Strand).  Note: the first base in a chromosome is numbered 0. This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
    parser.add_option("-r", "--region", action="store", type="string", dest="region_file", help="BED file specificy genomic regions. This BED file should have at least 3 columns (Chrom, ChromStart, ChromeEnd).")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="Prefix of output file.")
    (options, args) = parser.parse_args()

    print()

    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.region_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    # The context manager guarantees the output file is closed even when
    # reading either input fails part-way through.
    with open(options.out_file + '.txt', 'w') as FOUT:
        # step1: read CpG file
        printlog("Reading CpG file: \"%s\"" % (options.input_file))
        cpg_ranges = read_CpG_bed(options.input_file)

        # step2: read region file
        printlog("Reading BED file: \"%s\"" % (options.region_file))

        printlog("Writing to: \"%s\"" % (options.out_file + '.txt'))
        for l in ireader.reader(options.region_file):
            if l.startswith(('#', 'track', 'browser')):
                continue
            f = l.split()
            if len(f) < 3:
                continue
            try:
                chrom = f[0]
                st = int(f[1])
                end = int(f[2])
            except ValueError:
                # `file=` belongs to print(), not to str.join(); the original
                # parenthesisation raised TypeError on this path.
                print(l + '\t' + '\t'.join(['NA'] * 6), file=FOUT)
                continue
            tmp = stats_over_range(cpg_ranges, chrom, st, end)
            print(l + '\t' + '\t'.join([str(i) for i in tmp]), file=FOUT)
Exemplo n.º 5
0
def main():
    """Command-line entry point: convert between Beta- and M-values.

    The first line of the input table is copied unchanged. On every other
    line the first field is kept as the probe ID and each remaining field is
    converted: Beta -> M as log2(b / (1 - b)), M -> Beta as 2**m / (2**m + 1).
    Fields that cannot be converted are written as NaN.

    Exits with 101 when -i (or -d) is empty, 103 when -o is missing, and 102
    when the data type is neither "Beta" nor "M".
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Tab-separated data frame file containing beta or M values with the 1st row containing sample IDs and the 1st column containing CpG IDs. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-d", "--dtype", action="store", type='string', dest="data_type", default="Beta", help="Input data type either \"Beta\" or \"M\". default=%default")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="The output file.")
    (options, args) = parser.parse_args()

    print()

    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.data_type):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    # Validate the data type BEFORE touching the output file, and signal the
    # failure with a non-zero status (the original exited with 0 after having
    # already created an empty output file).
    data_type = options.data_type.lower()
    if data_type == "beta":
        printlog("Convert Beta-value file \"%s\" into M-value file \"%s\" ..." % (options.input_file, options.out_file))
    elif data_type == "m":
        printlog("Convert M-value file \"%s\" into Beta-value file \"%s\" ..." % (options.input_file, options.out_file))
    else:
        print("Data type must be \"Beta\" or \"M\"", file=sys.stderr)
        sys.exit(102)

    with open(options.out_file, 'w') as FOUT:
        for line_num, l in enumerate(ireader.reader(options.input_file), 1):
            if line_num == 1:
                # Header row is passed through untouched.
                print(l, file=FOUT)
                continue
            f = l.split()
            if not f:
                # Blank line: nothing to convert (f[0] would raise).
                continue
            probe_ID = f[0]
            output_values = []
            for iv in f[1:]:
                # Non-numeric cells, Beta == 1 (division by zero) and
                # M-values too large for 2**m all become NaN.
                try:
                    value = float(iv)
                    if data_type == "beta":
                        ov = np.log2(value / (1.0 - value))
                    else:
                        ov = (2 ** value) / (2 ** value + 1)
                except (ValueError, ZeroDivisionError, OverflowError):
                    ov = np.nan
                output_values.append(ov)
            print(probe_ID + '\t' + '\t'.join([str(i) for i in output_values]), file=FOUT)
Exemplo n.º 6
0
def getBasalDomains(bedfile, up=5000, down=1000, printit=False):
    """Build per-chromosome interval trees of basal regulatory domains.

    Each BED record is expected to describe one gene (columns used: chrom,
    start, end, name, strand in the 6th column). The domain is anchored at
    ``start + 1`` for '+' genes and at ``end`` for '-' genes and extends
    ``up`` bases upstream and ``down`` bases downstream; its start is clamped
    at 0.

    bedfile: path of the BED file.
    up: size of the extension upstream of the anchor.
    down: size of the extension downstream of the anchor.
    printit: when true, also print every domain as a BED6 line to stdout.

    Lines that are too short, have non-integer coordinates, have start > end,
    or carry a strand other than '+'/'-' are reported on stderr and skipped.

    Returns a dict mapping chromosome name to an ``IntervalTree``.
    """
    basal_ranges = {}

    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            strand = f[5]
        except (IndexError, ValueError):
            print("BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l,
                  file=sys.stderr)
            continue

        if strand == '+':
            basal_st = max(0, (start + 1) - up)
            basal_end = (start + 1) + down
        elif strand == '-':
            basal_st = max(0, end - down)
            basal_end = end + up
        else:
            # Previously such a record inserted nothing but still reached the
            # `printit` block, printing coordinates of the previous gene (or
            # raising NameError on the first record).
            print("Invalid strand. Skip: " + l, file=sys.stderr)
            continue

        if chrom not in basal_ranges:
            basal_ranges[chrom] = IntervalTree()
        basal_ranges[chrom].insert_interval(
            Interval(basal_st, basal_end, strand=strand, value=symbol))

        if printit:
            print('\t'.join([
                str(i)
                for i in (chrom, basal_st, basal_end, symbol, '0', strand)
            ]),
                  file=sys.stdout)
    return basal_ranges
Exemplo n.º 7
0
def main():
    """Command-line entry point: append probe annotation to a data table.

    Loads the annotation table with ``read_annotation`` and writes every line
    of the input file to ``<out_file>.anno.txt`` followed by the annotation
    of the probe ID found in column ``--probe_column``. Probes without
    annotation get one "NA" per annotation column. With ``--header`` the
    first input line is extended with the annotation header instead.

    Exit codes: 101/102/103 missing -i/-a/-o, 104/105 input/annotation file
    does not exist, 106 probe column index out of range.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Input data file (Tab-separated) with a certain column containing 450K/850K array CpG IDs. This file can be regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-a", "--annotation", action="store", type="string", dest="anno_file", help="Annotation file. This file can be regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="Prefix of the output file.")
    parser.add_option("-p", "--probe_column", action="store", type='int', dest="probe_col", default=0, help="The number of column that contains probe IDs. Note: the column index starts with 0. default=%default.")
    parser.add_option("-l", "--header", action="store_true", dest="header", default=False, help="Input data file has a header row.")
    (options, args) = parser.parse_args()

    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.anno_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    if not os.path.isfile(options.input_file):
        print("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.anno_file):
        # Report the annotation path (the original printed the input path).
        print("Input annotation file \"%s\" does not exist\n" % options.anno_file)
        sys.exit(105)

    printlog("Read annotation file \"%s\" ..." % (options.anno_file))
    (header, data) = read_annotation(options.anno_file)

    with open(options.out_file + '.anno.txt', 'w') as OUT:
        printlog("Add annotation information to \"%s\" ..." % (options.input_file))
        for line_num, l in enumerate(ireader.reader(options.input_file), 1):
            f = l.split()
            if line_num == 1 and options.header:
                print(l + '\t' + '\t'.join(header), file=OUT)
                continue
            if options.probe_col >= len(f):
                print("Error: column ID must be smaller than %d!" % len(f), file=sys.stderr)
                # An error must not exit with status 0.
                sys.exit(106)
            cgid = f[options.probe_col]
            try:
                print(l + '\t' + data[cgid], file=OUT)
            except KeyError:
                # Probe has no annotation entry.
                print(l + '\t' + '\t'.join(['NA'] * len(header)), file=OUT)
Exemplo n.º 8
0
def read_annotation(infile):
    """Read a probe annotation table.

    A line starting with 'probeID' supplies the header: its fields after the
    first become the returned column names. For every other line the first
    field is the probe ID and the remaining fields, re-joined with tabs, are
    stored as that probe's annotation string.

    Returns ``(header_columns, {probe_id: annotation_string})``.
    """
    header_cols = []
    annotations = {}
    for row in ireader.reader(infile):
        cols = row.split()
        if row.startswith('probeID'):
            header_cols = cols[1:]
            continue
        annotations[cols[0]] = '\t'.join(cols[1:])
    return (header_cols, annotations)
Exemplo n.º 9
0
def read_grp_file2(gfile):
    """Read a comma-separated group/covariate file.

    Layout expected by this parser:
      * the first line is a header; its fields after the first are taken as
        variable names (group variable first, then covariates);
      * every other line starts with a sample name followed by one value per
        variable;
      * spaces are removed from every line before splitting on ",".

    Example:

        sampleID,survival,Sex
        Normal_1,1,1
        Normal_2,1,2
        Tumor_1,2,1

    Returns ``(samples, covar_names, covars)`` where ``covars`` maps
    variable name -> {sample name -> value (str)}.

    Exits with status 1 when a line has fewer than 2 columns or when sample
    names are duplicated.
    """
    samples = []
    covar_names = []
    covars = collections.defaultdict(dict)
    for line_num, l in enumerate(ireader.reader(gfile), 1):
        f = l.replace(' ', '').split(',')
        if len(f) < 2:
            print("Group fle has at lesat 2 columns!", file=sys.stderr)
            sys.exit(1)
        if line_num == 1:
            covar_names = f[1:]
            continue
        sample_id = f[0]
        samples.append(sample_id)
        for name, value in zip(covar_names, f[1:]):
            covars[name][sample_id] = value

    # Comparing sizes is safe for an empty sample list, where
    # Counter.most_common(1)[0] raised IndexError.
    if len(set(samples)) != len(samples):
        print("Sample names are not unique!", file=sys.stderr)
        # Duplicate names are an error; do not report success (status 0).
        sys.exit(1)

    return (samples, covar_names, covars)
Exemplo n.º 10
0
def main():
    """Command-line entry point: annotate CpGs with overlapping regions.

    Builds an interval index from the annotation BED file (-a) with
    ``buildIntervalTree`` and, for every CpG line with at least 3 fields,
    appends the comma-joined result of ``findIntervals`` (or "N/A" when it is
    empty) and writes the line to ``<out_file>.anno.txt``. With ``--header``
    the first accepted line is treated as a header and extended with the
    annotation file's base name.

    Exits with 101/102/103 when -i/-o/-a is missing.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Input CpG file in BED3+ format.")
    parser.add_option("-a", "--annotation", action="store", type="string", dest="anno_file", help="Input annotation file in BED3+ format.")
    parser.add_option("-w", "--window", action="store", type='int', dest="window_size", default=100, help="Size of window centering on the middle-point of each genomic region defined in the annotation BED file (i.e., window_size*0.5 will be extended to up- and down-stream from the middle point of each genomic region). default=%default")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="The prefix of the output file.")
    parser.add_option("-l", "--header", action="store_true", dest="header", default=False, help="If True, the first row of input CpG file is header. default=%default")
    (options, args) = parser.parse_args()

    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.anno_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)
    tree = buildIntervalTree(options.anno_file, window_size=options.window_size)

    # `with` closes the output file even if a malformed CpG line raises.
    with open(options.out_file + '.anno.txt', 'w') as OUT:
        line_num = 0
        printlog("Reading CpG file: %s ..." % options.input_file)
        for line in ireader.reader(options.input_file):
            f = line.split()
            if len(f) < 3:
                continue
            # Only lines with at least 3 fields are counted, so the header is
            # the first such line.
            line_num += 1
            if line_num == 1 and options.header:
                print(line + '\t' + basename(options.anno_file), file=OUT)
                continue
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            overlaps = findIntervals(chrom, start, end, tree)
            if len(overlaps) > 0:
                print(line + '\t' + ','.join(overlaps), file=OUT)
            else:
                print(line + '\tN/A', file=OUT)
Exemplo n.º 11
0
def buildIntervalTree(bed_file, window_size=0):
    """Build one interval index per chromosome from an annotation BED file.

    bed_file: BED file with at least 3 columns. "track", "#" and "browser"
        lines and lines with fewer than 3 fields are ignored.
    window_size: when > 0, each region is replaced by a window of this total
        size centred on the region's middle point and clipped so that it
        never extends beyond the original region. When 0 (default) the
        region is stored as given.

    Records with a negative coordinate or with start > end are skipped.
    Each stored interval carries the 4th column as its ``value``; when there
    is no 4th column the value is "chrom:start-end" of the original region.

    Returns a dict mapping chromosome name to an ``Intersecter``.
    """
    ranges = {}
    printlog("Build interval tree from annotation file: %s ..." % bed_file)
    for line in ireader.reader(bed_file):
        if line.startswith(("track", "#", "browser")):
            continue
        fields = line.rstrip('\n ').split()
        if len(fields) < 3:
            continue
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])

        # Validate regardless of window_size (previously only checked when a
        # window was requested).
        if start < 0 or end < 0 or start > end:
            continue

        if window_size > 0:
            mid = int(start + (end - start) / 2.0)
            extension = int(window_size * 0.5)
            # Clip the window to the region it was derived from.
            iv_start = max(start, mid - extension)
            iv_end = min(end, mid + extension)
        else:
            iv_start, iv_end = start, end

        if len(fields) >= 4:
            name = fields[3]
        else:
            name = fields[0] + ':' + fields[1] + '-' + fields[2]

        if chrom not in ranges:
            ranges[chrom] = Intersecter()
        # The original computed the window bounds and then inserted the
        # unmodified (start, end), so window_size had no effect.
        ranges[chrom].add_interval(Interval(iv_start, iv_end, value=name))
    return ranges
Exemplo n.º 12
0
def read_CpG_bed(cpgfile):
    """Load a CpG BED file into per-chromosome interval trees.

    cpgfile: BED file with at least 3 columns (chrom, start, end).
      * The beta value is read from the 5th column; when that column is
        missing or not a number, beta is set to 1.0.
      * The strand is read from the 6th column; when missing, '+' is assumed.
      * '+' records are stored as (start, end); '-' records are stored as
        (end, end + 1). Records with any other strand are not stored.

    Lines with fewer than 3 fields, non-integer coordinates, or start > end
    are reported on stderr and skipped.

    Returns a dict mapping chromosome name to an ``IntervalTree`` whose
    intervals carry the beta value as ``value``.
    """
    cpg_ranges = {}
    for l in ireader.reader(cpgfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        if len(f) < 3:
            # The check is for 3 columns; the old message claimed 6.
            print("BED has at least 3 columns. Skip: " + l, file=sys.stderr)
            continue

        chrom = f[0]
        try:
            start = int(f[1])
            end = int(f[2])
        except ValueError:
            # e.g. a column-header line; skip instead of aborting the load.
            print("Not in valid BED format. Skip:" + l, file=sys.stderr)
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l,
                  file=sys.stderr)
            continue

        try:
            beta = float(f[4])
        except (IndexError, ValueError):
            beta = 1.0
        strand = f[5] if len(f) > 5 else '+'

        if chrom not in cpg_ranges:
            cpg_ranges[chrom] = IntervalTree()
        if strand == '+':
            cpg_ranges[chrom].insert_interval(Interval(start, end, value=beta))
        elif strand == '-':
            cpg_ranges[chrom].insert_interval(
                Interval(end, end + 1, value=beta))

    return cpg_ranges
Exemplo n.º 13
0
def getBasalDomains(bedfile, printit=False):
    """Load gene regions from a BED file into per-chromosome interval trees.

    bedfile: BED file with at least 6 columns; chrom, start, end, name (4th
        column) and strand (6th column) are used.
    printit: when true, also print every stored region as a BED6 line to
        stdout.

    Lines that are too short, have non-integer coordinates, have start > end,
    or carry a strand other than '+'/'-' are reported on stderr and skipped.

    Returns a dict mapping chromosome name to an ``IntervalTree``; each
    interval is stored as (start, end) with the gene strand and the name as
    ``value``.
    """
    basal_ranges = {}

    for l in ireader.reader(bedfile):
        if l.startswith(('#', 'track', 'browser')):
            continue
        f = l.split()
        try:
            chrom = f[0]
            start = int(f[1])
            end = int(f[2])
            symbol = f[3]
            gene_strand = f[5]
        except (IndexError, ValueError):
            print("BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
            continue
        if start > end:
            print("'Start' cannot be larger than 'End'. Skip: " + l,
                  file=sys.stderr)
            continue
        if gene_strand not in ['+', '-']:
            print("Invalid strand. Skip: " + l, file=sys.stderr)
            continue

        if chrom not in basal_ranges:
            basal_ranges[chrom] = IntervalTree()

        basal_ranges[chrom].insert_interval(
            Interval(start, end, strand=gene_strand, value=symbol))

        if printit:
            # Print exactly what was stored. The original referenced
            # basal_st / basal_end / strand, none of which exist in this
            # function, so printit=True raised NameError.
            print('\t'.join([
                str(i)
                for i in (chrom, start, end, symbol, '0', gene_strand)
            ]),
                  file=sys.stdout)
    return basal_ranges
Exemplo n.º 14
0
    def getCDSExons(self, uniquify=True, stranded=True):
        """Return the coding part of every exon block in a BED-12 file.

        Each block is clipped to [cdsStart, cdsEnd]; blocks that end before
        cdsStart or start after cdsEnd are dropped.

        uniquify: when true, duplicates are removed (result order is then
            unspecified because it goes through a set).
        stranded: when true, tuples are (chrom, start, end, strand);
            otherwise (chrom, start, end).

        Prints an error and exits with status 1 on a line with fewer than 12
        fields.
        """
        cds_blocks = []
        for row in ireader.reader(self.f):
            row = row.strip()
            if row.startswith(('#', 'track', 'browser')):
                continue
            cols = row.split()
            if len(cols) < 12:
                print(
                    "\nInput error!\nStandard BED format has 12 columns.\n%s" %
                    (BED12),
                    file=sys.stderr)
                sys.exit(1)

            chrom, strand = cols[0], cols[5]
            tx_start = int(cols[1])
            cds_start = int(cols[6])
            cds_end = int(cols[7])
            # Parsed only so that a non-integer block count still raises.
            _block_count = int(cols[9])
            sizes = [int(s) for s in cols[10].strip(',').split(',')]
            starts = [tx_start + int(s) for s in cols[11].strip(',').split(',')]

            for block_start, size in zip(starts, sizes):
                block_end = block_start + size
                # NOTE(review): a block that merely touches the CDS boundary
                # passes this test and yields a zero-length tuple.
                if block_end < cds_start or block_start > cds_end:
                    continue
                clipped = (chrom, max(block_start, cds_start),
                           min(block_end, cds_end))
                if stranded:
                    clipped += (strand,)
                cds_blocks.append(clipped)
        return list(set(cds_blocks)) if uniquify else cds_blocks
Exemplo n.º 15
0
def read_chromSize(infile):
    """Read a chromosome-size table (whitespace separated, two columns).

    Example input:

        chr1    249250621
        chr2    243199373

    Lines starting with '#' and lines that do not have exactly two fields are
    ignored.

    Returns ``(names, sizes)``: two parallel lists, sizes as ints.
    """
    names, sizes = [], []
    for row in ireader.reader(infile):
        if row.startswith('#'):
            continue
        cols = row.split()
        if len(cols) == 2:
            chrom, length = cols
            names.append(chrom)
            sizes.append(int(length))
    return (names, sizes)
Exemplo n.º 16
0
def load_data(infile):
    """Load a whitespace-separated Beta-value matrix column by column.

    Expected layout:
      * first row: a label followed by the sample IDs;
      * every other row: a probe ID followed by one numeric value per sample.

    Example:

        Probe         sample_1  sample_2  sample_3
        cg09835024    0.0547    0.1187    0.0625
        cg25813447    0.428     0.3746    0.0666

    Rows whose number of values differs from the number of sample IDs are
    reported on stderr and skipped entirely. Blank rows are ignored.

    Returns a ``collections.defaultdict(list)`` with the key 'probeID'
    (list of probe IDs) and one key per sample ID (list of floats), all lists
    being parallel.

    Raises ValueError when a cell cannot be converted to float.
    """
    printlog("Reading " + infile + " ...")
    beta_values = collections.defaultdict(list)
    sample_IDs = []
    probe_count = 0
    for line_num, l in enumerate(ireader.reader(infile), 1):
        f = l.split()
        if line_num == 1:
            sample_IDs = f[1:]
            continue
        if not f:
            continue
        tmp = list(map(float, f[1:]))

        if len(tmp) != len(sample_IDs):
            # Python 3 form; the old `print >> sys.stderr, ...` statements
            # raised TypeError instead of printing.
            print("The number of columns of this row does NOT match with header row's",
                  file=sys.stderr)
            print(l, file=sys.stderr)
            continue
        # Record the probe ID only once the row is accepted so that the
        # 'probeID' list stays aligned with the per-sample lists.
        beta_values['probeID'].append(f[0])
        for sid, beta in zip(sample_IDs, tmp):
            beta_values[sid].append(beta)
        probe_count += 1
    print("\tTotal samples: %d" % len(sample_IDs), file=sys.stderr)
    print("\tTotal probes: %d" % probe_count, file=sys.stderr)
    return beta_values
Exemplo n.º 17
0
def read_grp_file1(gfile):
    """Read a two-column, comma-separated group file.

    Layout expected by this parser:
      * the first line is a header and is skipped;
      * every other line is "sample_name,group_id" (extra columns are
        ignored);
      * spaces are removed from every line before splitting on ",".

    Example:

        sampleID,groupID
        Normal_1,1
        Normal_2,1
        Tumor_1,2

    Returns ``(samples, groups)``: two parallel lists of strings.

    Exits with status 1 when a line has fewer than 2 columns or when sample
    names are duplicated.
    """
    samples = []
    groups = []
    for line_num, l in enumerate(ireader.reader(gfile), 1):
        f = l.replace(' ', '').split(',')
        if len(f) < 2:
            print("Group fle must have 2 columns!", file=sys.stderr)
            sys.exit(1)
        if line_num == 1:
            continue
        samples.append(f[0])
        groups.append(f[1])

    # Comparing sizes is safe for an empty sample list, where
    # Counter.most_common(1)[0] raised IndexError.
    if len(set(samples)) != len(samples):
        print("Sample names are not unique!", file=sys.stderr)
        # Duplicate names are an error; do not report success (status 0).
        sys.exit(1)

    return (samples, groups)
Exemplo n.º 18
0
    def getIntergenic(self,
                      direction='up',
                      size=2000,
                      uniquify=True,
                      stranded=True):
        """Return flanking regions of every transcript in the BED file.

        direction: 'up', 'down' or 'both'; any other value yields no regions.
            Upstream/downstream are relative to the transcript strand (6th
            column): for '-' records upstream lies after the end coordinate.
        size: length of the flank; regions before the start are clamped at 0.
        uniquify: when true, duplicates are removed (result order is then
            unspecified because it goes through a set).
        stranded: when true, tuples are (chrom, start, end, strand);
            otherwise (chrom, start, end).
        """
        want_up = direction in ("up", "both")
        want_down = direction in ("down", "both")

        reblocks = []
        for l in ireader.reader(self.f):
            l = l.strip()
            if l.startswith(('#', 'track', 'browser')):
                continue
            f = l.split()
            chrom = f[0]
            tx_start = int(f[1])
            tx_end = int(f[2])
            strand = f[5]

            before_start = (max(tx_start - size, 0), tx_start)
            after_end = (tx_end, tx_end + size)
            if strand == '-':
                up_region, down_region = after_end, before_start
            else:
                up_region, down_region = before_start, after_end

            for wanted, region in ((want_up, up_region),
                                   (want_down, down_region)):
                if not wanted:
                    continue
                # Honour `stranded` for both directions; the upstream branch
                # used to append the strand unconditionally.
                block = (chrom,) + region
                if stranded:
                    block += (strand,)
                reblocks.append(block)
        if uniquify:
            return list(set(reblocks))
        else:
            return reblocks
Exemplo n.º 19
0
    def getExons(self, uniquify=True, stranded=True):
        """Return every exon block (coding and UTR) of a BED-12 file.

        uniquify: when true, duplicates are removed (result order is then
            unspecified because it goes through a set).
        stranded: when true, tuples are (chrom, start, end, strand);
            otherwise (chrom, start, end).

        Prints an error and exits with status 1 on a line with fewer than 12
        fields.
        """
        exon_blocks = []
        for row in ireader.reader(self.f):
            row = row.strip()
            if row.startswith(('#', 'track', 'browser')):
                continue
            cols = row.split()
            if len(cols) < 12:
                print("Standard BED format has 12 columns.\n%s" % (BED),
                      file=sys.stderr)
                sys.exit(1)
            chrom, strand = cols[0], cols[5]
            tx_start = int(cols[1])
            # cdsStart, cdsEnd and blockCount are not needed here; they are
            # still parsed so that non-integer columns raise as before.
            for idx in (6, 7, 9):
                int(cols[idx])
            sizes = [int(s) for s in cols[10].strip(',').split(',')]
            starts = [tx_start + int(s) for s in cols[11].strip(',').split(',')]
            for exon_start, exon_len in zip(starts, sizes):
                exon = (chrom, exon_start, exon_start + exon_len)
                if stranded:
                    exon += (strand,)
                exon_blocks.append(exon)
        return list(set(exon_blocks)) if uniquify else exon_blocks
Exemplo n.º 20
0
def main():
    """Command-line entry point: per-region CpG summary with outlier filter.

    Builds an interval index from the CpG file (-i) with
    ``buildIntervalTree`` and, for every region of the BED file (-b), writes
    chrom/start/end followed by the values returned by ``findIntervals``
    (called with ``a=alpha``), or six "N/A" columns when it returns nothing.

    ``--alpha`` is reset to 0.05 when negative and capped at 1.
    Exits with 101/102/103 when -i/-b/-o is missing.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input_file",
        action="store",
        type="string",
        dest="input_file",
        help=
        "Input CpG file in BED format. The first 3 columns contain \"Chrom\", \"Start\", and \"End\". The 4th column contains proportion values."
    )
    parser.add_option(
        "-a",
        "--alpha",
        action="store",
        type='float',
        dest="alpha_cut",
        default=0.05,
        help=
        "The chance of mistakingly assign a particular CpG as an outlier for each genomic region. default=%default"
    )
    parser.add_option("-b",
                      "--bed",
                      action="store",
                      type="string",
                      dest="bed_file",
                      help="BED3+ file specifying the genomic regions.")
    parser.add_option("-o",
                      "--output",
                      action="store",
                      type='string',
                      dest="out_file",
                      help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)

    # The region file is required too; without this check a missing --bed
    # only failed later, when None was handed to the file reader.
    if not (options.bed_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    if options.alpha_cut < 0:
        options.alpha_cut = 0.05
    if options.alpha_cut > 1:
        options.alpha_cut = 1

    tree = buildIntervalTree(options.input_file)

    # `with` closes the output file even if processing a region raises.
    with open(options.out_file, 'w') as OUT:
        print(
            "#chrom\tstart\tend\tN_CpG_filtered\tN_methyl_filtered\tN_total_filtered\tN_CpG_ori\tN_methy_ori\tN_total_ori",
            file=OUT)
        for line in ireader.reader(options.bed_file):
            line = line.strip()
            if line.startswith(("track", "#", "browser", "Chrom")):
                continue

            f = line.split()
            if len(f) < 3:
                continue
            try:
                chrom = f[0]
                start = int(f[1])
                end = int(f[2])
            except ValueError:
                # Non-numeric coordinates: silently skip the region.
                continue

            a = findIntervals(chrom, start, end, tree, a=options.alpha_cut)
            if len(a) == 0:
                print('\t'.join(f[0:3]) + '\t' + '\t'.join(['N/A'] * 6),
                      file=OUT)
            else:
                print('\t'.join(f[0:3]) + '\t' + '\t'.join([str(i) for i in a]),
                      file=OUT)
Exemplo n.º 21
0
def main():
	"""Run a per-CpG logistic regression in R and FDR-adjust the p-values.

	Steps performed:
	  1. Parse and validate the command line.
	  2. Write an R script (<prefix>.r) defining lrf1()/lrf2(), which fit
	     glm(cbind(m, t - m) ~ covariates) and write coefficients and
	     p-values to <prefix>.results.txt, followed by one call per data row.
	  3. Run the script with Rscript (stderr goes to <prefix>.warnings.txt).
	  4. Collect the p-values of the first variable of the group file, adjust
	     them with padjust.multiple_testing_correction() and write the input
	     table plus 'pval' and 'adj.pval' columns to <prefix>.pval.txt.

	Exit codes: 101/102/103 for a missing -i/-g/-o option, 104/105 for a
	non-existent data/group file, 106 for an invalid -f value, 3 for a
	sample ID missing from the data file, 1 for malformed data, an unknown
	variable type, or a failure to run Rscript.
	"""
	usage="%prog [options]" + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
	parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological groups of each sample as well as other covariables such as gender, age. The first variable is grouping variable (must be categorical), all the other variables are considered as covariates (can be categorical or continuous). Sample IDs should match to the \"Data file\".")
	parser.add_option("-f","--family",action="store",type="int",dest="family_func",default=1, help="Error distribution and link function to be used in the GLM model. Can be integer 1 or 2 with 1 = \"quasibinomial\" and 2 = \"binomial\". Default=%default.")
	parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
	(options,args)=parser.parse_args()

	print ()
	if not (options.input_file):
		print (__doc__)
		parser.print_help()
		sys.exit(101)

	if not (options.group_file):
		print (__doc__)
		parser.print_help()
		sys.exit(102)

	if not (options.out_file):
		print (__doc__)
		parser.print_help()
		sys.exit(103)
	if not os.path.isfile(options.input_file):
		print ("Input data file \"%s\" does not exist\n" % options.input_file)
		sys.exit(104)
	if not os.path.isfile(options.group_file):
		# Report the group file here (the original printed the data file name).
		print ("Input group file \"%s\" does not exist\n" % options.group_file)
		sys.exit(105)

	# Validate '-f' before any output file is created.
	family = {1:'quasibinomial', 2:'binomial',}
	if options.family_func not in family:
		print ("Incorrect value of '-f'!")
		sys.exit(106)

	r_script = options.out_file + '.r'
	results_file = options.out_file + '.results.txt'
	warnings_file = options.out_file + '.warnings.txt'
	pval_file = options.out_file + '.pval.txt'

	printlog("Read group file \"%s\" ..." % (options.group_file))
	(samples,cv_names, cvs, v_types) = read_grp_file2(options.group_file)
	for cv_name in cv_names:
		print ("%s: %s" % (cv_name, v_types[cv_name]))
		for sample in samples:
			print ('\t' + sample + '\t' + cvs[cv_name][sample])

	# The first variable of the group file is the one whose p-value is adjusted.
	primary_variable = cv_names[0]

	# Matches "methyl_count,total_count"; compiled once instead of per cell.
	count_pattern = re.compile(r'(\d+)\s*\,\s*(\d+)')

	with open(r_script,'w') as ROUT:
		# lrf1 creates the results file and writes the column header.
		print ('lrf1 <- function (cgid, m,t,%s){' % ','.join(cv_names), file=ROUT)
		print ('try(fit <- glm(cbind(m,t - m) ~ %s, family=%s))' % ('+'.join(cv_names),family[options.family_func]), file=ROUT)
		print ('pvals <- coef(summary(fit))[,4]', file=ROUT)
		print ('coefs <- coef(summary(fit))[,1]', file=ROUT)
		print ('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
		print ('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
		print ( 'write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=c("ID",paste(gsub("2","",names(coefs)), "coef",sep="."), paste(gsub("2","",names(pvals)), "pval",sep=".")))' % (results_file),  file = ROUT)
		print ('}', file=ROUT)
		print ('\n', file=ROUT)

		# lrf2 appends one row without a header.
		print ('lrf2 <- function (cgid, m,t,%s){' % ','.join(cv_names), file=ROUT)
		print ('try(fit <- glm(cbind(m,t - m) ~ %s, family=%s))' % ('+'.join(cv_names),family[options.family_func]), file=ROUT)
		print ('pvals <- coef(summary(fit))[,4]', file=ROUT)
		print ('coefs <- coef(summary(fit))[,1]', file=ROUT)
		print ('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
		print ('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
		print ( 'write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=FALSE, append = TRUE)' % (results_file),  file = ROUT)
		print ('}', file=ROUT)
		print ('\n', file=ROUT)

		printlog("Processing file \"%s\" ..." % (options.input_file))
		line_num = 0
		# The first data row must go through lrf1 so that the header is always
		# written, even when physical line 2 of the input is blank.
		first_probe = True
		for l in ireader.reader(options.input_file):
			line_num += 1
			f = l.split()
			if len(f) == 0: continue
			if line_num == 1:
				sample_IDs = f[1:]
				# check if sample ID matches
				for s in samples:
					if s not in sample_IDs:
						printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
						sys.exit(3)
				# Define every covariate as an R vector ordered like the data columns.
				for cv_name in cv_names:
					if v_types[cv_name] == 'continuous':
						print (cv_name + ' <- c(%s)' % (','.join([str(cvs[cv_name][s]) for s in  sample_IDs  ])), file = ROUT)
					elif  v_types[cv_name] == 'categorical':
						print (cv_name + ' <- as.factor(c(%s))' % (','.join([str(cvs[cv_name][s]) for s in  sample_IDs  ])), file = ROUT)
					else:
						printlog("unknown vaiable type!")
						sys.exit(1)

				print ('\n', file=ROUT)
				continue

			methyl_reads = []	# c
			total_reads = []	# n
			cg_id = f[0]
			for i in f[1:]:
				m = count_pattern.match(i)
				if m is None:
					# Anything that is not "c,n" is passed to R as a missing value.
					methyl_reads.append("NaN")
					total_reads.append("NaN")
					continue
				c = int(m.group(1))
				n = int(m.group(2))
				if n >= c and n > 0:
					methyl_reads.append(c)
					total_reads.append(n)
				else:
					printlog("Incorrect data format!")
					print (f)
					sys.exit(1)
			r_func = 'lrf1' if first_probe else 'lrf2'
			first_probe = False
			print ('%s(\"%s\", c(%s), c(%s), %s)' % (r_func, cg_id, ','.join([str(read) for read in methyl_reads]), ','.join([str(read) for read in total_reads]), ','.join(cv_names)), file=ROUT)

	printlog("Runing Rscript file \"%s\" ..." % (r_script))
	try:
		# Argument-list form: a missing Rscript raises OSError (with shell=True
		# the failure was silent) and odd characters in the prefix are safe.
		with open(warnings_file, 'w') as WOUT:
			subprocess.call(["Rscript", r_script], stderr=WOUT)
	except OSError:
		print ("Error: cannot run Rscript: \"%s\"" % (r_script), file=sys.stderr)
		sys.exit(1)

	if not os.path.isfile(results_file):
		print ("Error: Rscript did not produce \"%s\" (see \"%s\")" % (results_file, warnings_file), file=sys.stderr)
		sys.exit(1)

	# read
	printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")

	line_num = 0
	p_list = []
	probe_list = []
	with open(results_file, 'r') as RIN:
		for l in RIN:
			l = l.strip()
			line_num += 1
			if line_num == 1:
				headers = l.split()
				primary_v_index = headers.index(primary_variable + '.pval')
			else:
				v = l.split()
				try:
					pv = float(v[primary_v_index])
				except (ValueError, IndexError):
					# Short row or non-numeric p-value: not part of the correction.
					continue
				if pv >= 0 and pv <= 1:
					p_list.append(pv)
					probe_list.append(v[0])

	# adjust
	q_list =  padjust.multiple_testing_correction(p_list)

	# write
	adjusted_p = {}
	for probe, p, q in zip(probe_list, p_list, q_list):
		adjusted_p[probe] = '\t'.join([str(i) for i in (p,q)])
	printlog("Writing to %s" % (pval_file))
	with open(pval_file,'w') as FOUT:
		line_num = 1
		for l in ireader.reader(options.input_file):
			if line_num == 1:
				print (l + '\tpval\tadj.pval', file=FOUT)
			else:
				f = l.split()
				probe_ID = f[0] if f else None
				# Probes without a usable p-value get NaN in both columns.
				print (l + '\t' + adjusted_p.get(probe_ID, 'NaN\tNaN'), file=FOUT)
			line_num += 1
Exemplo n.º 22
0
def main():
	"""Fit a gaussian GLM per CpG in R and append coefficients and p-values.

	Steps performed:
	  1. Parse and validate the command line.
	  2. Write an R script (<prefix>.r) that, for every data row, defines the
	     response 'y' and the covariates, fits glm(y ~ covariates,
	     family=gaussian) and prints the probe ID (prefixed with '!'), the
	     term names, the p-values and the coefficients on one line.
	  3. Run the script with Rscript (stdout goes to <prefix>.r.results.txt,
	     stderr to <prefix>.r.warnings.txt).
	  4. Parse the '!'-prefixed lines and write the input table plus one
	     '.coef' and one '.pval' column per term to <prefix>.pval.txt.
	     Probes without a usable fit get "NaN" in every added column.

	Exit codes: 101/102/103 for a missing -i/-g/-o option, 104/105 for a
	non-existent data/group file, 3 for a sample ID missing from the data
	file, 1 when Rscript cannot be run.
	"""
	usage="%prog [options]" + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be regular text file or compressed file (*.gz, *.bz2) or accessible url.")
	parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file define the biological groups of each samples as well as other covariables such as gender, age.  Sample IDs shoud match to the \"Data file\".")
	parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="Prefix of output file.")
	(options,args)=parser.parse_args()

	print ()
	if not (options.input_file):
		print (__doc__)
		parser.print_help()
		sys.exit(101)

	if not (options.group_file):
		print (__doc__)
		parser.print_help()
		sys.exit(102)

	if not (options.out_file):
		print (__doc__)
		parser.print_help()
		sys.exit(103)

	if not os.path.isfile(options.input_file):
		print ("Input data file \"%s\" does not exist\n" % options.input_file)
		sys.exit(104)
	if not os.path.isfile(options.group_file):
		# Report the group file here (the original printed the data file name).
		print ("Input group file \"%s\" does not exist\n" % options.group_file)
		sys.exit(105)

	r_script = options.out_file + '.r'
	r_results = options.out_file + '.r.results.txt'
	r_warnings = options.out_file + '.r.warnings.txt'
	pval_file = options.out_file + '.pval.txt'

	printlog("Read group file \"%s\" ..." % (options.group_file))
	(samples,cv_names, cvs) = read_grp_file2(options.group_file)
	for cv_name in cv_names:
		print (cv_name)
		for sample in samples:
			print ('\t' + sample + '\t' + cvs[cv_name][sample])

	printlog("Processing file \"%s\" ..." % (options.input_file))
	line_num = 0
	with open(r_script,'w') as ROUT:
		for l in ireader.reader(options.input_file):
			line_num += 1
			f = l.split()
			if len(f) == 0: continue
			if line_num == 1:
				sample_IDs = f[1:]
				# check if sample ID matches
				for s in samples:
					if s not in sample_IDs:
						printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
						sys.exit(3)
				continue

			beta_values = []
			cg_id = f[0]
			for i in f[1:]:
				try:
					beta_values.append(float(i))
				except ValueError:
					# Non-numeric cells are passed to R as missing values.
					beta_values.append("NaN")
			print ('',file=ROUT)
			# The leading '!' marks result lines so they can be told apart
			# from anything else R prints to stdout.
			print ('cgid <- \"!%s\"' % cg_id, file=ROUT)
			print ("y <- c(%s)" % (','.join([str(beta) for beta in beta_values])), file=ROUT)	#response variable
			for cv_name in cv_names:
				print (cv_name + ' <- c(%s)' % (','.join([str(cvs[cv_name][s]) for s in  sample_IDs  ])), file = ROUT)
			print ('try(fit <- glm(y ~ %s, family=gaussian))' % ('+'.join(cv_names)), file = ROUT)
			print ('pval <- coef(summary(fit))[,4]',file=ROUT)
			print ('coef <- coef(summary(fit))[,1]',file=ROUT)
			print ('cat(cgid, names(pval),pval,coef, sep="\\t")', file=ROUT)
			print ('cat("\\n")', file=ROUT)

	printlog("Runing Rscript file \"%s\" ..." % (r_script))
	try:
		# Argument-list form: a missing Rscript raises OSError (with shell=True
		# the failure was silent) and odd characters in the prefix are safe.
		with open(r_results, 'w') as R_STDOUT, open(r_warnings, 'w') as R_STDERR:
			subprocess.call(["Rscript", r_script], stdout=R_STDOUT, stderr=R_STDERR)
	except OSError:
		print ("Error: cannot run Rscript: \"%s\"" % (r_script), file=sys.stderr)
		sys.exit(1)

	printlog("Reading file \"%s\" ..." % (r_results))
	glm_results = {}	# probe ID -> [coefs, pvals], or None when the fit is unusable
	v_names = None
	with open(r_results) as RIN:
		for l in RIN:
			l = l.strip()
			if not l.startswith('!'):continue
			l = l.replace(')','')
			l = l.replace('(','')
			f = l.split('\t')
			cgID = f[0].replace('!','')
			tmp = f[1:]
			# A usable line holds three equally long chunks: names, p-values,
			# coefficients. The first entry of each chunk is dropped ([1:]).
			# An empty 'tmp' would make range() fail with a zero step.
			if tmp and len(tmp)%3 == 0:
				chunk_size = len(tmp)//3
				sub_lists = [tmp[i:i+chunk_size] for i in range(0,len(tmp),chunk_size)]
				v_names = sub_lists[0][1:]
				v_pvals = sub_lists[1][1:]
				v_coefs = sub_lists[2][1:]
				glm_results[cgID] = [v_coefs, v_pvals]
			else:
				glm_results[cgID] = None

	if v_names is None:
		# No fit could be parsed at all: fall back to the covariate names so
		# that a header can still be written.
		v_names = cv_names
	# Pad failed fits to the width of the header that is actually written.
	nan_cols = ["NaN"] * len(v_names)

	printlog("Results saved to \"%s\" ..." % (pval_file))
	line_num = 0
	with open(pval_file,'w') as FOUT:
		for l in ireader.reader(options.input_file):
			line_num += 1
			f = l.split()
			if line_num == 1:
				print (l + '\t' + '\t'.join([i + '.coef' for i in v_names]) + '\t' + '\t'.join([i + '.pval' for i in v_names]), file=FOUT)
			else:
				fit = glm_results.get(f[0]) if f else None
				if fit is None:
					# Probe missing from the R output, or its fit was unusable.
					coefs, pvals = nan_cols, nan_cols
				else:
					coefs, pvals = fit
				print (l + '\t' + '\t'.join(coefs) + '\t' + '\t'.join(pvals), file=FOUT)
Exemplo n.º 23
0
def main():
    """Compare methylation between two groups with Fisher's exact test.

    For every row of the data file the "methyl_count,total_count" cells are
    summed per group into a 2x2 table (methylated / unmethylated reads for
    each of the two groups) which is passed to stats.fisher_exact(). The
    p-values are adjusted with padjust.multiple_testing_correction() and the
    input table plus 'OddsRatio', 'pval' and 'adj.pval' columns is written
    to <prefix>.pval.txt.

    Blank lines are ignored; samples of the data file that are not listed in
    the group file do not contribute to any table.

    Exit codes: 101/102/103 for a missing -i/-g/-o option, 1 when the group
    file does not define exactly two groups or a cell has total < methyl or
    total == 0, 3 for a sample ID missing from the data file.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input-file",
        action="store",
        type="string",
        dest="input_file",
        help=
        "Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url."
    )
    parser.add_option(
        "-g",
        "--group",
        action="store",
        type="string",
        dest="group_file",
        help=
        "Group file define the biological groups of each samples. It is a comma-separated 2 columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs.  It must have a header row. Sample IDs shoud match to the \"Data file\". Note: automatically switch to use ANOVA if more than 2 groups were defined in this file."
    )
    parser.add_option("-o",
                      "--output",
                      action="store",
                      type='string',
                      dest="out_file",
                      help="Prefix of output file.")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.group_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    pval_file = options.out_file + '.pval.txt'

    printlog("Read group file \"%s\" ..." % (options.group_file))
    (s, g) = read_grp_file1(options.group_file)
    s2g = dict(zip(s, g))
    g2s = collections.defaultdict(list)

    for k, v in s2g.items():
        g2s[v].append(k)

    group_IDs = sorted(g2s.keys())
    for g in group_IDs:
        print("\tGroup %s has %d samples:" % (g, len(g2s[g])))
        print('\t\t' + ','.join(g2s[g]))

    if len(group_IDs) != 2:
        printlog("You must have two groups!", file=sys.stderr)
        sys.exit(1)

    # Matches "methyl_count,total_count"; compiled once instead of per cell.
    count_pattern = re.compile(r'(\d+)\s*\,\s*(\d+)')

    sample_IDs = None  # set from the first non-blank line (the header)
    probe_list = []
    p_list = []
    or_list = []
    for l in ireader.reader(options.input_file):
        f = l.split()
        if not f:
            # Blank lines carry no probe; indexing f[0] would fail on them.
            continue
        if sample_IDs is None:
            sample_IDs = f[1:]
            # check if sample ID matches
            for s in s2g:
                if s not in sample_IDs:
                    printlog("Cannot find sample ID \"%s\" from file \"%s\"" %
                             (s, options.input_file))
                    sys.exit(3)
            continue

        cg_id = f[0]
        probe_list.append(cg_id)
        proportions = f[1:]
        g2values = {gid: {'methyl': 0, 'unmethyl': 0} for gid in group_IDs}
        for s, p in zip(sample_IDs, proportions):
            # Samples that are not in the group file are ignored instead of
            # raising KeyError.
            if s not in s2g:
                continue
            gid = s2g[s]
            m = count_pattern.match(p)
            if m is None:
                # Cells that are not "c,n" are treated as missing.
                continue
            c = int(m.group(1))
            n = int(m.group(2))
            if n >= c and n > 0:
                g2values[gid]['methyl'] += c
                g2values[gid]['unmethyl'] += (n - c)
            else:
                printlog("Incorrect data format!")
                print(f)
                sys.exit(1)
        # One row per group: [methylated reads, unmethylated reads].
        table = [[g2values[gid]['methyl'], g2values[gid]['unmethyl']]
                 for gid in group_IDs]
        (odds, pval) = stats.fisher_exact(table)
        p_list.append(pval)
        or_list.append(odds)

    printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
    adjusted_p = {}
    q_list = padjust.multiple_testing_correction(p_list)
    for probe, o, p, q in zip(probe_list, or_list, p_list, q_list):
        adjusted_p[probe] = '\t'.join([str(i) for i in (o, p, q)])

    printlog("Writing to %s" % (pval_file))
    with open(pval_file, 'w') as FOUT:
        header_written = False
        for l in ireader.reader(options.input_file):
            f = l.split()
            if not f:
                # Mirror the first pass: blank lines are not part of the table.
                continue
            if not header_written:
                print(l + '\tOddsRatio\tpval\tadj.pval', file=FOUT)
                header_written = True
            else:
                print(l + '\t' + adjusted_p.get(f[0], 'NaN\tNaN\tNaN'),
                      file=FOUT)
Exemplo n.º 24
0
def main():
    """Annotate CpGs of a BED file with the genes whose domains they overlap.

    Basal and extended regulatory domains are computed from the reference
    gene model with getBasalDomains() and geteExtendedDomains(). Every BED
    line of the input is then written to the output file with two extra
    columns: the genes whose basal domain overlaps the CpG and the genes
    whose extended domain (but not basal domain) overlaps it. '//' is used
    when there is no such gene. Gene names are ';'-separated and sorted.

    Lines starting with '#' are copied through unchanged, 'track' and
    'browser' lines are dropped, and lines that cannot be parsed are reported
    on stderr and skipped.

    Exit codes: 101/102/103 for a missing -i/-r/-o option.
    """

    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input-file",
        action="store",
        type="string",
        dest="input_file",
        help=
        "BED3+ file specifying the C position. BED3+ file could be a regular text file or compressed file (*.gz, *.bz2) or accessible url. [required]"
    )
    parser.add_option(
        "-r",
        "--refgene",
        action="store",
        type="string",
        dest="gene_file",
        help=
        "Reference gene model in BED12 format (https://genome.ucsc.edu/FAQ/FAQformat.html#format1). It is recommended that multiple transcripts of the same gene are collapsed into a single super transcript with one TSS and one name."
    )
    parser.add_option(
        "-u",
        "--basal-up",
        action="store",
        type="int",
        dest="basal_up_size",
        default=5000,
        help=
        "Size of extension to upstream of TSS (used to define gene's \"basal regulatory domain\"). default=%default (bp)"
    )
    parser.add_option(
        "-d",
        "--basal-down",
        action="store",
        type="int",
        dest="basal_down_size",
        default=1000,
        help=
        "Size of extension to downstream of TSS (used to define gene's basal regulatory domain). default=%default (bp)"
    )
    parser.add_option(
        "-e",
        "--extension",
        action="store",
        type="int",
        dest="extension_size",
        default=1000000,
        help=
        "Size of extension to both up- and down-stream of TSS (used to define gene's \"extended regulatory domain\"). default=%default (bp)"
    )
    parser.add_option(
        "-o",
        "--output",
        action="store",
        type='string',
        dest="out_file",
        help=
        "Prefix of output file. Two addtional columns will be appended to the orignal BED file with the last column indicating \"genes whose extended regulatory domain are overlapped with the CpG\", the 2nd last column indicating \"genes whose basal regulatory domain are overlapped with the CpG\". [required]"
    )
    (options, args) = parser.parse_args()

    print()

    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.gene_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    printlog("Calculate basal regulatory domain from: \"%s\" ..." %
             (options.gene_file))
    basal_domains = getBasalDomains(bedfile=options.gene_file,
                                    up=options.basal_up_size,
                                    down=options.basal_down_size,
                                    printit=False)

    printlog("Calculate extended regulatory domain from: \"%s\" ..." %
             (options.gene_file))
    extended_domains = geteExtendedDomains(basal_ranges=basal_domains,
                                           bedfile=options.gene_file,
                                           up=options.basal_up_size,
                                           down=options.basal_down_size,
                                           ext=options.extension_size,
                                           printit=False)

    # The hard-coded extended_domains['chr1'] lookup that used to sit here was
    # a debugging leftover: its result was never used and it raised KeyError
    # for gene models without a 'chr1' entry.

    printlog("Assigning CpG to gene ...")
    # NOTE: the '.annotatio.txt' suffix is kept as is so that existing
    # pipelines still find the file.
    with open(options.out_file + '.annotatio.txt', 'w') as FOUT:
        for l in ireader.reader(options.input_file):
            if l.startswith('#'):
                print(l, file=FOUT)
                continue
            if l.startswith('track'):
                continue
            if l.startswith('browser'):
                continue
            try:
                f = l.split()
                chrom = f[0]
                start = int(f[1])
                end = int(f[2])
            except (IndexError, ValueError):
                # Too few columns or non-integer coordinates.
                print("Invalid BED line: %s" % l, file=sys.stderr)
                continue

            # genes whose basal domain is overlapped with CpG
            basal_genes = set()
            if chrom in basal_domains:
                for o in basal_domains[chrom].find(start, end):
                    basal_genes.add(o.value)
            if not basal_genes:
                basal_genes.add('//')

            # genes whose extended domain is overlapped with CpG
            extend_genes = set()
            if chrom in extended_domains:
                for o in extended_domains[chrom].find(start, end):
                    extend_genes.add(o.value)
            if not extend_genes:
                extend_genes.add('//')

            # Report a gene only once: basal hits are removed from the
            # extended column.
            extend_genes = extend_genes - basal_genes
            if len(extend_genes) == 0:
                extend_genes.add('//')
            # Sets have no stable order; sort so that repeated runs produce
            # identical files.
            print(l + '\t' + ';'.join(sorted(basal_genes)) + '\t' +
                  ';'.join(sorted(extend_genes)),
                  file=FOUT)
Exemplo n.º 25
0
		print ("\tGroup %s has %d samples:" % (g, len(g2s[g])), file=sys.stderr)
		print ('\t\t' + ','.join(g2s[g]), file=sys.stderr)
	
	if len(group_IDs) != 2:
		printlog("You must have two groups!", file=sys.stderr)
		sys.exit(1)
	

	manager = Manager()
	results = manager.list()	 #list of list. shared variable between main() and beta_bayes(). #ID, group1.mean, group2.mean, prob
        	
	printlog("Read data file \"%s\" ..." % (options.input_file))
	line_num = 0
	p_count = 0
	jobs = []
	for l in ireader.reader(options.input_file):
		line_num += 1
		f = l.split()
		if len(f) == 0: continue
		if line_num == 1:
			sample_IDs = f[1:]
			# check if sample ID matches
			for s in s2g:
				if s not in sample_IDs:
					printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
					sys.exit(3)
			g_IDs = [s2g[i] for i in sample_IDs]
			
		else:
			probe_ID = f[0]
			p_count += 1
Exemplo n.º 26
0
def main():
    """Extract the sequence context of cytosines and build motif matrices.

    For every BED line the reference sequence from 'extend_size' bases
    upstream to 'extend_size' bases downstream of the end coordinate is
    fetched (reverse-complemented when a '-' column is present) and written
    to <prefix>.fa. weblogo is then asked to draw <prefix>.logo.pdf and
    <prefix>.logo.png, and a PSSM built from the FASTA file is written in
    PFM, PPM, PWM, Jaspar and MEME format.

    Lines starting with '#', 'track' or 'browser' are ignored; lines that
    cannot be parsed are reported on stderr and skipped.

    Exit codes: 101/102/103 for a missing -i/-r/-o option.
    """
    print(__doc__)
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input_file",
        action="store",
        type="string",
        dest="input_file",
        help=
        "BED file specifying the C position. This BED file should have at least six columns (Chrom, ChromStart, ChromeEnd, name, score, strand).  Note: Must provide correct *strand* information. This file can be a regular text file or compressed file (.gz, .bz2)."
    )
    parser.add_option(
        "-r",
        "--refgenome",
        action="store",
        type="string",
        dest="genome_file",
        help=
        "Reference genome seqeunces in FASTA format. Must be indexed using the samtools \"faidx\" command. "
    )
    parser.add_option(
        "-e",
        "--extend",
        action="store",
        type="int",
        dest="extend_size",
        default=5,
        help=
        "Number of bases extended to up- and down-stream. default=%default (bp)"
    )
    parser.add_option("-n",
                      "--name",
                      action="store",
                      type='string',
                      dest="motif_name",
                      default='motif',
                      help="Motif name. default=%default")
    parser.add_option("-o",
                      "--output",
                      action="store",
                      type='string',
                      dest="out_file",
                      help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()

    if not (options.input_file):
        parser.print_help()
        sys.exit(101)

    if not (options.genome_file):
        parser.print_help()
        sys.exit(102)
    #index refegenome file if it hasn't been done
    if not os.path.exists(options.genome_file + '.fai'):
        printlog("Creating index for %s" % options.genome_file)
        pysam.faidx(options.genome_file)

    if not (options.out_file):
        parser.print_help()
        sys.exit(103)

    fa_file = options.out_file + '.fa'
    refFasta = pysam.Fastafile(options.genome_file)

    printlog("Reading %s ..." % options.input_file)
    with open(fa_file, 'w') as FOUT:
        for l in ireader.reader(options.input_file):
            if l.startswith('#'):
                continue
            if l.startswith('track'):
                continue
            if l.startswith('browser'):
                continue
            f = l.split()
            # Any column that is exactly '-' marks the minus strand.
            if '-' in f:
                strand = '-'
            else:
                strand = '+'
            try:
                chrom = f[0]
                position = int(f[2])
            except (IndexError, ValueError):
                print("BED has at lesat 4 columns. Skip: " + l,
                      file=sys.stderr)
                # Really skip the line: without this, the coordinates of the
                # previous record were reused (or NameError was raised on the
                # first line).
                continue

            start = position - options.extend_size - 1
            end = position + options.extend_size
            if start < 0 or start > end:
                continue

            fa_name = '>' + '_'.join(
                [str(i) for i in (chrom, start, end, strand)])
            fa_seq = refFasta.fetch(chrom, start, end).upper()
            if strand == '-':
                fa_seq = revcomp(fa_seq)
            print(fa_name, file=FOUT)
            print(fa_seq, file=FOUT)

    printlog("Generate motif logo ... ")
    logo_pdf = options.out_file + '.logo.pdf'
    logo_png = options.out_file + '.logo.png'
    try:
        # Argument lists instead of a shell string: file names and the motif
        # name are passed verbatim, and a missing weblogo executable raises
        # OSError (with shell=True the failure went unnoticed).
        for logo_format, logo_file in (('PDF', logo_pdf), ('PNG', logo_png)):
            subprocess.call([
                "weblogo", "--format", logo_format, "-D", "fasta", "-c",
                "classic", "-s", "large", "-f", fa_file, "-o", logo_file,
                "-t", options.motif_name
            ])
    except OSError:
        print(
            "Cannot run weblogo. Please install weblogo (https://github.com/WebLogo/weblogo)",
            file=sys.stderr)
    else:
        printlog("Motif logo saved to \"%s\" and \"%s\"" %
                 (logo_pdf, logo_png))

    m = PSSM(sites=fa_file, name=options.motif_name)

    printlog("Write position frequency matrix (PFM) to \"%s\"" %
             (options.out_file + '.pfm'))
    with open(options.out_file + '.pfm', 'w') as FF:
        m.toPFM(FOUT=FF)

    printlog("Write position probability matrix (PPM) to \"%s\"" %
             (options.out_file + '.ppm'))
    with open(options.out_file + '.ppm', 'w') as FF:
        m.toPPM(FOUT=FF)

    printlog("Write position weight matrix (PWM) to \"%s\"" %
             (options.out_file + '.pwm'))
    with open(options.out_file + '.pwm', 'w') as FF:
        m.toPWM(FOUT=FF)

    printlog("Write Jaspar format matrix to \"%s\"" %
             (options.out_file + '.jaspar'))
    with open(options.out_file + '.jaspar', 'w') as FF:
        m.toJaspar(FOUT=FF)

    printlog("Write MEME format matrix to \"%s\"" %
             (options.out_file + '.meme'))
    with open(options.out_file + '.meme', 'w') as FF:
        m.toMEME(FOUT=FF)
Exemplo n.º 27
0
def main():
	"""Compare beta values between groups with a rank-based test per CpG.

	For every row of the data file the beta values are collected per group
	and passed to mwu_test() when the row has values for two groups, or to
	kruskal_test() when it has values for three or more. The p-values are
	adjusted with padjust.multiple_testing_correction() and the input table
	plus 'pval' and 'adj.pval' columns is written to <prefix>.pval.txt.

	Non-numeric cells become NaN, samples that are not in the group file are
	ignored, blank lines are skipped, and rows with values for fewer than two
	groups get 'NaN' in both added columns.

	Exit codes: 101/102/103 for a missing -i/-g/-o option, 1 when fewer than
	two groups are defined, 3 for a sample ID missing from the data file.
	"""
	usage="%prog [options]" + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). Except for the 1st row and 1st column, any non-numerical values will be considered as \"missing values\" and ignored. This file can be a regular text file or compressed file (.gz, .bz2).")
	parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological group of each sample. It is a comma-separated two columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs. It must have a header row. Sample IDs should match to the \"Data file\". Note: automatically switch to use  Kruskal-Wallis H-test if more than two groups were defined in this file.")
	parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
	(options,args)=parser.parse_args()

	print ()

	if not (options.input_file):
		print (__doc__)
		parser.print_help()
		sys.exit(101)

	if not (options.group_file):
		print (__doc__)
		parser.print_help()
		sys.exit(102)

	if not (options.out_file):
		print (__doc__)
		parser.print_help()
		sys.exit(103)

	pval_file = options.out_file + '.pval.txt'

	printlog("Read group file \"%s\" ..." % (options.group_file))
	(s,g) = read_grp_file1(options.group_file)
	s2g = dict(zip(s,g))
	g2s = collections.defaultdict(list)

	for k,v in s2g.items():
		g2s[v].append(k)

	group_IDs = sorted(g2s.keys())
	for g in group_IDs:
		print ("\tGroup %s has %d samples:" % (g, len(g2s[g])))
		print ('\t\t' + ','.join(g2s[g]))

	if len(group_IDs) < 2:
		printlog("You must have at least two groups!", file=sys.stderr)
		sys.exit(1)
	elif len(group_IDs) == 2:
		printlog("Perfrom Mann-Whitney rank test of two samples ...")
	elif len(group_IDs) >= 3:
		printlog("Perfrom Kruskal-Wallis H-test ...")

	sample_IDs = None	# set from the first non-blank line (the header)
	probe_list = []
	p_list = []
	for l in ireader.reader(options.input_file):
		f = l.split()
		if len(f) == 0: continue
		if sample_IDs is None:
			sample_IDs = f[1:]

			# check if sample ID matches
			for s in s2g:
				if s not in sample_IDs:
					printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
					sys.exit(3)
			continue

		g2values = collections.defaultdict(list)
		probe_ID = f[0]
		beta_values = f[1:]
		for s,b in zip(sample_IDs, beta_values):

			#deal with non-numerical values
			try:
				b = float(b)
			except ValueError:
				b = np.nan

			#skip if s not in group file
			if s not in s2g:
				continue

			gid = s2g[s]
			g2values[gid].append(b)

		if len(g2values) == 2:
			a = np.array(g2values[group_IDs[0]])
			b = np.array(g2values[group_IDs[1]])
			(pval,tscore) = mwu_test(a,b)
		elif len(g2values) >= 3:
			tmp = []
			for g in group_IDs:
				tmp.append(np.array(g2values[g]))
			(pval,tscore) = kruskal_test(*tmp)
		else:
			# Truncated row with values for fewer than two groups: no test
			# can be run. Previously 'pval' was undefined here (NameError) or
			# silently reused from the preceding probe.
			continue
		probe_list.append(probe_ID)
		p_list.append(pval)

	printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
	adjusted_p = {}
	q_list =  padjust.multiple_testing_correction(p_list)
	for probe,p,q in zip(probe_list, p_list, q_list):
		adjusted_p[probe] = '\t'.join([str(i) for i in (p,q)])

	printlog("Writing to %s" % (pval_file))
	with open(pval_file,'w') as FOUT:
		header_written = False
		for l in ireader.reader(options.input_file):
			f = l.split()
			# Mirror the first pass: blank lines are not part of the table.
			if len(f) == 0: continue
			if not header_written:
				print (l + '\tpval\tadj.pval', file=FOUT)
				header_written = True
			else:
				# Probes for which no test could be run get NaN in both columns.
				print (l + '\t' + adjusted_p.get(f[0], 'NaN\tNaN'), file=FOUT)
Exemplo n.º 28
0
def main():
    '''
    Generate and run an R script that fits a beta-binomial regression per CpG.

    Command-line options (all three are required):
      -i/--input_file  table whose cells are "methyl_count,total_count"; the
                       first non-blank row holds sample IDs and the first
                       column holds CpG/probe IDs.
      -g/--group       group/covariate file, parsed by read_grp_file2().
      -o/--output      prefix of the generated files.

    Files produced: <prefix>.r (the generated R script), <prefix>.results.txt
    (written by the R script itself through write.table) and
    <prefix>.warnings.txt (stderr of the Rscript process).

    Exit codes: 101/102/103 when an option is missing, 104/105 when the data
    or group file does not exist, 3 when a sample of the group file is absent
    from the data file, 1 for malformed counts, an unknown variable type, or
    an Rscript failure.
    '''
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input_file",
        action="store",
        type="string",
        dest="input_file",
        help=
        "Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (.gz, .bz2)"
    )
    parser.add_option(
        "-g",
        "--group",
        action="store",
        type="string",
        dest="group_file",
        help=
        "Group file defining the biological groups of each sample as well as other covariables such as gender, age. The first variable is grouping variable (must be categorical), all the other variables are considered as covariates (can be categorical or continuous). Sample IDs should match to the \"Data file\"."
    )
    parser.add_option("-o",
                      "--output",
                      action="store",
                      type='string',
                      dest="out_file",
                      help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    # Each required option has its own exit code so callers can tell them apart.
    required = ((options.input_file, 101), (options.group_file, 102),
                (options.out_file, 103))
    for value, exit_code in required:
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    if not os.path.isfile(options.input_file):
        print("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.group_file):
        # Report the group file here (the input file was reported by mistake).
        print("Input group file \"%s\" does not exist\n" % options.group_file)
        sys.exit(105)

    r_script = options.out_file + '.r'
    results_file = options.out_file + '.results.txt'
    warnings_file = options.out_file + '.warnings.txt'

    # bbr1 writes the result table with its header row; bbr2 appends one row
    # without a header. Only the final write.table() call differs.
    r_functions = (
        ('bbr1',
         '\twrite.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=c("ID",paste(names, "coef",sep="."), paste(names, "pval",sep=".")))'
         ),
        ('bbr2',
         '\twrite.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1), quote=FALSE, row.names=FALSE, sep="\\t", col.names=FALSE, append=TRUE)'
         ),
    )
    # Lines shared by both R functions, between the model fit and write.table.
    r_common_lines = (
        '\ttest <- summary(fit)',
        '\tcoefs <- test@Coef$Estimate',
        '\tpvals = test@Coef$"Pr(> |z|)"',
        '\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}',
        '\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}',
        '\tnames = row.names(test@Coef)',
        '\tnames = gsub("2","",names)',
    )
    count_pair = re.compile(r'(\d+)\s*\,\s*(\d+)')

    # The context manager closes the script even when sys.exit() is raised.
    with open(r_script, 'w') as ROUT:
        print('library("aod")', file=ROUT)

        printlog("Read group file \"%s\" ..." % (options.group_file))
        (samples, cv_names, cvs, v_types) = read_grp_file2(options.group_file)
        for cv_name in cv_names:
            print("%s: %s" % (cv_name, v_types[cv_name]))
            for sample in samples:
                print('\t' + sample + '\t' + cvs[cv_name][sample])

        cv_args = ','.join(cv_names)
        frame_cols = ','.join(['='.join(i) for i in zip(cv_names, cv_names)])
        formula = '+'.join(cv_names)
        for func_name, write_table in r_functions:
            print('%s <- function (cgid, m,t,%s){' % (func_name, cv_args),
                  file=ROUT)
            print('\tdat <- data.frame(m=m, t=t, %s)' % frame_cols, file=ROUT)
            print(
                '\tfit <- betabin(cbind(m,t - m) ~ %s, ~1, link=c("logit"), data=na.omit(dat))'
                % formula,
                file=ROUT)
            for r_line in r_common_lines:
                print(r_line, file=ROUT)
            print(write_table % results_file, file=ROUT)
            print('}', file=ROUT)
            print('\n', file=ROUT)

        printlog("Processing file \"%s\" ..." % (options.input_file))
        # Track header / first record with flags instead of raw line numbers,
        # so blank lines cannot shift which row is treated as the header or
        # which record gets the header-writing bbr1() call.
        header_seen = False
        first_record = True
        for l in ireader.reader(options.input_file):
            f = l.split()
            if len(f) == 0:
                continue

            if not header_seen:
                header_seen = True
                sample_IDs = f[1:]
                # check if sample ID matches
                for s in samples:
                    if s not in sample_IDs:
                        printlog(
                            "Cannot find sample ID \"%s\" from file \"%s\"" %
                            (s, options.input_file))
                        sys.exit(3)
                # Emit one R vector per covariate, ordered like the data
                # columns.
                # NOTE(review): presumably a data-file sample that is missing
                # from the group file fails the cvs lookup below - confirm.
                for cv_name in cv_names:
                    if v_types[cv_name] == 'continuous':
                        template = ' <- c(%s)'
                    elif v_types[cv_name] == 'categorical':
                        template = ' <- as.factor(c(%s))'
                    else:
                        printlog("unknown vaiable type!")
                        sys.exit(1)
                    values = ','.join(
                        [str(cvs[cv_name][s]) for s in sample_IDs])
                    print(cv_name + template % values, file=ROUT)
                print('\n', file=ROUT)
                continue

            methyl_reads = []  # c
            total_reads = []  # n
            cg_id = f[0]
            for cell in f[1:]:
                m = count_pair.match(cell)
                if m is None:
                    # Anything that is not "c,n" is passed to R as missing.
                    methyl_reads.append("NaN")
                    total_reads.append("NaN")
                    continue
                c = int(m.group(1))
                n = int(m.group(2))
                if n >= c and n > 0:
                    methyl_reads.append(c)
                    total_reads.append(n)
                else:
                    printlog("Incorrect data format!")
                    print(f)
                    sys.exit(1)

            r_func = 'bbr1' if first_record else 'bbr2'
            first_record = False
            print('%s(\"%s\", c(%s), c(%s), %s)' %
                  (r_func, cg_id, ','.join([str(read)
                                            for read in methyl_reads]),
                   ','.join([str(read) for read in total_reads]), cv_args),
                  file=ROUT)

    printlog("Runing Rscript file \"%s\" ..." % r_script)
    try:
        # An argument list (no shell) keeps odd characters in the output
        # prefix from being interpreted, and makes a missing Rscript binary
        # raise OSError instead of being silently ignored.
        with open(warnings_file, 'w') as WOUT:
            status = subprocess.call(['Rscript', r_script], stderr=WOUT)
    except OSError:
        print("Error: cannot run Rscript: \"%s\"" % r_script, file=sys.stderr)
        sys.exit(1)
    if status != 0:
        print("Error: Rscript \"%s\" exited with status %d, see \"%s\"" %
              (r_script, status, warnings_file),
              file=sys.stderr)
        sys.exit(1)
Exemplo n.º 29
0
def main():
	'''
	Test each CpG/probe for differential methylation between sample groups.

	Beta values are read from the data file (-i) and samples are assigned to
	groups by the group file (-g). With two groups a t-test is run (paired
	with -p, Welch's with -w, otherwise the standard equal-variance test);
	with three or more groups ANOVA is used. P-values are then adjusted by
	padjust.multiple_testing_correction() and every data row is written to
	"<prefix>.pval.txt" with its p-value and adjusted p-value appended.

	Exit codes: 101/102/103 when a required option is missing, 1 when fewer
	than two groups are defined, 2 when a paired test is requested on groups
	of unequal size, 3 when a sample of the group file is absent from the
	data file.
	'''
	usage="%prog [options]" + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). Except for the 1st row and 1st column, any non-numerical values will be considered as \"missing values\" and ignored. This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
	parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological group of each sample. It is a comma-separated 2 columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs.  It must have a header row. Sample IDs should match to the \"Data file\". Note: automatically switch to use ANOVA if more than 2 groups were defined in this file.")
	parser.add_option("-p","--paired",action="store_true",default=False,dest="paired",help="If '-p/--paired' flag was specified, use paired t-test which requires the equal number of samples in both groups. Paired sampels are matched by the order. This option will be ignored for multiple group analysis.")
	parser.add_option("-w","--welch",action="store_true",default=False,dest="welch_ttest",help="If '-w/--welch' flag was specified, using Welch's t-test which does not assume the two samples have equal variance.  If omitted, use standard two-sample t-test (i.e. assuming the two samples have equal variance). This option will be ignored for paired t-test and multiple group analysis.")
	parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="Prefix of the output file.")
	(options,args)=parser.parse_args()
	
	print ()
	# Each required option has its own exit code so callers can tell them apart.
	for value, exit_code in ((options.input_file, 101), (options.group_file, 102), (options.out_file, 103)):
		if not value:
			print (__doc__)
			parser.print_help()
			sys.exit(exit_code)
	
	printlog("Read group file \"%s\" ..." % (options.group_file))
	(ss,gs) = read_grp_file1(options.group_file)
	
	# sample ID -> group ID, and group ID -> sample IDs (in group-file order)
	s2g = dict(zip(ss, gs))
	g2s = collections.defaultdict(list)
	for s,g in zip(ss, gs):
		g2s[g].append(s)
	
	group_IDs = sorted(g2s.keys())
	for g in group_IDs:
		print ("\tGroup %s has %d samples:" % (g, len(g2s[g])))
		print ('\t\t' + ','.join(g2s[g]))
	
	if len(group_IDs) < 2:
		# Called with the message only, like every other printlog() call here.
		printlog("You must have at least two groups!")
		sys.exit(1)
	elif len(group_IDs) == 2 and options.paired:
		printlog("Perfrom paired t-test of two related samples ...")
		if len(g2s[group_IDs[0]]) != len(g2s[group_IDs[1]]):
			printlog("Unequal sample size. Cannot perform paired t-test.")
			sys.exit(2)
	elif len(group_IDs) == 2:
		printlog("Perfrom standard t-test of two independent samples ...")
	else:
		printlog("Perfrom ANOVA ...")
	
	# First pass: compute one p-value per data row.
	header_seen = False
	probe_list = []
	p_list = []
	for l in ireader.reader(options.input_file):
		f = l.split()
		# Blank lines carry neither a header nor a probe.
		if len(f) == 0:
			continue
		if not header_seen:
			header_seen = True
			sample_IDs = f[1:]
			
			# check if sample ID matches
			for s in s2g:
				if s not in sample_IDs:
					printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
					sys.exit(3)
			continue
		
		g2values = collections.defaultdict(list)
		probe_ID = f[0]
		for s,b in zip(sample_IDs, f[1:]):
			#skip if s not in group file
			if s not in s2g:
				continue
			#deal with non-numerical values
			try:
				b = float(b)
			except ValueError:
				b = np.nan
			g2values[s2g[s]].append(b)
		
		if len(g2values) < len(group_IDs):
			# A truncated row left at least one group without values; do not
			# reuse the p-value of the previous row.
			# NOTE(review): assumes the correction step tolerates NaN - confirm.
			pval = np.nan
		elif len(group_IDs) == 2:
			a = np.array(g2values[group_IDs[0]])
			b = np.array(g2values[group_IDs[1]])
			if options.paired:
				(pval,tscore) = paired_ttest(a,b)
			else:
				# -w requests Welch's test, i.e. variances are NOT assumed equal.
				# NOTE(review): relies on equalVar meaning "assume equal
				# variance" inside standard_ttest() - confirm.
				(pval,tscore) = standard_ttest(a,b, equalVar = not options.welch_ttest)
		else:
			tmp = [np.array(g2values[g]) for g in group_IDs]
			(pval,tscore) = anova(*tmp)
		probe_list.append(probe_ID)
		p_list.append(pval)
	
	printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
	adjusted_p = {}
	q_list =  padjust.multiple_testing_correction(p_list)
	for probe,p,q in zip(probe_list, p_list, q_list):
		adjusted_p[probe] = '\t'.join([str(i) for i in (p,q)])
	
	# Second pass: echo the data file with the two p-value columns appended.
	# The output is opened only now so a failed run leaves no empty file, and
	# the context manager guarantees it is closed.
	printlog("Writing to %s" % (options.out_file + '.pval.txt'))
	header_seen = False
	with open(options.out_file + '.pval.txt','w') as FOUT:
		for l in ireader.reader(options.input_file):
			f = l.split()
			if len(f) == 0:
				continue
			if not header_seen:
				header_seen = True
				print (l + '\tpval\tadj.pval', file=FOUT)
			else:
				print (l + '\t' + adjusted_p[f[0]], file=FOUT)
Exemplo n.º 30
0
def geteExtendedDomains(basal_ranges, bedfile, up_ext=2000, down_ext=2000, min_gene = 200, printit = False):
	'''
	Define gene's extended regulatory domain.
	
	basal_ranges:
		Mapping of chromosome name to an object whose find(start, end) returns
		the overlapping intervals (each with "start" and "end" attributes).
		An extension is trimmed so that it stops at these intervals. A
		chromosome missing from the mapping is treated as having no intervals.
	bedfile:
		BED file with at least 6 columns; one gene one TSS (could use the
		canonical (longest) isoform, or merge all isoforms into a super
		transcript). Lines starting with '#', 'track' or 'browser' are ignored.
	up_ext:
		Size of extension to upstream. Should be multiples of 100
	down_ext:
		Size of extension to downstream. Should be multiples of 100
	min_gene:
		minimum gene size (from TSS to TES). Should be multiples of 100. Genes
		whose left or right extension ends up shorter than this are skipped too.
	printit:
		If True, also print one tab-separated 12-field line per kept gene to
		stdout.
	
	Returns a list of tuples
	([chrom, extension_st, start, symbol], [chrom, start, end, symbol],
	 [chrom, end, extension_end, symbol], strand),
	i.e. the left flank, the gene body and the right flank in genome order.
	'''
	return_ranges = []
	
	for l in ireader.reader(bedfile):
		if l.startswith(('#', 'track', 'browser')):
			continue
		f = l.split()
		# Only the parsing is guarded: a short line or a non-integer coordinate
		# must skip the record instead of falling through with stale values.
		try:
			chrom = f[0]
			start = int(f[1])
			end = int(f[2])
			symbol = f[3]
			strand = f[5]
		except (IndexError, ValueError):
			print ("BED has at lesat 6 columns. Skip: " + l, file=sys.stderr)
			continue
		
		if start < 0:
			continue
		if start > end:
			print ("'Start' cannot be larger than 'End'. Skip: " + l, file=sys.stderr)
			continue
		if (end - start ) < min_gene:
			continue
		if strand not in ['+', '-']:
			print ("Unknown strand. Skip: " + l, file=sys.stderr)
			continue
		
		# Upstream lies left of 'start' on the '+' strand and right of 'end'
		# on the '-' strand.
		if strand == '+':
			extension_st = start  - up_ext
			extension_end = end + down_ext
		else:
			extension_st = start  - down_ext
			extension_end = end + up_ext
		if extension_st < 0:
			extension_st = 0
		
		chrom_ranges = basal_ranges[chrom] if chrom in basal_ranges else None
		
		#try to update extension_st: start the left flank after the right-most
		#interval found in it, but never inside the gene itself
		overlaps = chrom_ranges.find(extension_st, start) if chrom_ranges is not None else []
		if len(overlaps) > 0:
			for o in overlaps:
				if o.end > extension_st:
					extension_st = o.end
			if extension_st > start:
				extension_st = start
		
		if (start - extension_st) < min_gene:
			continue
		
		#try to update extension_end: stop the right flank at the left-most
		#interval found in it, but never inside the gene itself
		overlaps = chrom_ranges.find(end, extension_end) if chrom_ranges is not None else []
		if len(overlaps) > 0:
			for o in overlaps:
				if o.start < extension_end:
					extension_end = o.start
			if extension_end < end:
				extension_end = end
		
		if (extension_end - end) < min_gene:
			continue
		
		return_ranges.append(([chrom, extension_st, start,symbol], [chrom, start, end,symbol], [chrom, end, extension_end,symbol], strand))
		
		if printit:
			print('\t'.join([str(i) for i in (chrom, extension_st, extension_end, symbol, '0', strand,  start, end, '255,0,0', 1, extension_end - extension_st, 0)]), file = sys.stdout)
		
	return return_ranges