def __init__(self, segment_list=None, cycle_list=None, ilist=None, file=None, file_content=None):
    if file is not None or file_content is not None:
        # Parse an AA-style cycles file (or its raw text) into segment/cycle/interval maps.
        self.segment_list = hg.interval_list([])
        self.segment_dict = {}
        self.cycle_dict = {}
        self.ilist = hg.interval_list([])
        if file_content:
            lines = file_content.split('\n')
        else:
            lines = str(open(file).read().decode()).split('\n')
        ll = [l.strip().split() for l in lines if len(l.strip()) > 0]
        for l in ll:
            if 'Segment' == l[0]:
                s = hg.interval(l[2], int(l[3]), int(l[4]), info=[l[1]])
                self.segment_dict[l[1]] = s
                self.segment_list.append(s)
            elif 'Cycle=' in l[0]:
                # e.g. "Cycle=1;Copy_count=25.0;Segments=1+,2-"
                ls = l[0].split(';')
                ci = ls[0].split('=')[1]
                cn = float(ls[1].split('=')[1])
                cl = []
                for s in ls[2].split('=')[1].split(','):
                    if s[-1] == '+':
                        cl.append((s[:-1], 1))
                    else:
                        cl.append((s[:-1], -1))
                self.cycle_dict[ci] = (ci, cn, cl)
            elif 'Interval' == l[0]:
                self.ilist.append(hg.interval(l[2], int(l[3]), int(l[4]), info=[l[1]]))
    elif cycle_list is None:
        # No cycles given: build a single cycle ('1') that walks the given segments in order.
        segment_set = hg.interval_list([hg.interval(ss[0], ss[1], ss[2])
                                        for ss in {(s.chrom, s.start, s.end) for s in segment_list}])
        segment_set.sort()
        self.segment_list = segment_set
        self.segment_dict = {}
        seg_id = {}
        cl = []
        for i, s in enumerate(segment_set):
            self.segment_dict[str(i + 1)] = s
            seg_id[(s.chrom, s.start, s.end)] = str(i + 1)
        for s in segment_list:
            cl.append((seg_id[(s.chrom, s.start, s.end)], s.strand))
        for ii in range(len(self.segment_list)):
            s = self.segment_list[ii]
            s.info = [seg_id[(s.chrom, s.start, s.end)]]
        self.cycle_dict = {'1': ('1', 1, cl)}
        self.ilist = hg.interval_list([s[0] for s in segment_set.merge_clusters(extend=1)])
        for ii in range(len(self.ilist)):
            self.ilist[ii].info = [str(ii)]
    else:
        # Segments and cycles supplied directly.
        self.segment_list = segment_list
        self.segment_dict = {s.info[0]: s for s in segment_list}
        self.cycle_dict = {c[0]: c for c in cycle_list}
        if ilist is not None:
            self.ilist = ilist
        else:
            self.ilist = hg.interval_list([s[0] for s in segment_list.merge_clusters(extend=1)])
            for ii in range(len(self.ilist)):
                self.ilist[ii].info = [str(ii)]
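# Hedged usage sketch: the class that owns the __init__ above is not named in this
# snippet, so CycleDecomposition below is a hypothetical placeholder for it; the file
# content is a toy one-segment cycle in the cycles-file format parsed above.
def _example_parse_cycles_text():
    example_content = ("Interval 1 chr8 127600000 128000000\n"
                       "Segment 1 chr8 127638302 127938302\n"
                       "Cycle=1;Copy_count=25.0;Segments=1+\n")
    decomposition = CycleDecomposition(file_content=example_content)
    # cycle_dict maps cycle id -> (id, copy_count, [(segment_id, strand), ...])
    return decomposition.cycle_dict['1']  # ('1', 25.0, [('1', 1)])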
def load_aa_result(in_dir, prefix):
    summary_file = "%s/%s_summary.txt" % (in_dir, prefix)
    all_map = {}
    if not os.path.exists(summary_file):
        return None
    amps = read_summary_file(summary_file)
    if amps is None:
        return None
    for (ai, a) in amps.items():
        ints = a['Intervals'].split(',')
        cycle_file = "%s/%s_amplicon%s_cycles.txt" % (in_dir, prefix, ai)
        (segment_map, interval_map, cycle_map) = read_cycle_file(cycle_file)
        all_map.setdefault(ai, {})['segment_map'] = segment_map
        all_map.setdefault(ai, {})['interval_map'] = interval_map
        all_map.setdefault(ai, {})['cycle_map'] = cycle_map
        # Accumulate per-segment copy counts over all cycles that use the segment;
        # segment '0' is the source/sink marker and is skipped.
        for cycle in cycle_map.keys():
            segments = hg19.interval_list([segment_map[c[0:-1]]
                                           for c in cycle_map[cycle]['cycle'] if c[0] != '0'])
            for s in segments:
                s.info.setdefault('copy_count', 0)
                s.info['copy_count'] += cycle_map[cycle]['copy_count']
    all_map['amplicons'] = amps
    return all_map
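# Hedged usage sketch: assumes an AmpliconArchitect output directory laid out as
# <in_dir>/<prefix>_summary.txt plus per-amplicon cycles files, and that
# read_summary_file/read_cycle_file are the helpers defined elsewhere in this module.
# The directory path is hypothetical.
def _example_load_aa_result():
    result = load_aa_result('/path/to/aa_output', 'sample1')
    if result is None:
        return []
    # result holds one entry per amplicon id plus the 'amplicons' summary dict.
    return [(ai, amp['Intervals']) for ai, amp in result['amplicons'].items()]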
def parse_bed(segment_file):
    # Read a 3+ column tab-separated BED file into an interval list.
    amplicons = hg19.interval_list()
    with open(segment_file, 'r') as bed_input:
        for line in bed_input:
            res = line.strip().split('\t')
            amplicons.append(hg19.interval(res[0], int(res[1]), int(res[2])))
    return amplicons
def build_segments(self, bed_data=None):
    if bed_data is None:
        bed_data = self.bed_data
    points_x = []
    points_y = []
    colors = []
    fpoints_x = []
    fpoints_y = []
    fcolors = []
    previous_end = total_length_with_spacing * (global_rot / 360.0)
    for ind, sp in enumerate(start_points):
        start_point = int(previous_end - sp)
        start_angle = start_point / total_length_with_spacing * 360
        end_angle = (start_point - lens[ind]) / total_length_with_spacing * 360
        # segSeqD, cycle, start_points, lens, total_length_with_spacing, global_rot
        # and ax are referenced as globals here.
        segment = segSeqD[cycle[ind][0]]
        strand = cycle[ind][1]
        hits = [h[0] for h in bed_data.intersection([segment])]
        if self.color_bed is not None:
            color_subhits = hg19.interval_list([h[0] for h in self.color_bed.intersection([segment])])
        for h in hits:
            for pos in xrange(h.start, h.end, self.point_spacing):
                if pos > segment.end or pos < segment.start:
                    continue
                # Per-point color: color_bed overrides, then the hit's own color, then the default.
                if self.color_bed is not None:
                    temp = hg19.interval(h.chrom, pos, pos)
                    color_hits = color_subhits.intersection([temp], self.point_spacing)
                    if len(color_hits) != 0:
                        color = color_hits[0][0].info['color']
                    else:
                        color = self.color if 'color' not in h.info else h.info['color']
                else:
                    color = self.color if 'color' not in h.info else h.info['color']
                # Map the genomic position onto the circle, respecting segment strand.
                if strand == "+":
                    normStart = start_point - max(0, pos - segment.start)
                    normEnd = start_point - min(segment.end - segment.start, pos - segment.start)
                else:
                    normEnd = start_point - min(segment.end - segment.start, segment.end - pos)
                    normStart = start_point - max(0, segment.end - pos)
                # Clamp the value into [ymin, ymax], then scale into the track's radial range.
                hvalue = h.info['value'] if h.info['value'] > self.ymin else self.ymin
                hvalue = hvalue if hvalue < self.ymax else self.ymax
                y_scale_value = (1. * hvalue - self.ymin) / (self.ymax - self.ymin)
                if self.is_log:
                    y_scale_value = (math.log10(hvalue) - math.log10(self.ymin)) / \
                                    (math.log10(self.ymax) - math.log10(self.ymin))
                r_scale_value = y_scale_value * (self.track_rmax - self.track_rmin) + self.track_rmin
                x_s, y_s = pol2cart(r_scale_value, normStart / total_length_with_spacing * 2 * np.pi)
                if 'fill' in h.info:
                    fpoints_x.append(x_s)
                    fpoints_y.append(y_s)
                    fcolors.append(color)
                else:
                    points_x.append(x_s)
                    points_y.append(y_s)
                    colors.append(color)
    ax.scatter(points_x, points_y, marker='o', s=1, linewidths=0.01, facecolors='none', color=colors)
    ax.scatter(fpoints_x, fpoints_y, marker='*', s=1, linewidths=0.01, color=fcolors)
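# pol2cart is called above but not defined in this snippet; a standard polar-to-Cartesian
# conversion like the sketch below is presumably what it does (rho = radius, phi = angle
# in radians).
def pol2cart(rho, phi):
    # Convert polar coordinates to Cartesian (x, y).
    x = rho * np.cos(phi)
    y = rho * np.sin(phi)
    return x, y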
def build_genebed_from_fpkm(fpkm):
    # Convert {ensembl_gene_id: fpkm} into a sorted interval list of gene bodies.
    fpkm_bed = hg19.interval_list()
    for (g, f) in fpkm.items():
        if g not in ensembl_grc37_map:
            continue
        gene = ensembl_grc37_map[g]
        fpkm_bed.append(hg19.interval(gene.chrom, gene.start, gene.end,
                                      info={'value': f, 'name': gene.info['Name']}))
    fpkm_bed.sort()
    return fpkm_bed
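# Hedged usage sketch: fpkm is assumed to be a dict of Ensembl gene id -> FPKM value
# (e.g. parsed from a cufflinks-style table); the MYC value below is illustrative only.
def _example_build_genebed():
    fpkm = {'ENSG00000136997': 512.3}  # MYC
    fpkm_bed = build_genebed_from_fpkm(fpkm)
    return [(g.info['name'], g.info['value']) for g in fpkm_bed]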
def classify_amplicon(amplicon, threshold_copy=4, min_length=50000):
    # Classify an amplicon as 'Cyclic', 'Complex', or 'Amplification'.
    # Check for a cyclic cycle, no threshold on the interval list itself.
    intervals = hg19.interval_list([i for i in amplicon['interval_map'].values()])
    intervals.sort()
    cycles = amplicon['cycle_map'].keys()
    cycles = sorted(cycles, key=lambda x: amplicon['cycle_map'][x]['copy_count'], reverse=True)
    iscycle = False
    iscomplex = False
    for cycle in cycles:
        segments = hg19.interval_list([amplicon['segment_map'][s[0:-1]]
                                       for s in amplicon['cycle_map'][cycle]['cycle']
                                       if s[0:-1] != '0'])
        segments.sort()
        hits = hg19.interval_list(set([h[0] for h in intervals.intersection(segments)]))
        hits.sort()
        # Spanning multiple chromosomes or multiple intervals marks the amplicon complex.
        chrs = set([s.chrom for s in segments])
        if len(chrs) > 1 or len(hits) > 1:
            iscomplex = True
        # Sometimes the length is 0, not sure why
        length = sum([s.end - s.start for s in segments])
        if length == 0:
            continue
        copies = sum([(s.end - s.start) * s.info['copy_count'] for s in segments]) / length
        # A cycle whose walk does not start at the source/sink segment '0' is a true cycle;
        # call it cyclic if its length-weighted copy count clears both thresholds.
        if amplicon['cycle_map'][cycle]['cycle'][0][0] != '0':
            if copies < threshold_copy or length < min_length:
                continue
            iscycle = True
    if iscycle:
        return 'Cyclic'
    elif iscomplex:
        return 'Complex'
    else:
        return 'Amplification'
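# Hedged usage sketch: classify every amplicon loaded by load_aa_result above
# ('amplicons' is the summary entry, not an amplicon id; the path is hypothetical).
def _example_classify_amplicons():
    result = load_aa_result('/path/to/aa_output', 'sample1')
    if result is None:
        return {}
    return {ai: classify_amplicon(amp)
            for ai, amp in result.items() if ai != 'amplicons'}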
def load_bed(bed_file, value=None, log=False, sep='\t'):
    # Load a BED file into an interval list. If value is None, the 4th column supplies
    # each interval's value; with log=True the column is assumed to be log10-scaled and
    # is converted back to linear scale. Otherwise every interval gets the fixed value.
    bed_data = hg19.interval_list()
    for line in open(bed_file):
        res = line.strip().split(sep)
        if value is None:
            bed_data.append(hg19.interval(res[0], int(res[1]), int(res[2]),
                                          info={'value': float(res[3]) if not log else 10 ** float(res[3])}))
        else:
            bed_data.append(hg19.interval(res[0], int(res[1]), int(res[2]), info={'value': value}))
    bed_data.sort()
    return bed_data
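# Hedged usage sketch: load a log10-scaled pileup track (hypothetical path) and report
# the peak linear value after the 10**x conversion applied above.
def _example_load_bed():
    coverage = load_bed('/path/to/coverage.log.bed', log=True)
    return max(b.info['value'] for b in coverage)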
def load_ensembl_grc37():
    # Parse the Ensembl GRCh37 GTF into per-gene intervals keyed by gene_id.
    input_gtf = open('/pedigree2/projects/namphuon/data/references/hg19/annotations/Homo_sapiens.GRCh37.64.gtf', 'r')
    ensembl_data = hg19.interval_list()
    ensembl_grc37_map = {}
    for line in input_gtf:
        res = line.split('\t')
        # Parse the GTF attribute column, e.g. gene_id "ENSG..."; gene_name "MYC"
        info = dict([r.strip().replace('"', '').split(' ')
                     for r in res[-1].strip().split('; ') if len(r.split(' ')) == 2])
        temp = hg19.interval("chr%s" % res[0], int(res[3]), int(res[4]), info={'data': info})
        ensembl_data.append(temp)
        ensembl_grc37_map.setdefault(info['gene_id'], []).append(temp)
    input_gtf.close()
    # Collapse each gene's records into a single spanning interval.
    ensembl_grc37 = hg19.interval_list()
    for g in ensembl_grc37_map:
        records = ensembl_grc37_map[g]
        start = min([e.start for e in records])
        end = max([e.end for e in records])
        name = [e.info['data']['gene_name'] for e in records if 'gene_name' in e.info['data']]
        name = name[0] if len(name) >= 1 else g
        ensembl_grc37.append(hg19.interval(records[0].chrom, start, end,
                                           info={'intervals': records, 'GeneID': g, 'Name': name}))
    ensembl_grc37.sort()
    for e in ensembl_grc37:
        ensembl_grc37_map[e.info['GeneID']] = e
    return (ensembl_grc37, ensembl_grc37_map)
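# Illustration of the attribute parsing used above, isolated on a sample GTF 9th column
# (the gene id is MYC's real Ensembl id; the parsing is exactly the expression above).
def _example_parse_gtf_attributes():
    attrs = 'gene_id "ENSG00000136997"; gene_name "MYC"'
    info = dict([r.strip().replace('"', '').split(' ')
                 for r in attrs.strip().split('; ') if len(r.split(' ')) == 2])
    return info  # {'gene_id': 'ENSG00000136997', 'gene_name': 'MYC'}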
def get_cyclic_path(amplicon, threshold=10000):
    # Collect the segments of every true cycle (one whose walk does not start at the
    # source/sink segment '0') that is at least `threshold` bp long.
    paths = hg19.interval_list()
    for cycle in amplicon['cycle_map'].keys():
        length = sum([amplicon['segment_map'][s[0:-1]].end - amplicon['segment_map'][s[0:-1]].start
                      for s in amplicon['cycle_map'][cycle]['cycle'] if s[0] != '0'])
        if (amplicon['cycle_map'][cycle]['cycle'][0][0] != '0' and length >= threshold):
            paths.extend([amplicon['segment_map'][s[0:-1]]
                          for s in amplicon['cycle_map'][cycle]['cycle']])
    return paths
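# Hedged usage sketch: amplicon ids in the load_aa_result map are assumed to be the
# string keys read from the summary file ('1' below is illustrative, path hypothetical).
def _example_cyclic_paths():
    result = load_aa_result('/path/to/aa_output', 'sample1')
    paths = get_cyclic_path(result['1'], threshold=10000)
    return sum(p.end - p.start for p in paths)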
def find_peaks(bed_data, window=1000):
    # Greedy scan for local maxima: within a run of intervals that intersect the run's
    # start (extended by `window`), keep only the interval with the largest value.
    bed_data.sort()
    current = bed_data[0]
    keeps = hg19.interval_list()
    start = current
    for i in xrange(0, len(bed_data)):
        if bed_data[i].chrom != start.chrom:
            keeps.append(current)
            current = bed_data[i]
            start = current
        elif bed_data[i].intersects(start, window) and bed_data[i].info['value'] > current.info['value']:
            current = bed_data[i]
        elif not bed_data[i].intersects(start, window):
            keeps.append(current)
            current = bed_data[i]
            start = current
    keeps.append(current)
    return keeps
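# Hedged usage sketch: find local coverage maxima on the log-scaled pileup track loaded
# with load_bed above (hypothetical path).
def _example_find_peaks():
    coverage = load_bed('/path/to/coverage.log.bed', log=True)
    peaks = find_peaks(coverage, window=1000)
    return [(p.chrom, p.start, p.info['value']) for p in peaks]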
"unable to set AA_DATA_REPO variable. Setting to working directory") DATA_REPO = '.' if DATA_REPO == '.' or DATA_REPO == '': logging.warning( "#TIME " + '%.3f\t' % (clock() - TSTART) + "AA_DATA_REPO not set or empy. Setting to working directory") DATA_REPO = '.' logging.info("#TIME " + '%.3f\t' % (clock() - TSTART) + "Loading libraries and reference annotations for: " + args.ref) import hg19util as hg import bam_to_breakpoint as b2b logging.info("#TIME " + '%.3f\t' % (clock() - TSTART) + "Initiating bam_to_breakpoint object for: " + args.bam) rdList0 = hg.interval_list(rdAlts, 'bed', exclude_info_string=True) rdList = hg.interval_list([r for r in rdList0]) coverage_stats_file = open(hg.DATA_REPO + "/coverage.stats") cstats = None cb = bamFile if cbam is not None: cb = cbam for l in coverage_stats_file: ll = l.strip().split() if ll[0] == os.path.abspath(cb.filename): cstats = tuple(map(float, ll[1:])) coverage_stats_file.close() coverage_windows = None if cbed is not None: coverage_windows = hg.interval_list(cbed, 'bed') coverage_windows.sort()
args = parser.parse_args()
global_names.REF = args.ref
import hg19util as hg

if args.bed != '':
    rdAlts = args.bed
if args.out != '':
    outname = args.out + ".bed"
else:
    outname = os.path.splitext(rdAlts)[0] + "_amplified.bed"
GAIN, CNSIZE_MIN = args.gain, args.cnsize_min

rdList0 = hg.interval_list(rdAlts, 'bed')
if rdList0:
    try:
        if len(rdList0[0].info) == 0:
            sys.stderr.write("ERROR: CNV estimate bed file had too few columns.\n"
                             "Must contain: chr pos1 pos2 cnv_estimate\n")
            sys.exit(1)
        _ = float(rdList0[0].info[-1])
    except ValueError:
        sys.stderr.write("ERROR: CNV estimates must be in last column of bed file.\n")
        sys.exit(1)

rdList = hg.interval_list([r for r in rdList0 if float(r.info[-1]) > GAIN])
else:
    samp_name = args.sname.rsplit("/")[-1]

fname = samp_name
bed_feat_dict = {}
if args.bed_files:
    for i, j in zip(args.bed_files, args.feature_labels):
        print j, i
        # feature name -> chromosome -> ordered list of positions
        bed_list = parse_bed_file(i)
        bed_feat_dict[j] = feat_bed_to_lookup(bed_list)

outer_bar = max(bed_track_height * (len(bed_feat_dict) + 2), 10)

bed_data = hg19.interval_list([hg19.interval('chr8', 127638302, 127938302, info={'value': int(random.random() * 100)}),
                               hg19.interval('chr8', 128716346, 128746346, info={'value': int(random.random() * 100)})])
bed_data.sort()

# Hardcoded debug inputs: the COLO320DM block below is immediately overridden by the PC3 block.
args.prefix_name = '/pedigree2/projects/namphuon/programs/CycleViz/COLO320DM'
args.cycles_file = '/pedigree2/projects/namphuon/data/paul_gbm39/unsorted/COLO320_DM_S270/onco_amplicon1_cycles.txt'
args.fpkm_file = '/pedigree2/projects/namphuon/data/paul_gbm39/unsorted/COLO320_DM_S270/colo320dm.fpkm.csv'
args.wgs_file = '/pedigree2/projects/namphuon/data/paul_gbm39/unsorted/COLO320_DM_S270/colo320dm.wgs.1000.pileup.log.bed'
cycles_numbers = ['6', '9', '10', '12', '13', '14', '15', '16', '19']
args.atac_peak_file = '/pedigree2/projects/namphuon/data/paul_gbm39/unsorted/ATAC-seq/SRC1655_summits_250ext_q1e6_nochrM_merged.bed'
args.atac_file = '/pedigree2/projects/namphuon/results/paul_gbm39/ATAC/COLO320DM.atac.1000.pileup.log.bed'

args.prefix_name = '/pedigree2/projects/namphuon/programs/CycleViz/PC3'
args.cycles_file = '/nucleus/pedigree/projects/extrachromosome/data/turner2017/reconstruction/run14/FF-77_amplicon4_cycles.txt'
args.fpkm_file = '/pedigree2/projects/namphuon/results/paul_gbm39/rnaseq/PC3.fpkm.csv'
args.wgs_file = '/pedigree2/projects/namphuon/results/paul_gbm39/PC3/PC3.wgs.1000.pileup.log.bed'
global_names.REF = args.ref
import hg19util as hg

if args.bed != '':
    rdAlts = args.bed
if args.out != '':
    outname = args.out + ".bed"
else:
    outname = os.path.splitext(rdAlts)[0] + "_amplified.bed"
GAIN, CNSIZE_MIN = args.gain, args.cnsize_min

rdList0 = hg.interval_list(rdAlts, 'bed')
if rdList0:
    try:
        if len(rdList0[0].info) == 0:
            sys.stderr.write("ERROR: CNV estimate bed file had too few columns.\n"
                             "Must contain: chr pos1 pos2 cnv_estimate\n")
            sys.exit(1)
        _ = float(rdList0[0].info[-1])
    except ValueError:
        sys.stderr.write("ERROR: CNV estimates must be in last column of bed file.\n")
        sys.exit(1)

rdList = hg.interval_list([r for r in rdList0 if float(r.info[-1]) > GAIN])

if args.bam != "":
from collections import defaultdict
import pysam
import hg19util as hg

f = pysam.AlignmentFile("/pedigree2/projects/namphuon/data/SCC090/pacbio/merged.bam")
segs = defaultdict(lambda: [], {})
readlen = {}
refi = hg.interval_list([hg.interval(i) for i in f.references])
segi = 1
qi = 0
qindex = {}
qlist = []
for l in f.fetch():
    # References are named like "chr1:1000-2000"; convert alignment coordinates
    # back to absolute positions on the underlying chromosome.
    ref = l.reference_name.split(':')[0]
    ref_start = int(l.reference_name.split(':')[1].split('-')[0]) + l.reference_start
    ref_end = int(l.reference_name.split(':')[1].split('-')[0]) + l.reference_end
    qstart = l.query_alignment_start
    qend = l.query_alignment_end
    if l.query_name not in qindex:
        qindex[l.query_name] = qi
        qlist.append(l.query_name)
        qi += 1
    if l.is_reverse:
        # Express query coordinates on the forward strand of the read.
        qstart = l.infer_query_length() - l.query_alignment_end
        qend = l.infer_query_length() - l.query_alignment_start
coverage_stats_file = open(hg.DATA_REPO + "/coverage.stats")
cstats = None
cb = bamFile
if cbam is not None:
    cb = cbam
for l in coverage_stats_file:
    ll = l.strip().split()
    if ll[0] == os.path.abspath(cb.filename):
        cstats = tuple(map(float, ll[1:]))
coverage_stats_file.close()
coverage_windows = None
if cbed is not None:
    coverage_windows = hg.interval_list(cbed, 'bed')
    coverage_windows.sort()
if cstats is None and cbam is not None:
    cbam2b = b2b.bam_to_breakpoint(cbam, coverage_stats=cstats, coverage_windows=coverage_windows)
    cstats = cbam2b.basic_stats
elif cstats is None:
    bamFileb2b = b2b.bam_to_breakpoint(bamFile, coverage_stats=cstats, coverage_windows=coverage_windows)
    cstats = bamFileb2b.basic_stats

# Downsample only if the current mean coverage (cstats[0]) exceeds the target.
final = args.final
if cstats[0] <= final:
    exit()
ratio = float(final) / float(cstats[0])
                    metavar='FILE', action='store', type=str, nargs=1, default=[])
args = parser.parse_args()

rdAltsl = []
if args.bed[0] != '':
    rdAltsl.append(args.bed[0])
elif len(args.bedlist) != 0 and args.bedlist[0] != '':
    for l in open(args.bedlist[0]):
        rdAltsl.append(l.strip())

for rdAlts in rdAltsl:
    rdList0 = hg.interval_list(rdAlts, 'bed')
    rdList = hg.interval_list([r for r in rdList0 if float(r.info[1]) > GAIN])
    if args.bam != "":
        import bam_to_breakpoint as b2b
        if os.path.splitext(args.bam[0])[-1] == '.cram':
            bamFile = pysam.Samfile(args.bam[0], 'rc')
        else:
            bamFile = pysam.Samfile(args.bam[0], 'rb')
        coverage_stats_file = open(hg.DATA_REPO + "/coverage.stats")
        cstats = None
        cb = bamFile
        for l in coverage_stats_file:
            ll = l.strip().split()
            if ll[0] == os.path.abspath(cb.filename):
                cstats = tuple(map(float, ll[1:]))
                    metavar='FILE', action='store', type=str, nargs=1)
args = parser.parse_args()

rdAlts = args.rdAlts[0]
bamFile = pysam.Samfile(args.bam[0], 'rb')
outName = args.outName[0]
logging.basicConfig(filename=outName + '.log', level=logging.DEBUG)
logging.info("#TIME " + str(clock()) + " import done")
summary_logger = logging.getLogger('summary')
summary_logger.addHandler(logging.FileHandler(outName + '_summary.txt', 'w'))
graph_logger = logging.getLogger('graph')
cycle_logger = logging.getLogger('cycle')

rdList0 = hg.interval_list(rdAlts, 'bed')
rdList = hg.interval_list([r for r in rdList0])

coverage_stats_file = open(hg.DATA_REPO + "/coverage.stats")
cstats = None
for l in coverage_stats_file:
    ll = l.strip().split()
    if ll[0] == os.path.abspath(bamFile.filename):
        cstats = tuple(map(float, ll[1:]))
coverage_stats_file.close()
coverage_windows = None
# coverage_windows = hg.interval_list('universal_coverage_estimation_rep1_seq_coords_hg19.tsv', 'bed')
# coverage_windows.sort()
bamFileb2b = b2b.bam_to_breakpoint(bamFile, coverage_stats=cstats, coverage_windows=coverage_windows)
# exit()
def draw_episome(self, input_files, output_file=None, auto_scale=0):
    cycles_section_top = 30
    cycles_section_size = 0
    space_between_decompositions = 90 / (1 + auto_scale)
    bottoms = []
    # First pass: compute the vertical extent of each decomposition.
    for i in range(len(input_files)):
        if i != 0:
            cycles_section_size += space_between_decompositions
        input_content = input_files[i][1]
        intervals, segments, seg_name_to_index_map, cycles, directions, chr_offs, copy_counts, cycles_names = \
            self.readDataFile(input_content)
        number_of_element = sum(len(x) for x in cycles)
        cycles_section_size += number_of_element * 10
        for cycle in cycles:
            if cycle[-1] != 0:
                cycles_section_size += 10
        bottoms.append(cycles_section_top + cycles_section_size)
    tops = [cycles_section_top] + [bottom + space_between_decompositions for bottom in bottoms[:-1]]
    cycles_section_bottom = cycles_section_top + cycles_section_size
    # print ('cycle section_top:', cycles_section_top)
    # print ('cycle section_bottom:', cycles_section_bottom)
    # Second pass: draw each decomposition in its allotted vertical band.
    for i in range(len(input_files)):
        input_content = input_files[i][1]
        self.file_names.append(Text('%s: %s' % (str(i + 1), input_files[i][0]), 0.5, tops[i] - 27))
        # print ('bottom:', bottoms[i])
        # print ('top:', tops[i])
        intervals, segments, seg_name_to_index_map, cycles, directions, chr_offs, copy_counts, cycles_names = \
            self.readDataFile(input_content)
        if i == 0:
            self.reconstructed_cycles = [cname for cname in cycles_names]
            self.reconstructed_segments = [segment_count for segment_count in range(len(segments))]
            self.compute_chr_offsets(chr_offs)
        if len(intervals) == 0:
            intervals = self.compute_intervals(segments)
        ilist = hg.interval_list([hg.interval(chr_name, start_point, end_point)
                                  for chr_name, start_point, end_point in intervals])
        maxIntvl = self.findMaxIntervals(segments)
        sortedL = {}
        compact = {}
        span = {}
        for ch in maxIntvl.keys():
            sortedL[ch] = self.makeListOfSegmentEndPoints(segments, ch)
            sortedCopy = list(sortedL[ch])
            mergeL = self.mergeIntervals(sortedCopy)
            compact[ch], span[ch] = self.compactIntervals(mergeL, maxIntvl[ch])
        newsegs = self.convertSegmentCoordinates(segments, span, compact, maxIntvl, ilist)
        if i == 0:
            self.drawSections(intervals, ilist, cycles_section_top, cycles_section_bottom, auto_scale)
        # self.drawAxesLabels(sortedL, compact, maxIntvl, span, chr_offs, bottoms[i])
        # self.drawAxes(span, compact, maxIntvl, chr_offs, bottoms[i])
        # self.drawAxesDottedLines(sortedL, compact, maxIntvl, span, chr_offs, tops[i], bottoms[i])
        self.drawCycles(newsegs, seg_name_to_index_map, cycles, directions, tops[i], chr_offs, i,
                        copy_counts, cycles_names)
    reffile.close()
except:
    logging.warning("#TIME " + '%.3f\t' % clock() +
                    "unable to set reference in $AA_DATA_REPO/reference.txt. Setting in working directory.")

logging.info("#TIME " + '%.3f\t' % clock() + " Loading libraries and reference annotations for: " + args.ref)
import hg19util as hg
import bam_to_breakpoint as b2b
from breakpoint_graph import *

logging.info("#TIME " + '%.3f\t' % clock() + " Initiating bam_to_breakpoint object for: " + args.bam[0])
rdList0 = hg.interval_list(rdAlts, 'bed')
rdList = hg.interval_list([r for r in rdList0])

coverage_stats_file = open(hg.DATA_REPO + "/coverage.stats")
cstats = None
cb = bamFile
if cbam is not None:
    cb = cbam
for l in coverage_stats_file:
    ll = l.strip().split()
    if ll[0] == os.path.abspath(cb.filename):
        cstats = tuple(map(float, ll[1:]))
coverage_stats_file.close()
coverage_windows = None
if cbed is not None:
    coverage_windows = hg.interval_list(cbed, 'bed')
    coverage_windows.sort()
parser = argparse.ArgumentParser(description="Cycles File")
parser.add_argument('--cycles', dest='cycles_file',
                    help="File listing cycles in amplicon",
                    metavar='FILE', action='store', type=str, nargs=1)
args = parser.parse_args()
cycles_file = args.cycles_file[0]

ll = [l.strip().split() for l in open(cycles_file) if len(l.strip()) > 0]
segments = hg.interval_list([hg.interval(l[2], int(l[3]), int(l[4]), info=[int(l[1])])
                             for l in ll if l[0] == 'Segment'])
# Tag each segment as human or viral by chromosome name.
for s in segments:
    if s.chrom[:3] == 'chr':
        s.info.append('Human')
    else:
        s.info.append('Viral')
segments.sort()
segment_id_dict = {s.info[0]: s for s in segments}

cycles = []
for c in [l[0].split(';') for l in ll if 'Cycle=' in l[0]]:
    c_dict = {cc.split('=')[0]: cc.split('=')[1] for cc in c}
    new_dict = {}
    new_dict['Cycle'] = int(c_dict['Cycle'])
    new_dict['Copy_count'] = float(c_dict['Copy_count'])
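# Illustration of the parse above: a cycles-file line such as
#   Cycle=1;Copy_count=25.0;Segments=1+,2-
# yields c_dict = {'Cycle': '1', 'Copy_count': '25.0', 'Segments': '1+,2-'}, and
# new_dict begins as {'Cycle': 1, 'Copy_count': 25.0} before the segment walk is attached.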
sys.setrecursionlimit(10000)
import argparse
import hg19util as hg

GAIN = 5
CNSIZE_MIN = 100000

parser = argparse.ArgumentParser(description="Filter and merge amplified intervals")
parser.add_argument('--bed', dest='bed',
                    help="Bed file with list of amplified intervals",
                    metavar='FILE', action='store', type=str, nargs=1)
args = parser.parse_args()
rdAlts = args.bed[0]
rdList0 = hg.interval_list(rdAlts, 'bed')
rdList = hg.interval_list([r for r in rdList0 if float(r.info[1]) > GAIN])
genome_features = hg.oncogene_list

amplicon_listl = rdList
amplicon_listl = hg.interval_list([a for a in amplicon_listl if a.size() > CNSIZE_MIN])
amplicon_listl.sort()

# Drop candidates mostly explained by conserved regions: keep an interval if it misses
# the conserved list entirely, exceeds ten times its conserved overlap (and 1 Mbp), or
# retains more than 2 Mbp outside the overlap.
cr = hg.conserved_regions
uc_list = hg.interval_list([])
for a in amplicon_listl:
    if (len(hg.interval_list([a]).intersection(cr)) == 0 or
            a.size() > max(1000000, 10 * sum([a.intersection(ci[1]).size()
                                              for ci in hg.interval_list([a]).intersection(cr)])) or
            a.size() - sum([a.intersection(ci[1]).size()
                            for ci in hg.interval_list([a]).intersection(cr)]) > 2000000):