'score']].drop_duplicates().sort_values(by=['chrom', 'start']) elif strd == True: bed['name'] = 'range_id_R' + (bed['index'] + 1).astype(str) + '_' + \ bed['chrom'].astype(str) + '_' + \ bed['start'].astype(str) + '_' + \ bed['end'].astype(str) + '_' + \ bed['strand'].astype(str) bed = bed[['chrom', 'start', 'end', 'name', 'score', 'strand']].drop_duplicates().sort_values(by=['chrom', 'start']) if not args.debug: pass else: bed = bed.head(args.debug) BedTool.from_dataframe(bed).saveas(args.outfile + '.datamatrix/genomic_ranges.bed') print() print("Generating FASTA sequences for each entry") if args.create_fastas == True: Popen('mkdir -p ' + args.outfile + '.datamatrix/temp/fastas', shell=True) entry_list = list(range(len(bed))) if __name__ == '__main__': p = Pool((args.ncores)) p.map(make_fasta, entry_list) else:
for k in chromosomes.keys() } for k in chromosomes.keys(): print chrom_Intervals[k][0:10] with open('humanBedInterval.bed', 'w') as f: for k in chromosomes.keys(): f.write('\n'.join( '%s\t%d\t%d' % (k, chrom_Intervals[k][i], chrom_Intervals[k][i + 1] - 1) for i in range(len(chrom_Intervals[k]) - 1)) + '\n') a = BedTool('humanBedInterval.bed').sort() b = BedTool('/mnt/disks/data-vcf/GSN79Tumor_normal.vcf') print a.head() print b.head() a.coverage(b).saveas('VCFCoverage.bed') #,hist=True #for k in chromosomes.keys(): positionHistogram = defaultdict(list) with open('/mnt/disks/data-vcf/GSN79Tumor_normal.vcf', 'r') as f: for line in f.readlines(): if line and line.startswith('#') == 0: if line.split('\t')[0] not in positionHistogram.keys(): positionHistogram[line.split('\t')[0]] = [] positionHistogram[line.split('\t')[0]].append( int(line.split('\t')[1]))
class Bed:
    """
    Thin wrapper around a pybedtools.BedTool object.

    It keeps track of whether the backing file is a temporary file owned by
    this object, and exposes a handful of BedTool operations.

    Attributes
    ----------
    bedtool: BedTool
        The wrapped BedTool object.
    fn: string
        Read-only property; the filename reported by the wrapped BedTool
        (``self.bedtool.fn``).
    istmp: bool
        Flag for whether this Bed() object is temporary. If temporary, the
        file at self.fn is deleted upon destruction.

    Notes
    -----
    ``len(bed)`` returns the number of lines in the backing file (0 when
    there is no backing file).
    """

    def __init__(self, fn=None):
        """
        Initializer of Bed class.

        Parameters
        ----------
        fn: None, BedTool, string, tuple, list, generator
            None: Empty Bed() object.
            BedTool: wraps the given, already existing BedTool object.
            string: filename, wraps the file with name {fn}.
            list/tuple/generator (any non-string iterable): the items are
                written to a temporary file via write_tmp_bed(), which is
                then wrapped. This automatically sets self.istmp=True.

        Raises
        ------
        NotImplementedError
            If {fn} is of none of the supported types.
        """
        # Local import: the ``collections.Iterable`` alias was removed in
        # Python 3.10, the ABC only lives in ``collections.abc`` now.
        from collections.abc import Iterable

        # flag, set to true to delete the bed file during destruction
        self.istmp = False
        if fn is None:
            self.bedtool = BedTool()
        elif isinstance(fn, BedTool):
            self.bedtool = fn
        elif isinstance(fn, str):
            # must be tested before Iterable, since str is iterable as well
            self.bedtool = BedTool(fn)
        elif isinstance(fn, Iterable):
            tmpfn = write_tmp_bed(fn)
            self.bedtool = BedTool(tmpfn)
            self.istmp = True
        else:
            self.bedtool = None
            raise NotImplementedError(
                f'Initializing with type {type(fn)} is not implemented')

    @property
    def fn(self):
        """Filename reported by the wrapped BedTool."""
        return self.bedtool.fn

    def __len__(self):
        """Return the number of lines in the backing file (0 if none)."""
        if not self.fn:
            return 0
        with open(self.fn) as handle:
            return sum(1 for _ in handle)

    def __str__(self):
        return self.bedtool.__str__()

    def __getitem__(self, key):
        return self.bedtool.__getitem__(key)

    def __iter__(self):
        return self.bedtool.__iter__()

    def __eq__(self, target):
        """
        Two Bed() objects are equal when they have the same length and, if
        non-empty, their wrapped BedTool objects compare equal.
        """
        own_len, other_len = len(self), len(target)
        if own_len != other_len:
            return False
        if own_len == 0:
            return True
        return self.bedtool == target.bedtool

    def _delete_tmp(self):
        """
        Delete the backing file if this object owns it (self.istmp is set).

        A file that is already gone is ignored. The attributes are looked up
        defensively because this is also reached from __del__ on objects
        whose __init__ did not run to completion.
        """
        if not getattr(self, 'istmp', False):
            return
        path = getattr(getattr(self, 'bedtool', None), 'fn', None)
        if not isinstance(path, (str, bytes, os.PathLike)):
            # nothing on disk that we could remove
            return
        try:
            os.unlink(path)
        except FileNotFoundError:
            pass

    def __del__(self):
        """Remove the temporary file (if any) and drop the BedTool."""
        try:
            self._delete_tmp()
        finally:
            # 'bedtool' is missing when __init__ raised before assigning it
            self.__dict__.pop('bedtool', None)

    def head(self):
        """
        Wrapper for BedTool.head() method, which prints the first few
        intervals.
        """
        self.bedtool.head()

    def copy(self):
        """
        Create a temporary copy of the Bed() object.

        Returns
        -------
        newBed: Bed
            Bed() object backed by a fresh temporary copy of self.fn, with
            istmp=True.
        """
        fd, newfn = tempfile.mkstemp(suffix='.bed', prefix='sirius_')
        # mkstemp hands back an *open* descriptor; only the path is needed,
        # so close it right away instead of leaking it.
        os.close(fd)
        try:
            shutil.copyfile(self.fn, newfn)
        except Exception:
            # do not leave an orphaned temp file behind when the copy fails
            os.unlink(newfn)
            raise
        newBed = Bed(newfn)
        newBed.istmp = True
        return newBed

    def extend(self, d):
        """
        Modify the intervals by extending each to left and right by {d}
        """
        self.extend_asym(d, d)

    def extend_asym(self, dl, dr):
        """
        Extend the range of each interval, by dl to left and dr to right.

        The intervals are rewritten into a new temporary file, which
        replaces the current backing file; afterwards self.istmp is True.
        Only the first 8 fields of every interval are kept, and every
        interval must have at least 8 fields (ValueError otherwise).
        """
        # NOTE(review): start - dl is not clamped and may become negative
        # for intervals close to position 0 -- confirm whether downstream
        # tools accept that.

        def gen_iv_ext():
            # a generator is used so the extended intervals are produced
            # one at a time while write_tmp_bed() consumes them
            for iv in self.bedtool:
                chrom, start, end, name, score, strand, _id, tp = iv.fields[:8]
                yield (chrom, int(start) - dl, int(end) + dr,
                       name, score, strand, _id, tp)

        # create a new file with extended range
        newfn = write_tmp_bed(gen_iv_ext())
        # delete the old tmp file
        self._delete_tmp()
        # switch to the new file
        self.bedtool = BedTool(newfn)
        # we are now using a tmp file
        self.istmp = True

    def gids(self):
        """
        Return a set of gnode ids from self.bedtool (the 4th field of each
        interval).
        """
        return {iv[3] for iv in self.bedtool}

    def intersect(self, b):
        """
        intersect method wraps BedTool.intersect()

        Parameters
        ----------
        b: Bed
            Target Bed() object to be intersected with

        Returns
        -------
        c: Bed
            Temporary Bed() object that contains all intervals in {self}
            which intersects with {b}.
        """
        c = Bed()
        c.bedtool = self.bedtool.intersect(b.bedtool, u=True)
        c.istmp = True
        return c

    def window(self, b, window=1000):
        """
        window method wraps BedTool.window()

        Parameters
        ----------
        b: Bed
            Target Bed() object to be windowed with
        window: int, default 1000
            The window size

        Returns
        -------
        c: Bed
            Temporary Bed() object that contains all intervals in {self}
            which is within {window} from any interval in {b}.
        """
        c = Bed()
        c.bedtool = self.bedtool.window(b.bedtool, w=window, u=True)
        c.istmp = True
        return c