def exclusion_regions(blacklist_file, chip_seq_data):
    """
    Build the exclusion windows used when constructing negative sets.

    The ChIP-seq table is expected to hold peak centers (e.g. taken from a
    multiGPS bound bed file), for example:
        chr2 45 46
    Every record is widened by 250 bp on each side (a 1 bp peak center
    becomes a 501 bp window). These widened windows are then combined with
    the regions listed in the blacklist file.

    Parameters:
        blacklist_file (str): Path to the blacklist file.
        chip_seq_data (dataFrame): The pandas chip-seq data loaded by
            load_chipseq_data. Must provide 'chr', 'start' and 'end' columns.
    Returns:
        bound_exclusion_windows (BedTool): Only the widened windows around
            the binding sites.
        exclusion_windows (BedTool): The blacklist regions combined with the
            bound windows.
    """
    # assign() builds a new frame, so the caller's table is not modified.
    widened = chip_seq_data.assign(
        start=chip_seq_data['start'] - 250,
        end=chip_seq_data['end'] + 250,
    )
    bound_exclusion_windows = BedTool.from_dataframe(
        widened[['chr', 'start', 'end']])

    blacklist_exclusion_windows = BedTool(blacklist_file)
    exclusion_windows = BedTool.cat(
        blacklist_exclusion_windows, bound_exclusion_windows)
    return bound_exclusion_windows, exclusion_windows
def generate_remainder(whole_bed, bed_prefix, bed_list):
    """
    Calculate the remaining regions that are not included in the truth set

    All files in ``bed_list`` are concatenated, sorted and merged. The part of
    that merged set not covered by the truth regions is moved to
    ``/results/Analysis/MiSeq/MasterBED/GIAB/<bed_prefix>.remainder.bed``.

    :param whole_bed: path to the truth regions for the whole panel
    :param bed_prefix: prefix used for all the bed files
    :param bed_list: list of all the bed files for that panel
    :return: BEDTool containing any regions that are completely missing from the truth regions
    """
    whole_truth = BedTool(whole_bed)

    whole = BedTool()
    for index, bed in enumerate(bed_list):
        print(bed)
        tool = BedTool(bed)
        # Decide "first file" by position, not by value: comparing against
        # bed_list[0] would throw away everything accumulated so far whenever
        # the first path appears again later in the list.
        if index == 0:
            whole = tool
        else:
            whole = whole.cat(tool)

    # The former bare saveas() calls only wrote throw-away temp copies whose
    # return value was discarded, so they are omitted here.
    whole_merged = whole.sort().merge()

    remainder = whole_merged.subtract(whole_truth)
    remainder.moveto('/results/Analysis/MiSeq/MasterBED/GIAB/' + bed_prefix + '.remainder.bed')

    # A=True removes a merged region entirely as soon as any part of it
    # overlaps the truth set, leaving only regions with no truth coverage.
    missing_regions = whole_merged.subtract(whole_truth, A=True)
    return missing_regions
def bed_merge(list_of_beds, merged_bed):
    """
    Combine any number of bed files and write the result to ``merged_bed``.

    With two or more inputs the files are joined through ``BedTool.cat``
    using its default options. A single input is saved as-is.
    """
    first_bed, remaining_beds = list_of_beds[0], list_of_beds[1:]
    combined = BedTool(first_bed)
    if remaining_beds:
        combined = combined.cat(*remaining_beds)
    combined.saveas(merged_bed)
def negative_shuffle_wrapper(args, include_bed, num_copies, noOverlapping):
    """
    Draw negative windows by shuffling the positive windows.

    ``args`` is indexed positionally: [0] positive windows, [1] BedTool of
    regions to exclude, [2] bigwig files (not used here), [3] random seed.
    When ``num_copies`` is greater than 1 the positive windows are repeated
    that many times (no merging) before shuffling. Shuffled windows are
    placed inside ``include_bed`` and outside the excluded regions, using the
    module-level ``genome_sizes_file``.
    """
    windows = args[0]
    excluded_regions = args[1]
    seed = args[3]

    if num_copies > 1:
        copies = [windows] * num_copies
        windows = BedTool.cat(*copies, postmerge=False)

    return windows.shuffle(
        g=genome_sizes_file,
        incl=include_bed.fn,
        excl=excluded_regions.fn,
        noOverlapping=noOverlapping,
        seed=seed,
    )
def make_blacklist():
    """
    Build the blacklist BedTool from the module-level configuration.

    The regions in ``blacklist_file`` are extended by ``L`` bases on both
    sides. The first and last ``L`` bases of every chromosome listed in
    ``genome_sizes_file`` are then added.

    Returns:
        BedTool: extended blacklist regions combined with the chromosome ends.
    """
    blacklist = BedTool(blacklist_file)
    blacklist = blacklist.slop(g=genome_sizes_file, b=L)

    # Add ends of the chromosomes to the blacklist.
    # ndmin=2 keeps the table two-dimensional even when the genome file has a
    # single chromosome; otherwise the column indexing below would fail.
    genome_sizes_info = np.loadtxt(genome_sizes_file, dtype=str, ndmin=2)
    chroms = genome_sizes_info[:, 0].tolist()
    chroms_sizes = genome_sizes_info[:, 1].astype(int).tolist()

    chrom_ends = []
    for chrom, size in zip(chroms, chroms_sizes):
        # Clamp to the chromosome so that a chromosome shorter than L does
        # not produce a negative start or an end past the chromosome length.
        chrom_ends.append(Interval(chrom, 0, min(L, size)))
        chrom_ends.append(Interval(chrom, max(0, size - L), size))

    return blacklist.cat(BedTool(chrom_ends))
def get_chip_beds(input_dir):
    """
    Load and sort the ChIP bed files listed in ``<input_dir>/chip.txt``.

    Column 0 of chip.txt holds bed file names (relative to ``input_dir``) and
    column 1 holds the matching TF names.

    Returns a tuple ``(tfs, chip_beds, merged_chip_bed)``: the TF names, the
    sorted BedTools, and their combination through ``BedTool.cat`` (or the
    single sorted BedTool when only one file is listed).
    """
    table = np.loadtxt(input_dir + '/chip.txt', dtype=str)
    if table.ndim == 1:
        # A one-line file is read as a flat array; restore the row dimension.
        table = np.reshape(table, (-1, len(table)))

    tfs = list(table[:, 1])
    bed_paths = [input_dir + '/' + file_name for file_name in table[:, 0]]
    unsorted_beds = [BedTool(bed_path) for bed_path in bed_paths]

    print('Sorting BED files')
    chip_beds = [unsorted_bed.sort() for unsorted_bed in unsorted_beds]

    if len(chip_beds) > 1:
        merged_chip_bed = BedTool.cat(*chip_beds)
    else:
        merged_chip_bed = chip_beds[0]
    return tfs, chip_beds, merged_chip_bed
class GenomicSubset(object):
    """A named set of genomic intervals backed by a sorted BedTool.

    The intervals are read from ``path + name + '.bed'``. Unless the subset
    is the reference genome itself (``name == assembly``), the intervals are
    intersected with the reference genome of ``assembly``.
    """

    def __init__(self, name, path=paths.genome_subsets, assembly='hg19'):
        """Load ``path + name + '.bed'`` and restrict it to ``assembly``."""
        self.assembly = assembly
        self.name = name
        self.bedtool = BedTool(path + name + '.bed').sort()

        # Intersect the pathway with the appropriate genome build
        # TODO: this step should be unnecessary if the pathways are correct
        if name != self.assembly:
            self.bedtool = GenomicSubset.reference_genome(
                self.assembly).bedtool.intersect(self.bedtool).sort().saveas()

    def expand_by(self, expansion_in_each_direction_Mb):
        """Grow the subset in place by flanks of the given size (in Mb).

        Flanks are computed on both sides of every interval, merged, and
        then unioned with the current intervals. ``self.bedtool`` is replaced
        by the merged union.
        """
        print('total size before window addition:',
              self.bedtool.total_coverage(), 'bp')

        # compute the flanks
        # TODO: use 1cM instead of 1Mb
        print('computing flanks')
        flanks = self.bedtool.flank(
            genome=self.assembly,
            b=expansion_in_each_direction_Mb*1000000).sort().merge().saveas()

        # compute the union of the flanks and the pathway
        print('computing union')
        union = self.bedtool.cat(flanks, postmerge=False).sort()
        merged = union.merge().saveas()
        print('total size after window addition:',
              merged.total_coverage(), 'bp')
        self.bedtool = merged

    def restricted_to_chrom_bedtool(self, chrnum):
        """Return a BedTool holding only the intervals on ``chr<chrnum>``."""
        return self.bedtool.filter(
            lambda x : x[0] == 'chr' + str(int(chrnum))).saveas()

    @classmethod
    def reference_genome(cls, assembly='hg19'):
        """Return the subset describing the reference genome of ``assembly``."""
        return GenomicSubset(assembly, path=paths.reference, assembly=assembly)

    @classmethod
    def reference_chrom_bedtool(cls, chrnum, assembly='hg19'):
        """Return the reference intervals of ``assembly`` on ``chr<chrnum>``."""
        return cls.reference_genome(assembly=assembly).restricted_to_chrom_bedtool(chrnum)

    @classmethod
    def whole_genome(cls, assembly='hg19'):
        """Return the subset covering the whole genome of ``assembly``."""
        # Forward the assembly explicitly. Without it the instance falls back
        # to the 'hg19' default, so any other assembly would be recorded as
        # hg19 and intersected with the hg19 reference in __init__.
        return cls(assembly, path=paths.reference, assembly=assembly)
def get_chip_beds_multiple(input_dir):
    """
    Load and sort the ChIP bed files listed in ``<input_dir>/chip.txt``.

    Column 0 of chip.txt holds bed file names (relative to ``input_dir``) and
    column 1 holds the matching TF names.

    Returns a tuple ``(tfs, chip_beds, merged_chip_bed_list)``. The third
    element is a separate list holding the same sorted BedTools as
    ``chip_beds``, one entry per input file.
    """
    chip_info_file = input_dir + '/chip.txt'
    chip_info = np.loadtxt(chip_info_file, dtype=str)
    if len(chip_info.shape) == 1:
        # A one-line file is read as a flat array; restore the row dimension.
        chip_info = np.reshape(chip_info, (-1, len(chip_info)))
    tfs = list(chip_info[:, 1])
    chip_bed_files = [input_dir + '/' + i for i in chip_info[:, 0]]
    chip_beds = [BedTool(chip_bed_file) for chip_bed_file in chip_bed_files]
    print('Sorting BED files')
    chip_beds = [chip_bed.sort() for chip_bed in chip_beds]
    # The former per-item `if 1 > 1: BedTool.cat(*item)` branch could never
    # run, so every sorted BedTool was always passed through unchanged.
    merged_chip_bed_list = list(chip_beds)
    return tfs, chip_beds, merged_chip_bed_list
def get_chip_beds_multiple(input_dir, process_batch):
    """
    Load the ChIP bed files listed in ``<input_dir>/chip.txt``, optionally
    grouping them into batches.

    Column 0 of chip.txt holds bed file names (relative to ``input_dir``) and
    column 1 holds the name used both as TF label and as batch key.

    Returns ``(tfs, chip_beds, merged_chip_bed_list)`` when ``process_batch``
    is false. When it is true, three more items are appended:

    * ``batch_list_all_dict``: batch name -> BedTool with all (unsorted)
      files of that batch concatenated without merging;
    * ``chip_info[:, 1]``: the batch name of every input file;
    * ``exchange_dict``: batch name -> the other batch names. This is a list
      when there are several, the bare name when there is exactly one, and
      an empty list when there is none.
    """
    chip_info_file = input_dir + '/chip.txt'
    chip_info = np.loadtxt(chip_info_file, dtype=str)
    if len(chip_info.shape) == 1:
        # A one-line file is read as a flat array; restore the row dimension.
        chip_info = np.reshape(chip_info, (-1, len(chip_info)))
    tfs = list(chip_info[:, 1])
    chip_bed_files = [input_dir + '/' + i for i in chip_info[:, 0]]
    chip_beds = [BedTool(chip_bed_file) for chip_bed_file in chip_bed_files]

    if process_batch:
        # np.unique guarantees every batch name is visited exactly once.
        batch_name_list = list(np.unique(chip_info[:, 1]))
        batch_list_all_dict = {}
        exchange_dict = {}
        for index, item in enumerate(batch_name_list):
            members = [chip_beds[i]
                       for i in list(np.where(chip_info[:, 1] == item)[0])]
            print('concatenate batch bedfiles for batch %d...' % (index))
            if len(members) > 1:
                batch_list = members[0].cat(*members[1:], postmerge=False)
            else:
                # cat() requires at least one other input, so a batch with a
                # single file is used directly.
                batch_list = members[0]
            batch_list_all_dict[item] = batch_list

            other_batches = [name for name in batch_name_list if name != item]
            if len(other_batches) == 1:
                exchange_dict[item] = other_batches[0]
            else:
                # Several partners -> list. No partner at all (only one batch
                # in the file) -> empty list; this case used to raise
                # IndexError.
                exchange_dict[item] = other_batches
    else:
        print('No need process batch,continue...')

    print('Sorting BED files')
    chip_beds = [chip_bed.sort() for chip_bed in chip_beds]
    # The former per-item `if 1 > 1: BedTool.cat(*item)` branch could never
    # run, so every sorted BedTool was always passed through unchanged.
    merged_chip_bed_list = list(chip_beds)

    if process_batch:
        return (tfs, chip_beds, merged_chip_bed_list, batch_list_all_dict,
                chip_info[:, 1], exchange_dict)
    else:
        return tfs, chip_beds, merged_chip_bed_list
def merge_bed(beds=None):
    '''Concatenate, sort and merge (bedtools) a list of bed files.

    Parameters
    ----------
    beds : list or array
        full paths to bed files (python Path objects from pathlib); each
        entry is converted with ``str`` before use. At least one entry is
        required.

    Returns
    -------
    merged_bed : BedTool object
        resulting merged bed object
    '''
    first_bed, other_beds = beds[0], beds[1:]
    combined = BedTool(str(first_bed))
    for bed_path in other_beds:
        combined = combined.cat(str(bed_path))
    return combined.sort().merge().sort()
def generate_remainder(whole_bed, out_dir, bed_list):
    """
    Calculate the remaining regions that are not included in the truth set

    All files in ``bed_list`` are concatenated, sorted and merged. The part of
    that merged set not covered by the truth regions is moved to
    ``<out_dir>/remainder.bed``.

    :param whole_bed: Path to the truth regions for the whole panel
    :type whole_bed: String
    :param out_dir: Directory that receives ``remainder.bed``
    :type out_dir: String
    :param bed_list: List of all the bed files for that panel
    :type bed_list: List of String
    :return: BEDTool containing any regions that are completely missing from
        the truth regions, or None when a UnicodeDecodeError is raised while
        processing the files
    :rtype: BedTool
    """
    try:
        whole_truth = BedTool(whole_bed)

        whole = BedTool()
        for index, bed in enumerate(bed_list):
            print(bed)
            tool = BedTool(bed)
            # Decide "first file" by position, not by value: comparing
            # against bed_list[0] would throw away everything accumulated so
            # far whenever the first path appears again later in the list.
            if index == 0:
                whole = tool
            else:
                whole = whole.cat(tool)

        # The former bare saveas() calls only wrote throw-away temp copies
        # whose return value was discarded, so they are omitted here.
        whole_merged = whole.sort().merge()

        remainder = whole_merged.subtract(whole_truth)
        remainder.moveto(out_dir + '/remainder.bed')

        # A=True removes a merged region entirely as soon as any part of it
        # overlaps the truth set.
        missing_regions = whole_merged.subtract(whole_truth, A=True)
    except UnicodeDecodeError:
        missing_regions = None
    return missing_regions
def read_bed_regions(bed_files, chroms):
    """
    Merge the intervals of all given bed files and group them by chromosome.

    Args:
        bed_files (list): Of bed-file paths (str); at least one is required.
        chroms (dict): With parents/chromosome id as key and size (bp) as
            value. Intervals on chromosomes missing from it are dropped.

    Returns:
        dict: A dictionary with chromosome/parent id as key and list of
            Region(start, end) (named tuple) object as value.
    """
    first_file, extra_files = bed_files[0], bed_files[1:]
    combined = BedTool(first_file)
    if len(bed_files) > 1:
        combined = combined.cat(*extra_files, postmerge=False)
    merged = combined.sort().merge()

    regions = defaultdict(list)
    for interval in merged:
        if interval.chrom not in chroms:
            continue
        regions[interval.chrom].append(Region(interval.start, interval.end))
    return regions
df['sum'] = df['sum'].astype(float).fillna(0.0) # filter by sum of E-B in the interval df2 = df.loc[df['sum'] > 0,:] # ---- Second: filter by number < 2 tmp = df['score'].str.split(',').tolist() c=[0]*len(tmp) for i,ilist in zip(range(len(tmp)),tmp): try: for j in ilist: if float(j)<= 0 : c[i]+=1 except (TypeError, ValueError): c[i] = 10 df['lt0_count'] = c df3 = df.loc[df['lt0_count'] <=2, :] # ---- Third: union of two selections: x=list(set(df2.index) | set(df3.index)) df_final = df.loc[x,:] bw_final = BedTool.from_dataframe(df_final.iloc[:,:3]) # merge the original seeds and the intervals with <2 E-B peaks = BedTool.cat(seed,bw_final).sort(g=hg19).merge(d=0) peaks.saveas(out)
def _intersection_label_matrix(bins, feature_bedtools, n_jobs,
                               **intersect_kwargs):
    """Label `bins` against every feature bedtool; one column per feature.

    Runs sequentially for ``n_jobs == 1`` and through joblib otherwise.
    ``None`` entries are forwarded to ``bed_intersection_labels`` unchanged.
    """
    if n_jobs == 1:
        labels_list = [
            bed_intersection_labels(bins, bedtool, **intersect_kwargs)
            for bedtool in feature_bedtools]
    else:
        # Worker processes receive file names rather than BedTool objects.
        feature_fnames = [
            bedtool.fn if bedtool is not None else None
            for bedtool in feature_bedtools]
        labels_list = Parallel(n_jobs=n_jobs)(
            delayed(bed_intersection_labels)(
                bins.fn, fname, **intersect_kwargs)
            for fname in feature_fnames)
    return np.concatenate(labels_list, axis=1)


def get_tf_predictive_setup(true_feature_bedtools, region_bedtool=None,
                            ambiguous_feature_bedtools=None,
                            bin_size=200, flank_size=400, stride=50,
                            n_jobs=1, genome='hg19',
                            min_bin_distance_to_chrom_edge=5000,
                            filter_flank_overlaps=False):
    """
    Implements the tf (and general) imputation data setup for a single sample.

    TODOs
    support chrom.sizes file for personal genomes

    Parameters
    ----------
    true_feature_bedtools : list of filenames, BedTools or None items
        None items are treated as missing data.
    region_bedtool : filename or BedTool, optional
        If not set, union of true_feature_bedtools is used.
    ambiguous_feature_bedtools : list of filenames, BedTools or None items, optional
        Must have the same length as true_feature_bedtools.
    bin_size, stride : int
        Passed to ``bin_bed`` when binning the regions.
    flank_size : int, default: 400
        Number of bases added on each side of a bin to form its flanks.
    n_jobs : int, default: 1
        1 runs the intersections sequentially; any other value is handed to
        joblib ``Parallel``.
    genome : str, default: 'hg19'
        Can be any genome name supported by pybedtools.
    min_bin_distance_to_chrom_edge : int, default: 5000
        Passed to ``filter_by_chrom_sizes`` to drop bins near chromosome ends.
    filter_flank_overlaps : bool, default: False
        Labels negative bins whose flanks overlap target regions as ambiguous.

    Returns
    -------
    bins_and_flanks : BedTool
        The retained bins extended by ``flank_size`` on both sides.
    true_labels : array
        One column per entry of true_feature_bedtools.
    """
    # initialize feature bedtools
    true_feature_bedtools = [
        BedTool(bedtool) if bedtool is not None else None
        for bedtool in true_feature_bedtools]
    # sanity checks
    if ambiguous_feature_bedtools is not None:
        assert len(ambiguous_feature_bedtools) == len(true_feature_bedtools)
        ambiguous_feature_bedtools = [
            BedTool(bedtool) if bedtool is not None else None
            for bedtool in ambiguous_feature_bedtools]

    # merge and bin region_bedtools
    if region_bedtool is not None:
        print(region_bedtool)
        region_bedtool = BedTool(region_bedtool).sort()
        print("Made Bedtool")
        region_bedtool = region_bedtool.merge()
    else:
        # use union of true peak bedtools
        bedtools_to_merge = [
            bedtool for bedtool in true_feature_bedtools
            if bedtool is not None]
        if len(bedtools_to_merge) > 1:
            region_bedtool = BedTool.cat(
                *bedtools_to_merge, postmerge=True, force_truncate=True)
        else:
            # cat() needs at least one other input; for a single feature set
            # sorting and merging gives the same union.
            region_bedtool = bedtools_to_merge[0].sort().merge()
    bins = bin_bed(region_bedtool, bin_size=bin_size, stride=stride)

    # throw out bins within min_bin_distance_to_chrom_edge of chromosome edge
    genome_chrom_sizes = getattr(genome_registry, genome)
    bins = bins.each(filter_by_chrom_sizes, genome_chrom_sizes,
                     min_bin_distance_to_chrom_edge)
    # filter bins to chr1-22,X,Y
    chrom_list = ["chr%i" % (i) for i in range(1, 23)]
    chrom_list += ["chrX", "chrY"]
    bins = BedTool(bins).each(filter_interval_by_chrom, chrom_list)
    # save to a temp file so the bins can be re-read by every intersection
    bins = bins.saveas()
    # attach chromosome sizes so slop() below knows the chromosome bounds
    bins = bins.set_chromsizes(genome)

    # intersect bins and true features for true labels
    true_labels = _intersection_label_matrix(
        bins, true_feature_bedtools, n_jobs)

    bins_and_flanks = bins.slop(b=flank_size)
    if filter_flank_overlaps:
        # Intersect bins *with their flanks* for any overlap with true
        # features. This previously intersected the bare bins, so the flanks
        # never influenced the labels.
        flank_labels = _intersection_label_matrix(
            bins_and_flanks, true_feature_bedtools, n_jobs,
            f=10**-9, F=10**-9)
        # we label negative bins with any flank overlap with true features as
        # ambiguous
        true_labels[(true_labels == 0) * (flank_labels == 1)] = AMBIG_LABEL

    if ambiguous_feature_bedtools is not None:
        # intersect bins and ambiguous tfs for ambiguous labels
        ambiguous_labels = _intersection_label_matrix(
            bins, ambiguous_feature_bedtools, n_jobs)
        # we label negative bins that overlap ambiguous feature as ambiguous
        true_labels[(true_labels == 0) * (ambiguous_labels == 1)] = AMBIG_LABEL
    # TODO: do we want to also filter based on any flank overlap with
    # ambiguous features??
    return bins_and_flanks, true_labels
def merge(regulators):
    """Combine a list of regulators into one with BedTool.cat (no merging).

    A list holding a single regulator is answered with that regulator
    itself, without calling BedTool.cat.
    """
    if len(regulators) <= 1:
        return regulators[0]
    return BedTool.cat(*regulators, postmerge=False)
###
#   name      | mary lauren benton
#   conda_env | enh_gain-loss
#   created   | 2019.04.04
#
#   this script will process the files downloaded in accessions.txt and merge
#   chip-seq from the same tissues
#   sorted and merged files saved to ./ctcf/encode_tissues/
###

from collections import defaultdict
from pybedtools import BedTool

# tissue name -> accession ids (bed file names without the '.bed' suffix)
name_dict = defaultdict(list)

with open('accessions_mapped_filenames.txt') as infile:
    next(infile)  # skip the header line
    for line in infile:
        columns = line.strip().split('\t')
        accession = columns[0]
        # normalise the tissue label: spaces -> '_', drop quotes, lower-case
        tissue_label = columns[1].replace(' ', '_').replace('\'', '').lower()
        name_dict[tissue_label].append(accession)

# write one sorted, merged bed file per tissue
for tissue_label, accessions in name_dict.items():
    combined = BedTool(accessions[0] + '.bed')
    for accession in accessions[1:]:
        combined = combined.cat(BedTool(accession + '.bed'), postmerge=True)
    combined.sort().merge().saveas(tissue_label + '.bed')
def make_features_multiTask(positive_windows, y_positive, nonnegative_regions_bed, 
                            bigwig_files, bigwig_names, genome, epochs, valid_chroms, test_chroms):
    """Build training, validation and test data iterators for multi-task data.

    Positive windows are split by chromosome into test (``test_chroms``),
    validation (``valid_chroms``) and training (everything else). Negative
    windows are drawn with ``BedTool.shuffle`` inside the matching chromosome
    subset and outside ``nonnegative_regions_bed``. Training data is built
    epoch by epoch: each epoch holds all positive training records plus an
    equally sized slice of the negative training records, shuffled together.

    Relies on names defined outside this function: ``get_genome_bed``,
    ``subset_chroms``, ``shift_size``, ``genome_sizes_file``, ``batch_size``,
    ``L``, ``get_bigwig_rc_order`` and ``DataIterator``.

    Returns ``(datagen_train, datagen_valid, datagen_test, data_test)``.
    """
    chroms, chroms_sizes, genome_bed = get_genome_bed()
    # NOTE: train_chroms is the same list object as chroms, so the removals
    # below also shrink chroms.
    train_chroms = chroms
    for chrom in valid_chroms + test_chroms:
        train_chroms.remove(chrom)
    genome_bed_train, genome_bed_valid, genome_bed_test = \
        [subset_chroms(chroms_set, genome_bed) for chroms_set in
         (train_chroms, valid_chroms, test_chroms)]

    positive_windows_train = []
    positive_windows_valid = []
    positive_windows_test = []
    positive_data_train = []
    positive_data_valid = []
    positive_data_test = []
    
    import pdb
    print 'Splitting positive windows into training, validation, and testing sets'
    for positive_window, target_array in itertools.izip(positive_windows, y_positive):
        # NOTE(review): leftover debugging hook - drops into the debugger for
        # any chromosome name longer than 8 characters.
        if len(positive_window.chrom) > 8:
            pdb.set_trace()
        chrom = positive_window.chrom
        start = int(positive_window.start)
        stop = int(positive_window.stop)
        # Each record is (chrom, start, stop, shift_size, bigwig_files, [],
        # targets).
        if chrom in test_chroms:
            positive_windows_test.append(positive_window)
            positive_data_test.append((chrom, start, stop, shift_size, bigwig_files, [], target_array))
        elif chrom in valid_chroms:
            positive_windows_valid.append(positive_window)
            positive_data_valid.append((chrom, start, stop, shift_size, bigwig_files, [], target_array))
        else:
            positive_windows_train.append(positive_window)
            positive_data_train.append((chrom, start, stop, shift_size, bigwig_files, [], target_array))
    
    positive_windows_train = BedTool(positive_windows_train)
    positive_windows_valid = BedTool(positive_windows_valid)
    positive_windows_test = BedTool(positive_windows_test)

    import pdb
    print 'Getting negative training examples'
    # The template for the training negatives is the full `positive_windows`
    # input (not only the training split), repeated `epochs` times without
    # merging.
    # NOTE(review): with epochs == 1 cat() receives no second input - confirm
    # that this case is supported by the pybedtools version in use.
    negative_windows_train = BedTool.cat(*(epochs*[positive_windows]), postmerge=False)
    #negative_windows_train = BedTool.cat(*(10*[positive_windows]), postmerge=False)
    #pdb.set_trace()
    # NOTE(review): the lower seed bound -214783648 looks like a typo for
    # -2147483648 - confirm.
    negative_windows_train = negative_windows_train.shuffle(g=genome_sizes_file,
                                                            incl=genome_bed_train.fn,
                                                            excl=nonnegative_regions_bed.fn,
                                                            noOverlapping=False,
                                                            seed=np.random.randint(-214783648, 2147483647))
                                                            #seed=np.random.randint(-21478364, 21474836))
    print 'Getting negative validation examples'
    negative_windows_valid = positive_windows_valid.shuffle(g=genome_sizes_file,
                                                            incl=genome_bed_valid.fn,
                                                            excl=nonnegative_regions_bed.fn,
                                                            noOverlapping=False,
                                                            seed=np.random.randint(-214783648, 2147483647))
                                                            #seed=np.random.randint(-21478364, 21474836))
    print 'Getting negative testing examples'
    negative_windows_test = positive_windows_test.shuffle(g=genome_sizes_file,
                                                          incl=genome_bed_test.fn,
                                                          excl=nonnegative_regions_bed.fn,
                                                          noOverlapping=False,
                                                          seed=np.random.randint(-214783648, 2147483647))
                                                          #seed=np.random.randint(-21478364, 21474836))

    # Train
    print 'Extracting data from negative training BEDs'
    # All negative records share one all-zero target vector with one entry
    # per column of y_positive.
    negative_targets = np.zeros(y_positive.shape[1])
    negative_data_train = [(window.chrom, window.start, window.stop, shift_size, bigwig_files, [], negative_targets)
                           for window in negative_windows_train]

    # Validation
    print 'Extracting data from negative validation BEDs'
    negative_data_valid = [(window.chrom, window.start, window.stop, shift_size, bigwig_files, [], negative_targets)
                           for window in negative_windows_valid]
    
    # Test
    print 'Extracting data from negative testing BEDs'
    negative_data_test = [(window.chrom, window.start, window.stop, shift_size, bigwig_files, [], negative_targets)
                          for window in negative_windows_test]

    num_positive_train_windows = len(positive_data_train)
    
    data_valid = negative_data_valid + positive_data_valid
    data_test = negative_data_test + positive_data_test

    print 'Shuffling training data'
    data_train = []
    for i in xrange(epochs):
        # Epoch i pairs every positive training record with the i-th slice of
        # negatives (slice length = number of positive training records).
        epoch_data = []
        epoch_data.extend(positive_data_train)
        epoch_data.extend(negative_data_train[i*num_positive_train_windows:(i+1)*num_positive_train_windows])
        np.random.shuffle(epoch_data)
        data_train.extend(epoch_data)

    print 'Generating data iterators'
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_train = DataIterator(data_train, genome, batch_size, L, bigwig_rc_order)
    datagen_valid = DataIterator(data_valid, genome, batch_size, L, bigwig_rc_order)
    datagen_test = DataIterator(data_test, genome, batch_size, L, bigwig_rc_order)

    print len(datagen_train), 'training samples'
    print len(datagen_valid), 'validation samples'
    print len(datagen_test), 'test samples'
    return datagen_train, datagen_valid, datagen_test, data_test
def nonnegative_wrapper(a, bl_file):
    """
    Return the file name of the blacklist combined with the padded windows.

    ``a`` is extended on both sides by the module-level
    ``genome_window_size`` (bounded by ``genome_sizes_file``) and then
    concatenated with the regions of ``bl_file`` through ``BedTool.cat``.
    """
    blacklist = BedTool(bl_file)
    padded_windows = a.slop(b=genome_window_size, g=genome_sizes_file)
    combined = blacklist.cat(padded_windows)
    return combined.fn
def main(argv): # DEFINIR date # ============ start = time.time() # parce arguments # =============== usage = "compare_bed.py -f <bed file 1> -F <bed file 2> -c <index chr1> -C <index chr2> -s <index start 1> -S <index start 2> -e <index end 1> -E <index end 2> -v <index value1> -V <index value2> -o <output path> -n <name of first> -N <name of second>" parser = OptionParser(usage=usage) parser.add_option("-f", "--file1", type="string", metavar="<file>", dest="f1", help="one bed file") parser.add_option("-F", "--File2", type="string", dest="f2", help="one bed file") parser.add_option("-c", "--chr", type="int", dest="chr1", help="index of chromosome for file1", default=0) parser.add_option("-C", "--CHR", type="int", dest="chr2", help="index of chromosome", default=0) parser.add_option("-s", "--start", type="int", dest="st1", help="index of start for file1", default=1) parser.add_option("-S", "--START", type="int", dest="st2", help="index of start for file2", default=1) parser.add_option("-e", "--end", type="int", dest="sp1", help="index of stop for file1", default=2) parser.add_option("-E", "--END", type="int", dest="sp2", help="index of stop for file2", default=2) parser.add_option("-v", "--value", type="int", dest="sc1", help="index of score for file1", default=4) parser.add_option("-V", "--VALUE", type="int", dest="sc2", help="index of score for file2", default=4) parser.add_option("-o", "--output", type="string", dest="output", help="prefixe of output") parser.add_option("-n", "--name1", type="string", dest="name_d1", help="name of d1") parser.add_option("-N", "--NAME2", type="string", dest="name_d2", help="name of d2") (opt, args) = parser.parse_args(argv) # check if len(argv) < 2: print HELP parser.print_help() sys.exit(1) # create log file saveout = sys.stdout fsock = open( '%s_%s.vs.%s_compare_bed.log' % (opt.output, opt.name_d1, opt.name_d2), 'w') sys.stdout = sys.stderr = fsock # tracability print "[LOG] command: ", " ".join(argv) # load data d1 = 
BedTool(opt.f1) d2 = BedTool(opt.f2) # format d1 = d1.sort() d2 = d2.sort() # record sorted bed f1_sorted = (opt.f1).replace(".bed", "sorted.bed") d1.saveas(f1_sorted) f2_sorted = (opt.f2).replace(".bed", "sorted.bed") d2.saveas(f2_sorted) # intersect between d1 and d2 i.e. if d1 intersect 2 differentes regions in d2 the line is duplicat d_intersect = d1.intersect(d2, wo=True, sorted=True) d_intersect.saveas(opt.output + "_intersectbed.tsv") # which d1 are not overlapping with d2 d1_specifique = d1.intersect(d2, v=True, sorted=True) d1_specifique.saveas(opt.output + "_specific_%s.tsv" % (opt.name_d1)) # which d2 are not overlapping with d1 d2_specifique = d2.intersect(d1, v=True, sorted=True) d2_specifique.saveas(opt.output + "_specific_%s.tsv" % (opt.name_d2)) # which d1 intersect d2 and how many time (last column) d1_intersect = (d1.intersect( d2, c=True, sorted=True)).filter(lambda x: int(x.fields[-1]) > 0).sort() d1_intersect.saveas(opt.output + "_intersect_%s.tsv" % (opt.name_d1)) # which d2 intersect d1 and how many time (last column) d2_intersect = (d2.intersect( d1, c=True, sorted=True)).filter(lambda x: int(x.fields[-1]) > 0).sort() d2_intersect.saveas(opt.output + "_intersect_%s.tsv" % (opt.name_d2)) # merge d1 and 2 try: # don't work on my computer d_union = d1.cat(d2, c=4, delim="|", o="collapse") except: print "[ERROR] bedtools merge must be v2.25.0 or more recent, miss some options in bedtools merge" print "[LOG] file _mergebed.tsv generated without value column" d_union = d1.cat(d2) d_union.saveas(opt.output + "_mergebed.tsv") # Jaccard #print d1.jaccard(d2) print "[LOG] jaccard test between bed file:" jaccard_res = commands.getstatusoutput("bedtools jaccard -a %s -b %s " % (f1_sorted, f2_sorted)) if jaccard_res[0] == 0 and len(jaccard_res) > 1: jaccard_res = [line.split("\t") for line in jaccard_res[1].split("\n")] print pandas.DataFrame(jaccard_res) else: print "[ERROR] Jaccard can't compute" print jaccard_res # represent score by area in venn 
score1_spe = [float(I.fields[opt.sc1]) for I in d1_specifique] score2_spe = [float(I.fields[opt.sc2]) for I in d2_specifique] score1_int = [float(I.fields[opt.sc1]) for I in d1_intersect] score2_int = [float(I.fields[opt.sc1]) for I in d2_intersect] data = [score1_spe, score1_int, score2_int, score2_spe] from difflib import SequenceMatcher match = SequenceMatcher(None, opt.name_d1, opt.name_d2).find_longest_match( 0, len(opt.name_d1), 0, len(opt.name_d2)) longeststring = opt.name_d1[match.a:match.a + match.size] xname = [ opt.name_d1 + "_spe".replace(longeststring, ""), opt.name_d1 + "_int".replace(longeststring, ""), opt.name_d2 + "_int".replace(longeststring, ""), opt.name_d2 + "_spe".replace(longeststring, "") ] color = ["grey", "lightblue", "lightblue", "blue"] if flag_graph: graph.boxplot(list_of_list=data, name_out=opt.output + "_boxplotscore.png", xlab="", ylab="", title="", xname=xname, color=color) # represent distribution of number bp overlapping nb_overlap = [int(I.fields[-1]) for I in d_intersect] if flag_graph: graph.hist(x=nb_overlap, label_x="overlap %s vs %s" % (opt.name_d1, opt.name_d2), name_out=opt.output + "_nbOverlap.png", cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50) # represent venn graph.my_venn2(a_specific=len(d1_specifique), b_specific=len(d2_specifique), nb_intersect=len(d1_intersect), a_label=opt.name_d1, b_label=opt.name_d2, main="total %s:%i, total %s:%i" % (opt.name_d1, len(d1), opt.name_d2, len(d2)), name_out=opt.output + "_venn_%s.png" % (opt.name_d1)) graph.my_venn2(a_specific=len(d1_specifique), b_specific=len(d2_specifique), nb_intersect=len(d2_intersect), a_label=opt.name_d1, b_label=opt.name_d2, main="total %s:%i, total %s:%i" % (opt.name_d1, len(d1), opt.name_d2, len(d2)), name_out=opt.output + "_venn_%s.png" % (opt.name_d2)) # representation pie graph.pie_fast( absolute_values=[len(d1) - len(d1_intersect), len(d1_intersect)], labels=["specific", "intersect"], main="venn %s vs %s" % (opt.name_d1, 
opt.name_d2), explode=None, name_out=opt.output + "_pie_%s.png" % (opt.name_d1), colors=["green", "red"]) graph.pie_fast( absolute_values=[len(d2) - len(d2_intersect), len(d2_intersect)], labels=["specific", "intersect"], main="venn %s vs %s" % (opt.name_d2, opt.name_d1), explode=None, name_out=opt.output + "_pie_%s.png" % (opt.name_d2), colors=["green", "red"]) # represent repartition of score for each score_d1 = [float(I.fields[opt.sc1]) for I in d1] if flag_graph: graph.hist(x=score_d1, label_x=opt.name_d1, name_out=opt.output + "_hist-allscore_%s.png" % (opt.name_d1), cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50) score_d2 = [float(I.fields[opt.sc2]) for I in d2] graph.hist(x=score_d2, label_x=opt.name_d2, name_out=opt.output + "_hist-allscore_%s.png" % (opt.name_d2), cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50) # represent size of region size_d1 = [int(I.fields[opt.sp1]) - int(I.fields[opt.st1]) for I in d1] size_d2 = [int(I.fields[opt.sp2]) - int(I.fields[opt.st2]) for I in d2] if flag_graph: graph.hist(x=size_d1, label_x=opt.name_d1, name_out=opt.output + "_hist-size_%s.png" % (opt.name_d1), cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50) graph.hist(x=size_d2, label_x=opt.name_d2, name_out=opt.output + "_hist-size_%s.png" % (opt.name_d2), cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50) ## represent specifique #score_d1_spe=[ float(I.fields[opt.sc1]) for I in d1_specifique ] #graph.hist(x=score_d1_spe, label_x=opt.name_d1, # name_out=opt.output+"_d1-hist-spescored1.png", cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50) #score_d2_spe=[ float(I.fields[opt.sc2]) for I in d2_specifique ] #graph.hist(x=score_d2_spe, label_x=opt.name_d2, # name_out=opt.output+"_d2-hist-spescored2.png", cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50) ## represent intersect #score_d1_int=[ float(I.fields[opt.sc1]) for I in d1_intersect ] 
#graph.hist(x=score_d1_int, label_x=opt.name_d1, # name_out=opt.output+"_d1-hist-intscored1.png", cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50) #score_d2_int=[ float(I.fields[opt.sc2]) for I in d2_intersect ] #graph.hist(x=score_d2_int, label_x=opt.name_d2, # name_out=opt.output+"_d2-hist-intscored2.png", cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50) # extract information # - number of column in d1 to get first value of d2 in d_intersect nb_col1 = len(d1[0].fields) # - number of column in d2 to get overlap value in d_intersect nb_col2 = len(d2[0].fields) # - extract values from d_intersect dico_d1 = { } # to get how many d2 in one d1, and median in score d2 vs ths score of d1 dico_d2 = {} # to get how many d1 in one d2 dico_d1andd2 = {} # to get for each pair, score d1 and score d2 # - scan intersection for I in d_intersect: I = I.fields # build key for dico try: k1 = "%s:%i-%i" % (I[opt.chr1], int(I[opt.st1]), int(I[opt.sp1])) k2 = "%s:%i-%i" % (I[opt.chr2 + nb_col1], int( I[opt.st2 + nb_col1]), int(I[opt.sp2 + nb_col1])) except: print "[ERROR] ", I print opt.chr1, opt.st1, opt.sp1, opt.chr2 + nb_col1, opt.st2 + nb_col1, opt.sp2 + nb_col1 print I[opt.chr1], I[opt.st1], I[opt.sp1], I[ opt.chr2 + nb_col1], I[opt.st2 + nb_col1], I[opt.sp2 + nb_col1] else: k12 = k1 + "VS" + k2 # extract for dico_d1 if not dico_d1.has_key(k1): dico_d1[k1] = {"nb_d2": 0, "score_d2": [], "score_d1": 0} else: print "[WARNING] several key for d1:", k1 dico_d1[k1]["nb_d2"] = dico_d1[k1]["nb_d2"] + 1 dico_d1[k1]["score_d2"].append(float(I[opt.sc2 + nb_col1])) dico_d1[k1]["score_d1"] = I[opt.sc1] # extract for dico_d2 if not dico_d2.has_key(k1): dico_d2[k2] = {"nb_d1": 0, "score_d1": [], "score_d2": 0} else: print "[WARNING] several key for d2:", k2 dico_d2[k2]["nb_d2"] = dico_d2[k2]["nb_d1"] + 1 dico_d2[k2]["score_d1"].append(float(I[opt.sc1])) dico_d2[k2]["score_d2"] = I[opt.sc2 + nb_col1] ## - scan specifique d1 #for I in d1_specifique: # 
I=I.fields # # build key for dico # k1="%s:%i-%i" %(I[opt.chr1], int(I[opt.st1]),int(I[opt.sp1])) # if not dico_d1.has_key(k1): # dico_d1[k1]={"nb_d2":0,"score_d2":[], "score_d1":0} # else: # print "[WARNING] several key for d1:",k1 # dico_d1[k1]["score_d1"]=I[opt.sc1] ## - scan specifique d2 #for I in d2_specifique: # I=I.fields # # build key for dico # k2="%s:%i-%i" %(I[opt.chr2], int(I[opt.st2]),int(I[opt.sp2])) # if not dico_d2.has_key(k2): # dico_d2[k2]={"nb_d1":0,"score_d1":[], "score_d2":0} # else: # print "[WARNING] several key for d2:",k2 # dico_d2[k2]["score_d2"]=I[opt.sc2] # - format dico_d1 x = [] y = [] z = [] for k in dico_d1.keys(): x.append(float(dico_d1[k]["score_d1"])) if len(dico_d1[k]["score_d2"]) > 0: y.append(float(numpy.max(dico_d1[k]["score_d2"]))) else: y.append(-.09) z.append(int(dico_d1[k]["nb_d2"])) # build graph if flag_graph: graph.scatter_hist(x, y, xlabel="score %s" % (opt.name_d1), ylabel="score %s" % (opt.name_d2), main="", marker="o", color="black", alpha=0.5, size_mark=20, name_out=opt.output + "_scatterhist_%s.png" % (opt.name_d1)) # - format dico_d2 x = [] y = [] z = [] for k in dico_d2.keys(): x.append(float(dico_d2[k]["score_d2"])) if len(dico_d2[k]["score_d1"]) > 0: y.append(float(numpy.median(dico_d2[k]["score_d1"]))) else: y.append(-999.0) z.append(int(dico_d2[k]["nb_d1"])) if flag_graph: graph.scatter_hist(x, y, xlabel="score %s" % (opt.name_d2), ylabel="score %s" % (opt.name_d1), main="", marker="o", color="black", alpha=0.5, size_mark=20, name_out=opt.output + "_scatterhist_%s.png" % (opt.name_d2)) # close log sys.stdout = saveout fsock.close()
def main():
    """Aggregate DNA breaks in sliding genome windows and annotate them.

    Command-line entry point.  The steps are:

    1. Tile the genome with sliding windows (``window_maker``) and build one
       copy of the windows per strand (via the ``strand`` helper).
    2. For every window, count the records of the input BED file that
       overlap it on the same strand (``intersect`` with ``c`` and ``s``).
    3. Attach to every window the distinct values of column 4 of the
       overlapping annotation records (``map`` with ``o="distinct"``).
    4. Pass the table through ``splitDataFrameList`` on the ``name`` column
       (presumably one row per comma-separated name -- confirm against the
       helper), drop rows whose name is ``"."`` and write the result as a
       tab-separated file with a header row.

    Positional arguments: ``genome``, ``input``, ``annotations``, ``output``.
    Options: ``-w/--window-size`` (default 100000), ``-s/--window-step``
    (default 10000) and ``-f/--features``.

    Raises:
        SystemExit: raised by argparse for invalid arguments, including an
            annotation path that is neither gtf/gff(.gz) nor bed.
    """
    parser = argparse.ArgumentParser(
        description='Use a sliding window to aggregate breaks in bed file')
    parser.add_argument('genome',
                        help='Name of the model used to produce input')
    parser.add_argument('input', help='Input .bed file with detected breaks')
    parser.add_argument(
        'annotations',
        help=
        'Annotation file. If annotation file has gtf or gff extention (possibly .gz) then only transcripts are selected. If .bed file is provided then all annotations from bed file are used'
    )
    parser.add_argument('output',
                        help='Output .bed file with longest transcripts')
    # The short and long spellings must be separate option strings.  The
    # former single string (e.g. '-w|--window-size') registered one literal
    # flag, so the long forms were rejected as unrecognized arguments.
    parser.add_argument('-w', '--window-size',
                        dest="window_size",
                        default=int(1e5),
                        type=int,
                        help='Window at which to agregate breaks number')
    parser.add_argument('-s', '--window-step',
                        dest="window_step",
                        default=int(1e4),
                        type=int,
                        help='Step after each window')
    # NOTE: --features is accepted for command-line compatibility but its
    # value is not used anywhere in this function.
    parser.add_argument('-f', '--features',
                        dest="features",
                        action="append",
                        nargs="*",
                        help='Additional features to annotate input file')
    args = parser.parse_args()

    # Validate the annotation format up front so that a bad argument fails
    # before any directory or temporary file is created.
    is_gff = re.search(r"\.(gtf|gff)(\.gz)?$", args.annotations)
    is_bed = re.search(r"\.bed$", args.annotations)
    if not (is_gff or is_bed):
        parser.error(
            "Annotation have to be either in gtf/gff or in bed format")

    start = time.time()

    # dirname is '' when the output is a bare file name; os.makedirs('')
    # would raise, so only create a directory when there is one to create.
    output_dir = os.path.dirname(args.output)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print(
        'Processing "{input}" using annotation="{annotation}" window {window}/{step}. Writing output to "{output}"...'
        .format(input=args.input,
                window=args.window_size,
                step=args.window_step,
                output=args.output,
                annotation=args.annotations))

    tmp = {}
    try:
        # Create temporary files.  The handles are closed immediately: only
        # the paths are needed, and pybedtools reopens them by name.
        for key in [
                "genome_bin_pos", "genome_bin_neg", "genome_bin",
                "breaks_bin", "results", "all_transcripts", "transcripts"
        ]:
            handle = tempfile.NamedTemporaryFile(delete=False)
            handle.close()
            tmp[key] = handle.name

        # Create windows template for sliding window, one copy per strand
        genome_bin_pos = BedTool().window_maker(
            genome=args.genome, w=args.window_size,
            s=args.window_step).each(strand,
                                     "+").saveas(tmp["genome_bin_pos"])
        genome_bin_neg = BedTool().window_maker(
            genome=args.genome, w=args.window_size,
            s=args.window_step).each(strand,
                                     "-").saveas(tmp["genome_bin_neg"])
        genome_bin = genome_bin_pos.cat(
            genome_bin_neg, postmerge=False).sort().saveas(tmp["genome_bin"])

        # Read input file
        dna_breaks = BedTool(args.input)

        # Read annotation file
        annotations = BedTool(args.annotations)
        if is_gff:
            # Keep transcripts only, convert them to BED named by gene_id,
            # then collapse records sharing chrom/start/end/strand.
            annotations = annotations.filter(filter_transcript).\
                each(gff2bed, name_field="gene_id").sort().\
                saveas(tmp["all_transcripts"]).\
                groupby(g="1,2,3,6", c="4,5", o="distinct").\
                cut([0, 1, 2, 4, 5, 3]).\
                saveas(tmp["transcripts"])

        # Count same-strand breaks per window
        bin_breaks = BedTool().intersect(a=genome_bin, b=dna_breaks,
                                         wa=True, c=True, s=True).\
            saveas(tmp["breaks_bin"])

        # Map breaks statistics to annotation file
        results = BedTool().map(a=bin_breaks,
                                b=annotations,
                                c="4",
                                o="distinct").cut(
                                    [0, 1, 2, 7, 6,
                                     5]).sort().saveas(tmp["results"])

        results_df = splitDataFrameList(results.to_dataframe(), "name", ",")
        results_df = results_df[results_df.name != "."]
        results_df.to_csv(args.output, sep="\t", header=True, index=False)
    finally:
        # Remove temporary files even when a step above failed
        for path in tmp.values():
            if os.path.exists(path):
                os.remove(path)

    end = time.time()
    print("Total time: {:.1f} minutes".format((end - start) / 60))
sorting.wait() return(0) def newber(binner,factor): """Return a BedTool built from the bed file `binner`: its intervals are sorted and merged (merge is called with nms=True), each merged interval is passed through featurefuncs.midpoint, and each result is then passed through featurefuncs.rename with `factor`.""" newbie = BedTool(binner).sort().merge(nms=True).each(featurefuncs.midpoint) newbie = newbie.each(featurefuncs.rename,factor) return(newbie) print "Loading bed files..." tssbed = BedTool(tsss) jacker = BedTool(jacked) chiper = BedTool(chiped) print "Combining binding files..." comber = chiper.cat(jacker,force_truncate=False,postmerge=False).moveto('./combtemp.bed') #print "Generating oldstyle bed file..." #oldstyle = comber.each(featurefuncs.midpoint) #oldstyle2 = bedmaker(oldstyle,tssbed,oldout) print "Generating newstyle bed files..." for factor in sorted(pwmlist.keys()): print factor for model in pwmlist[factor]: print model grepper = 'grep ' + model + ' ./combtemp.bed >> ./temp1.bed' grepping = subprocess.Popen(grepper,shell=True) grepping.wait() newbie = newber('./temp1.bed',factor) newbie2 = bedmaker(newbie,tssbed,outdir + factor + '_midpoint_sorted.bed',unmatched=False)