def __init__(self):
    # Read CTCF peaks.
    self.peaks = pandas.read_table(FLAGS.data_path, header=None)
    self.peaks.columns = 'chr start end name score expCount expNums expScores'.split()
    self.peaks = self.peaks[self.peaks.name.isin(['CTCF'])]
    print(self.peaks.head())
    widths = self.peaks.end - self.peaks.start
    # Only one of the peaks is actually not 150 wide.
    print('Getting genome.')
    self.genome = ucscgenome.Genome('/home/kal/.ucscgenome/hg19.2bit')
    # positives - center of each peak with 256 (default) bp window
    # negatives - positives shifted left/right by 1024 (masked to not overlap any positives)
    # negatives - shuffled positive sequences
    # for testing - hold out chr8
    prediction_window = FLAGS.input_window
    self.half_window = prediction_window // 2
    self.negative_shift = prediction_window * 4
    self.num_training_examples = sum(self.peaks.chr != 'chr8')
    print('Number of training examples: ' + str(self.num_training_examples))
    # Build interval trees over the peaks so that our negatives (shifted positives)
    # are guaranteed to be true negatives.
    print('Building interval tree.')
    self.peak_intervals = {chr: IntervalTree() for chr in self.peaks.chr.unique()}
    for chr in self.peaks.chr.unique():
        # Sentinel interval just past the end of each chromosome.
        self.peak_intervals[chr][len(self.genome[chr]):len(self.genome[chr]) + 1] = 1
    for idx, row in tqdm(self.peaks.iterrows()):
        self.peak_intervals[row.chr][(row.start - self.half_window):(row.end + self.half_window)] = 1
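# A minimal sketch of how the interval trees above could be queried when drawing
# a shifted negative. `sample_negative` is a hypothetical helper, not part of the
# original class; it assumes the intervaltree package's slice-query semantics
# (an empty result set means no overlap).
def sample_negative(self, row):
    """Return (chr, center) for a shifted negative, or None if both shifts overlap a peak."""
    center = (row.start + row.end) // 2
    for shift in (self.negative_shift, -self.negative_shift):
        candidate = center + shift
        hits = self.peak_intervals[row.chr][candidate - self.half_window:candidate + self.half_window]
        if not hits:  # no overlap with any padded peak or the end-of-chromosome sentinel
            return row.chr, candidate
    return None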
def predict_snv(self, peaks, genome=None, act=False):
    """Predict from a bed file with chr, position, refAllele, altAllele.

    Arguments:
        peaks -- the bed file in pd table form.
    Keywords:
        genome -- default is hg19.
    Outputs:
        refpreds -- predictions for each row with the reference allele.
        altpreds -- predictions for each row with the alternate allele.
    """
    # Get the genome and bed file regions.
    if genome is None:
        genome = ucscgenome.Genome('/home/kal/.ucscgenome/hg19.2bit')
    # Predict over the rows with the reference allele.
    refpreds = list()
    batchgen = train_TFmodel.filled_batch(snv_gen(peaks, genome, alt=False))
    for batch in batchgen:
        if act:
            refpreds.append(self.get_act([batch, 0]))
        else:
            refpreds.append(self.model.predict_on_batch(batch))
    refpreds = np.asarray(refpreds).flatten()[:len(peaks)]
    # And again with the alternate allele.
    altpreds = list()
    batchgen = train_TFmodel.filled_batch(snv_gen(peaks, genome, alt=True))
    for batch in batchgen:
        if act:
            altpreds.append(self.get_act([batch, 0]))
        else:
            altpreds.append(self.model.predict_on_batch(batch))
    altpreds = np.asarray(altpreds).flatten()[:len(peaks)]
    return refpreds, altpreds
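# A hedged usage sketch: score a table of SNVs and rank them by the change in
# predicted signal. The file path, column names, and the `model` instance are
# assumptions based on the docstring above, not taken from the original code.
snvs = pd.read_table('snvs.bed', header=None,
                     names='chr position refAllele altAllele'.split())
refpreds, altpreds = model.predict_snv(snvs)
snvs['delta'] = altpreds - refpreds  # positive delta: alt allele increases predicted binding
print(snvs.sort_values('delta', ascending=False).head())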
def predict_bed(self, peaks, genome=None):
    """Predict from a bed file.

    Arguments:
        peaks -- the bed file regions in pd table form.
    Keywords:
        genome -- default is hg19.
    Outputs:
        preds -- predictions for each row.
    """
    # Get the genome and bed file regions.
    if genome is None:
        genome = ucscgenome.Genome('/home/kal/.ucscgenome/hg19.2bit')
    # Predict over the rows, keeping the localized prediction for each region.
    preds = list()
    for index, row in tqdm(peaks.iterrows()):
        tile, pred = self.localize(row, genome)
        preds.append(pred)
    return np.asarray(preds).flatten()
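# `localize` is called above but not shown here. A minimal sketch of one
# plausible implementation, assuming it tiles the row's region with
# input-sized windows and keeps the best-scoring tile; `self.input_window`,
# the quarter-window stride, and the `seq_to_onehot` encoder are all
# hypothetical, not from the original code.
def localize(self, row, genome):
    best_tile, best_pred = None, -np.inf
    for start in range(row.start, row.end - self.input_window + 1, self.input_window // 4):
        seq = genome[row.chr][start:start + self.input_window]
        pred = float(self.model.predict_on_batch(seq_to_onehot(seq)[np.newaxis]))
        if pred > best_pred:
            best_tile, best_pred = (row.chr, start), pred
    return best_tile, best_pred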
def __init__(self, regions='/home/kal/data/chipseq_ctcf_peaks.bed', cellline_CTCFs=None,
             cellline_DHSs=None, keys=None, input_window=FLAGS.input_window,
             batch_size=FLAGS.batch_size, column_names='chr start end . . . . .'):
    """Create a generator for CTCF/DHS data.

    Arguments:
        regions -- bed file with training points to use.
        cellline_CTCFs -- list of CTCF peak files, one for each cell line to use.
        cellline_DHSs -- list of files with DHS reads, one for each cell line to use,
                         in an order matching cellline_CTCFs.
    """
    # Read in the regions.
    self.bed = pd.read_table(regions, header=None)
    self.bed.columns = column_names.split()
    # Set up a reference genome.
    self.genome = ucscgenome.Genome('/home/kal/.ucscgenome/hg19.2bit')
    # Define some values.
    self.batch_size = batch_size
    self.input_window = input_window
    self.half_window = input_window // 2
    # Set up the per-cell-line alignment files.
    if cellline_CTCFs is None:
        cellline_CTCFs = ['/home/kal/CTCF/ATAC_CTCF/data/K562/wgEncodeBroadHistoneK562CtcfStdAlnRep1.bam',
                          '/home/kal/CTCF/ATAC_CTCF/data/HUVEC/wgEncodeBroadHistoneHuvecCtcfStdAlnRep1.bam',
                          '/home/kal/CTCF/ATAC_CTCF/data/HCT116/CTCF_untreated.reheader.bam']
    if cellline_DHSs is None:
        cellline_DHSs = ['/home/kal/CTCF/ATAC_CTCF/data/K562/CombinedScreens.unique_alignment.bam',
                         '/home/kal/CTCF/ATAC_CTCF/data/HUVEC/wgEncodeUwDnaseHuvecAlnRep2.bam',
                         '/home/kal/CTCF/ATAC_CTCF/data/HCT116/wgEncodeUwDnaseHct116AlnRep1.bam']
    self.CTCF_dict = dict()
    self.DHS_dict = dict()
    self.cov_dict = dict()
    if keys is None:
        keys = range(len(cellline_CTCFs))
    self.keys = keys
    for key, CTCF, DHS in zip(self.keys, cellline_CTCFs, cellline_DHSs):
        self.CTCF_dict[key] = pysam.AlignmentFile(CTCF, 'rb')
        self.DHS_dict[key] = pysam.AlignmentFile(DHS, 'rb')
        # Approximate genome-wide read coverage for this cell line's DHS data.
        total_bases = sum([len(self.genome[chr]) for chr in self.DHS_dict[key].references])
        covered_bases = self.DHS_dict[key].count()
        self.cov_dict[key] = covered_bases / total_bases
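# A minimal sketch of how cov_dict could be used downstream to normalize DHS
# read counts in a window, so that cell lines with different sequencing depths
# are comparable. `window_dhs_signal` is a hypothetical helper, not part of
# the original class; it relies only on pysam's region count() call.
def window_dhs_signal(self, key, chr, center):
    start, end = center - self.half_window, center + self.half_window
    reads = self.DHS_dict[key].count(chr, start, end)
    return reads / (self.cov_dict[key] * (end - start))  # depth-normalized signal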
    TF -- the transcription factor to filter for.
    example_limit -- the minimum number of examples to bother with.
    scrambled -- the size of the k-mers to consider independent units when scrambling.
    shifts -- use shifted samples?
    score_columns -- which columns to use as the score.
    """
    # Read TF peaks.
    full = pd.read_table(bed_path, header=None)
    if columns is None:
        columns = 'chr start end name score expCount expNums expScores'
    full.columns = columns.split()
    peaks = full[full.name.isin([TF])]
    genome = ucscgenome.Genome('/home/kal/.ucscgenome/hg19.2bit')
    # positives - center of each peak with 256 (default) bp window
    # negatives - positives shifted left/right by 1024 (masked to not overlap any positives)
    # negatives - shuffled positive sequences
    # for testing - hold out chr8
    prediction_window = 256
    half_window = prediction_window // 2
    num_training_examples = sum(peaks.chr != 'chr8')
    if num_training_examples < example_limit:
        raise IndexError('Only ' + str(num_training_examples) + ' training samples')
    print('Number of training examples: ' + str(num_training_examples))
    if shifts:
        negative_shift = prediction_window * 4
        # Build interval trees over the peaks so that our negatives (shifted positives)
        # are true negatives.
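# A hedged sketch of the k-mer scrambling the `scrambled` keyword describes:
# split a sequence into independent k-mer units and shuffle them, preserving
# k-mer composition while destroying longer-range motif structure.
# `scramble_sequence` is illustrative, not the original implementation.
import random

def scramble_sequence(seq, k):
    units = [seq[i:i + k] for i in range(0, len(seq), k)]
    random.shuffle(units)
    return ''.join(units)

# e.g. scramble_sequence('ACGTACGTACGT', 2) keeps dinucleotide content intact.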
def __init__(self, outpath, gen_path='/home/kal/data/ctcf_strengthgen.hdf5',
             genome_path='/home/kal/.ucscgenome/hg19.2bit', bed_path=None, batch_size=32):
    """Initialize a new CTCF model object.

    Arguments:
        outpath -- path to the folder where the outputs were generated.
    Keywords:
        gen_path -- path to a data generator.
        genome_path -- path to a genome.
        bed_path -- path to an already generated and annotated ATAC peak bed file.
        batch_size -- size of batches accepted by the model.
    """
    self.batch_size = batch_size
    # Get an output directory.
    self.out_dir = outpath
    # Get the genome.
    self.genome = ucscgenome.Genome(genome_path)
    # Load the training histories.
    num_pk1 = len([f for f in os.listdir(outpath)
                   if f.endswith('.pk1') and os.path.isfile(os.path.join(outpath, f))])
    folder_name = os.path.basename(os.path.normpath(outpath))
    history_path = os.path.join(outpath, folder_name + '_history')
    if num_pk1 == 1:
        with open(history_path + '1.pk1', 'rb') as infile:
            self.h = pickle.load(infile)
        self.finer_epochs = False
    elif num_pk1 == 3:
        with open(history_path + '1.pk1', 'rb') as infile:
            self.h1 = pickle.load(infile)
        with open(history_path + '2.pk1', 'rb') as infile:
            self.h2 = pickle.load(infile)
        with open(history_path + '3.pk1', 'rb') as infile:
            self.h3 = pickle.load(infile)
        self.finer_epochs = True
    print('Loaded training history.')
    # Load the potential model paths.
    model_paths = list()
    for file in os.listdir(outpath):
        if 'weights_3_24' in file and file.endswith('.hdf5'):
            model_paths.append(os.path.join(outpath, file))

    # Pick the checkpoint with the best (lowest) metric embedded at the end
    # of the filename, e.g. the validation loss.
    def extract_number(f):
        s = f.split('_')[-1].rsplit('.', maxsplit=1)
        return (float(s[0]) if s else -1, f)

    model_path = min(model_paths, key=extract_number)
    print('model path: ' + str(model_path))
    # Load the model.
    self.model = load_model(model_path, custom_objects={'Bias': Bias})
    # And the layer names.
    self.layer_dict = dict([(layer.name, layer) for layer in self.model.layers])
    print('Loaded the model.')
    # Get a data generator.
    self.gen = ctcf_strength_gen.CTCFGeneratorhdf5(gen_path)
    print('Loaded the data generator.')
    if bed_path is None:
        self.generate_bed()
        print('Generated a bed file.')
    else:
        self.peaks = pd.read_table(bed_path, header=None)
        self.peaks.columns = 'chr start end ctcf pwm ml'.split()
        print('Loaded a bed file.')
    # Get the constrained peaks: annotate the ATAC peaks with observed coverage.
    with open(os.path.join(self.out_dir, 'constrained_atac.bed'), 'w') as f:
        subprocess.run(['bedtools', 'coverage', '-counts',
                        '-a', os.path.join(self.out_dir, 'end_atac.bed'),
                        '-b', '/home/kal/data/K526_atac_sorted.bed'], stdout=f)
    self.constrained_peaks = pd.read_table(os.path.join(self.out_dir, 'constrained_atac.bed'), header=None)
    self.constrained_peaks.columns = 'chr start end oldctcf pwm ml counts'.split()
    self.constrained_peaks['ctcf'] = self.constrained_peaks['counts'] > 0
    print('Constrained the peaks.')
    # Get a subset of localized sequences.
    self.sample_peaks = self.peaks.sample(100 * self.batch_size)
    signal_seqs = list()
    for index, row in self.sample_peaks.iterrows():
        this_seq, max_pred = localize(row, self.model, self.genome)
        signal_seqs.append(this_seq)
    self.sample_peaks['signal_seq'] = signal_seqs
    print('Localized a subset of sequences.')
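# A minimal sketch of one downstream check the constrained peaks enable:
# compare the model's calls (`ml`) against the coverage-derived labels
# (`ctcf`). `analysis` is a hypothetical instance of this class, and the 0.5
# threshold on `ml` is an assumption, not from the original code.
calls = analysis.constrained_peaks
agreement = ((calls.ml > 0.5) == calls.ctcf).mean()
print('Model vs. coverage agreement on ATAC peaks: {:.1%}'.format(agreement))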