Exemplo n.º 1
0
    def __init__(self):
        """Load CTCF peaks and the hg19 genome, and index peak regions.

        Reads the peak table from FLAGS.data_path, keeps only rows named
        'CTCF', and builds one IntervalTree per chromosome so that shifted
        negative windows can later be checked for overlap with any positive.
        """
        # read CTCF peaks (tab-separated bed-like file, no header row)
        self.peaks = pandas.read_table(FLAGS.data_path, header=None)
        self.peaks.columns = 'chr start end name score expCount expNums expScores'.split()
        self.peaks = self.peaks[self.peaks.name.isin(['CTCF'])]
        print(self.peaks.head())
        # NOTE(review): peak widths are assumed uniform —
        # only one of the peaks is actually not 150 wide.
        print('Getting genome.')
        self.genome = ucscgenome.Genome('/home/kal/.ucscgenome/hg19.2bit')
        # positives - center of each peak with 256(default) bp window
        # negatives - positives shifted left/right by 1024 (masked to not overlap any positives)
        # negatives - shuffled positives sequences

        # for testing - hold out chr8

        prediction_window = FLAGS.input_window
        self.half_window = prediction_window // 2
        self.negative_shift = prediction_window * 4

        self.num_training_examples = sum(self.peaks.chr != 'chr8')
        print('Number of training examples: ' + str(self.num_training_examples))

        # build intervaltrees for peaks to make sure our negatives (shifted positives)
        # are true negatives
        print('Building itrtree')
        # use 'chrom', not 'chr', to avoid shadowing the builtin chr()
        self.peak_intervals = {chrom: IntervalTree() for chrom in self.peaks.chr.unique()}
        for chrom in self.peaks.chr.unique():
            # sentinel interval just past the chromosome end, so windows
            # shifted off the end of the chromosome register as overlapping
            self.peak_intervals[chrom][len(self.genome[chrom]):len(self.genome[chrom]) + 1] = 1
        for idx, row in tqdm(self.peaks.iterrows()):
            # pad each peak by half a window so any window that would touch
            # the peak still counts as overlapping a positive
            self.peak_intervals[row.chr][(row.start - self.half_window):(row.end + self.half_window)] = 1
Exemplo n.º 2
0
    def predict_snv(self, peaks, genome=None, act=False):
        """Predict from a bed file with chr, position, refAllele, altAllele.

        Arguments:
            peaks -- the bed file in pd table form.
        Keywords:
            genome -- default is hg19.
            act -- if True, return activations via self.get_act instead of
                   model predictions.
        Outputs:
            refpreds -- predictions for each row with reference allele.
            altpreds -- predictions for each row with alternate allele.
        """
        # get the genome (identity check: `is None`, not `== None`)
        if genome is None:
            genome = ucscgenome.Genome('/home/kal/.ucscgenome/hg19.2bit')

        def _predict(alt):
            # Run the model (or activation) over filled batches for either
            # the reference (alt=False) or alternate (alt=True) allele.
            preds = list()
            batchgen = train_TFmodel.filled_batch(snv_gen(peaks, genome, alt=alt))
            for batch in batchgen:
                if act:
                    preds.append(self.get_act([batch, 0]))
                else:
                    preds.append(self.model.predict_on_batch(batch))
            # batches are padded to full size, so trim to one value per row
            return np.asarray(preds).flatten()[:len(peaks)]

        refpreds = _predict(False)
        altpreds = _predict(True)
        return refpreds, altpreds
Exemplo n.º 3
0
 def predict_bed(self, peaks, genome=None):
     """Predict from a bed file.

     Arguments:
         peaks -- rows from the bed file (pd table form).
     Keywords:
         genome -- default is hg19.
     Outputs:
         preds -- predictions for each row.
     """
     # get the genome (identity check: `is None`, not `== None`)
     if genome is None:
         genome = ucscgenome.Genome('/home/kal/.ucscgenome/hg19.2bit')
     # localize each row and collect its prediction
     preds = list()
     for index, row in tqdm(peaks.iterrows()):
         tile, pred = self.localize(row, genome)
         preds.append(pred)
     return np.asarray(preds).flatten()
Exemplo n.º 4
0
    def __init__(self, regions='/home/kal/data/chipseq_ctcf_peaks.bed',
                 cellline_CTCFs=None, cellline_DHSs=None, keys=None,
                 input_window=FLAGS.input_window, batch_size=FLAGS.batch_size,
                 column_names='chr start end . . . . .'):
        """Create a generator for CTCF/DHS data.

        Arguments:
            regions -- bed file with training points to use.
        Keywords:
            cellline_CTCFs -- list of CTCF peak bam files, one for each cell line to use.
            cellline_DHSs -- list of files with DHS reads, one for each cell line to use,
                in matching order to cellline_CTCFs.
            keys -- labels for the cell lines; defaults to their indices.
            input_window -- width of the model input window in bp.
            batch_size -- number of examples per generated batch.
            column_names -- whitespace-separated column names for the regions file.
        """
        # read in the regions (tab-separated, no header row)
        self.bed = pd.read_table(regions, header=None)
        self.bed.columns = column_names.split()

        # set up a reference genome.
        self.genome = ucscgenome.Genome('/home/kal/.ucscgenome/hg19.2bit')

        # define some values
        self.batch_size = batch_size
        self.input_window = input_window
        self.half_window = input_window // 2

        # set up cell line stuff; defaults are K562, HUVEC, HCT116
        # (identity checks: `is None`, not `== None`)
        if cellline_CTCFs is None:
            cellline_CTCFs = ['/home/kal/CTCF/ATAC_CTCF/data/K562/wgEncodeBroadHistoneK562CtcfStdAlnRep1.bam', 
            '/home/kal/CTCF/ATAC_CTCF/data/HUVEC/wgEncodeBroadHistoneHuvecCtcfStdAlnRep1.bam', 
            '/home/kal/CTCF/ATAC_CTCF/data/HCT116/CTCF_untreated.reheader.bam']
        if cellline_DHSs is None:
            cellline_DHSs = ['/home/kal/CTCF/ATAC_CTCF/data/K562/CombinedScreens.unique_alignment.bam', 
            '/home/kal/CTCF/ATAC_CTCF/data/HUVEC/wgEncodeUwDnaseHuvecAlnRep2.bam', 
            '/home/kal/CTCF/ATAC_CTCF/data/HCT116/wgEncodeUwDnaseHct116AlnRep1.bam']
        self.CTCF_dict = dict()
        self.DHS_dict = dict()
        self.cov_dict = dict()
        if keys is None:
            keys = range(len(cellline_CTCFs))
        self.keys = keys
        for key, CTCF, DHS in zip(self.keys, cellline_CTCFs, cellline_DHSs):
            self.CTCF_dict[key] = pysam.AlignmentFile(CTCF, 'rb')
            self.DHS_dict[key] = pysam.AlignmentFile(DHS, 'rb')
            # mean read coverage = aligned read count / total genome bases
            # ('chrom' rather than 'chr' to avoid shadowing the builtin)
            total_bases = sum(len(self.genome[chrom]) for chrom in self.DHS_dict[key].references)
            covered_bases = self.DHS_dict[key].count()
            self.cov_dict[key] = covered_bases / total_bases
Exemplo n.º 5
0
        TF -- the transcription factor to filter for.
        example_limit -- the minimum number of examples to bother with.
        scrambled -- the size of the -mers to consider independent units when scrambeling.
        shift -- use shifted samples?
        score_columns -- which columns to put as the score
    """
    # read TF peaks
    full = pd.read_table(bed_path, header=None)
    if columns == None:
        columns = 'chr start end name score expCount expNums expScores'
    full.columns = columns.split()
    peaks = full[full.name.isin([TF])]
    genome = ucscgenome.Genome('/home/kal/.ucscgenome/hg19.2bit')
    # positives - center of each peak with 256(defalut) bp window
    # negatives - positives shifted left/right by 1024 (masked to not overlap any positives)
    # negatives - shuffled positives sequences
    # for testing - hold out chr8
    prediction_window = 256
    half_window = prediction_window // 2
    num_training_examples = sum(peaks.chr != 'chr8')
    if num_training_examples < example_limit:
        raise IndexError('Only ' + str(num_training_examples) + ' training samples')
    print('Number of training examples: ' + str(num_training_examples))

    if shifts:
        negative_shift = prediction_window * 4
        # build intervaltrees for peaks to make sure our negatives (shifted positives)
        # are true negatives
Exemplo n.º 6
0
    def __init__(self, outpath, gen_path='/home/kal/data/ctcf_strengthgen.hdf5', genome_path='/home/kal/.ucscgenome/hg19.2bit', bed_path=None, batch_size=32):
        """Initialize a new ctcf model object.

        Arguments:
           outpath -- path to the folder where the outputs were generated.

        Keywords:
            gen_path -- path to a data generator.
            genome_path -- path to a genome.
            bed_path -- path to an already generated and annotated atac peak bed file.
            batch_size -- size of batches accepted by model.
        """
        self.batch_size = batch_size
        # get an output directory
        self.out_dir = outpath

        # get the genome
        self.genome = ucscgenome.Genome(genome_path)

        # load the training histories (one .pk1 pickle per training phase);
        # use `with` so handles close even if unpickling fails, and avoid
        # shadowing the builtin input()
        num_pk1 = len([f for f in os.listdir(outpath) if f.endswith('.pk1') and os.path.isfile(os.path.join(outpath, f))])
        folder_name = os.path.basename(os.path.normpath(outpath))
        history_path = os.path.join(outpath, folder_name + '_history')
        # NOTE(review): a .pk1 count other than 1 or 3 leaves self.h*/
        # self.finer_epochs unset — confirm that cannot happen in practice.
        if num_pk1 == 1:
            with open(history_path + '1.pk1', 'rb') as infile:
                self.h = pickle.load(infile)
                self.finer_epochs = False
        elif num_pk1 == 3:
            with open(history_path + '1.pk1', 'rb') as infile:
                self.h1 = pickle.load(infile)
            with open(history_path + '2.pk1', 'rb') as infile:
                self.h2 = pickle.load(infile)
            with open(history_path + '3.pk1', 'rb') as infile:
                self.h3 = pickle.load(infile)
            self.finer_epochs = True

        print('Loaded training history.')

        # load the potential model paths
        model_paths = list()
        for file in os.listdir(outpath):
            if 'weights_3_24' in file and file.endswith(".hdf5"):
                model_paths.append(os.path.join(outpath, file))

        def extract_number(f):
            # sort key: numeric suffix of the checkpoint filename
            s = f.split('_')[-1].rsplit('.', maxsplit=1)
            return (float(s[0]) if s else -1, f)
        # NOTE(review): this takes the *minimum* numeric suffix although the
        # stated intent is the highest val_acc — presumably the filenames
        # encode loss; confirm against the checkpoint naming scheme.
        model_path = min(model_paths, key=extract_number)
        print('model path:' + str(model_path))

        # load the model
        self.model = load_model(model_path, custom_objects={'Bias':Bias})
        # and the layer names
        self.layer_dict = dict([(layer.name, layer) for layer in self.model.layers])

        print('Loaded the model.')

        # get a data generator
        self.gen = ctcf_strength_gen.CTCFGeneratorhdf5(gen_path)
        print('Loaded the data generator.')

        # identity check: `is None`, not `== None`
        if bed_path is None:
            self.generate_bed()
            print('Generated a bed file.')
        else:
            self.peaks = pd.read_table(bed_path, header=None)
            self.peaks.columns = 'chr start end ctcf pwm ml'.split()
            print('Loaded a bed file.')

        # get the constrained peaks: count atac reads overlapping each peak;
        # `with` closes the stdout handle even if bedtools fails
        with open(os.path.join(self.out_dir, 'constrained_atac.bed'), 'w') as f:
            subprocess.run(['bedtools', 'coverage',  '-counts', '-a', os.path.join(self.out_dir, 'end_atac.bed'), '-b', '/home/kal/data/K526_atac_sorted.bed'], stdout=f)
        self.constrained_peaks = pd.read_table(os.path.join(self.out_dir, 'constrained_atac.bed'), header=None)
        self.constrained_peaks.columns = 'chr start end oldctcf pwm ml counts'.split()

        # a peak counts as CTCF-bound if any atac reads overlap it
        self.constrained_peaks['ctcf'] = self.constrained_peaks['counts'] > 0

        print('Constrained the peaks')

        # get a subset of localized sequences
        self.sample_peaks = self.peaks.sample(100*self.batch_size)
        signal_seqs = list()
        for index, row in self.sample_peaks.iterrows():
            this_seq, max_pred = localize(row, self.model, self.genome)
            signal_seqs.append(this_seq)
        self.sample_peaks['signal_seq'] = signal_seqs
        print('Localized a subset of sequences.')