def construct_training_data(genome_sizes_file, peaks_file, genome_fasta_file,
                            blacklist_file, to_keep, to_filter,
                            window_length, acc_regions_file, out_prefix,
                            chromatin_track_list, nbins):
    """
    Construct a training or validation data set and save it to disk.

    Whether training or validation data is built depends on the to_keep
    and to_filter arguments, which are forwarded to
    utils.get_genome_sizes and utils.load_chipseq_data.
    The training set uses the to_filter argument, whereas to_keep=None.
    For example:
    training: to_filter=['chr10', 'chr17', 'chrUn', 'chrM', 'random']
    i.e. In this construction; chr10 and chr17 can be used for
    testing/validation.
    The validation set uses the to_keep argument, whereas to_filter=None.
    For example:
    validation: to_keep=['chr17']
    i.e. In this construction; chr17 data is used for validation.

    Files written (via np.savetxt):
        <out_prefix>.seq: X_seq, formatted with '%s'
        <out_prefix>.<track>.chromatin: one tab-delimited file per entry
            of chromatin_track_list ('%1.3f'), where <track> is the last
            '/'-separated component of the entry, cut at its first '.'
        <out_prefix>.labels: y, formatted with '%s'

    Parameters:
        genome_sizes_file: sizes
        peaks_file: multiGPS formatted *events* file
        genome_fasta_file: fasta file for the whole genome
        blacklist_file: BED format blacklist file
        to_keep: chromosomes to keep (or None), see above
        to_filter: chromosomes to filter out (or None), see above
        window_length (int): the length of windows used for training and
            testing.
        acc_regions_file: forwarded unchanged to ConstructTrainingData
        out_prefix (str): prefix of every output file
        chromatin_track_list (list of str): paths of the chromatin tracks;
            also forwarded to ConstructTrainingData
        nbins: forwarded unchanged to ConstructTrainingData

    Returns:
        training_coords: the coordinates returned as the fourth value of
        ConstructTrainingData.get_data()
    """
    # Load the genome_sizes_file (Filtering out the validation and test
    # chromosomes):
    curr_genome_bed = utils.get_genome_sizes(genome_sizes_file,
                                             to_keep=to_keep,
                                             to_filter=to_filter)
    genome_bed_df = curr_genome_bed.to_dataframe()

    # Loading the chip-seq bed file (Filtering out the validation and test
    # chromosomes):
    chip_seq_coordinates = utils.load_chipseq_data(
        peaks_file, genome_sizes_file=genome_sizes_file,
        to_keep=to_keep, to_filter=to_filter)

    # Loading the exclusion bed file (Blacklist + ChIP-seq peaks, use for
    # constructing negative sets):
    # NOTE(review): data_generator in this file unpacks TWO values from
    # utils.exclusion_regions, while the single return value is used
    # directly here -- confirm which matches the helper's signature.
    exclusion_windows_bdt = utils.exclusion_regions(blacklist_file,
                                                    chip_seq_coordinates)
    exclusion_windows_df = exclusion_windows_bdt.to_dataframe()

    # constructing the training set
    construct_sets = ConstructTrainingData(
        genome_sizes_file=genome_sizes_file,
        genome_fasta_file=genome_fasta_file,
        blacklist_file=blacklist_file,
        chip_coords=chip_seq_coordinates,
        exclusion_df=exclusion_windows_df,
        window_length=window_length,
        curr_genome_bed=genome_bed_df,
        acc_regions_file=acc_regions_file,
        chromatin_track_list=chromatin_track_list,
        nbins=nbins)
    X_seq, X_chromatin_list, y, training_coords = construct_sets.get_data()

    # saving the data
    np.savetxt(out_prefix + '.seq', X_seq, fmt='%s')

    # The output names depend only on chromatin_track_list, so derive them
    # once instead of rebuilding the whole list on every loop iteration.
    chromatin_out_files = [x.split('/')[-1].split('.')[0]
                           for x in chromatin_track_list]
    for idx, track_name in enumerate(chromatin_out_files):
        np.savetxt(out_prefix + '.' + track_name + '.chromatin',
                   X_chromatin_list[idx], delimiter='\t', fmt='%1.3f')

    np.savetxt(out_prefix + '.labels', y, fmt='%s')
    return training_coords
def define_coordinates(self):
    """
    Load and return the coordinates & labels for the test set.

    The chromosome named by self.to_keep[0] is tiled with windows of
    self.window_len bp, advancing self.stride bp per window. The multiGPS
    peaks (loaded with utils.load_chipseq_data) are expanded around the
    reported peak position into windows of roughly self.window_len bp.

    Logic for assigning test set labels:
        label 1 (bound): every expanded peak window; when a blacklist
            file is set, peak windows overlapping the blacklist are
            dropped.
        label 0 (unbound): every tiled window that has no overlap with
            any expanded peak window (and, when a blacklist file is set,
            no overlap with the blacklist).
    No other label is assigned by this method.

    Returns:
        test_coords (pd dataFrame): bound rows followed by unbound rows,
        each with a 'label' column. The coordinate column names are the
        ones BedTool.to_dataframe() assigns (presumably chrom/start/end
        for 3-column intervals -- confirm). The index is not reset after
        concatenation.
    """
    genome_sizes = pd.read_csv(self.genome_sizes_file, sep="\t",
                               names=['chr', 'len'])
    # subset the test chromosome:
    # the assumption here is that to_keep is a single chromosome list.
    genome_test = genome_sizes[genome_sizes['chr'] == self.to_keep[0]]
    end_idx = genome_test.iloc[0, 1]
    chromosome = genome_test.iloc[0, 0]

    # Tile the chromosome. A window is emitted only while its end stays
    # strictly below the chromosome length, i.e. for every start in
    # 0, stride, 2*stride, ... with start < end_idx - window_len.
    test_set = [
        [chromosome, start_idx, start_idx + self.window_len]
        for start_idx in range(0, end_idx - self.window_len, self.stride)
    ]
    test_df = pd.DataFrame(test_set, columns=['chr', 'start', 'stop'])
    test_bdt_obj = BedTool.from_dataframe(test_df)

    chip_peaks = utils.load_chipseq_data(
        chip_peaks_file=self.peaks_file,
        to_keep=self.to_keep,
        genome_sizes_file=self.genome_sizes_file)
    # note: multiGPS reports 1 bp separated start and end,
    # centered on the ChIP-seq peak.
    # (i.e. 250 if window_len=500 )
    chip_peaks['start'] = chip_peaks['start'] - int(self.window_len / 2)
    # (i.e. 249 if window_len=500); multiGPS reports 1bp intervals
    chip_peaks['end'] = chip_peaks['end'] + int(self.window_len / 2 - 1)
    chip_peaks = chip_peaks[['chr', 'start', 'end']]
    chip_peaks_bdt_obj = BedTool.from_dataframe(chip_peaks)

    # intersecting
    # v=True: if there is any overlap with chip_peaks, that window is not
    # reported
    unbound_data = test_bdt_obj.intersect(chip_peaks_bdt_obj, v=True)
    if self.blacklist_file is None:
        bound_data = chip_peaks_bdt_obj
    else:
        # The blacklist is only needed (and only built) on this branch.
        blacklist_exclusion_windows = BedTool(self.blacklist_file)
        # removing blacklist windows
        unbound_data = unbound_data.intersect(blacklist_exclusion_windows,
                                              v=True)
        bound_data = chip_peaks_bdt_obj.intersect(
            blacklist_exclusion_windows, v=True)
    # i.e. the entire 500 bp window is the positive window.

    # making data-frames
    bound_data_df = bound_data.to_dataframe()
    bound_data_df['label'] = 1
    unbound_data_df = unbound_data.to_dataframe()
    unbound_data_df['label'] = 0

    # exiting
    test_coords = pd.concat([bound_data_df, unbound_data_df])
    return test_coords
def data_generator(genome_sizes_file, peaks_file, genome_fasta_file,
                   blacklist_file, to_keep, to_filter,
                   window_lenght, batch_size, acc_regions_file, ratios):
    """
    This generator can either generate training data or validation data
    based on the to_keep and to_filter arguments.
    The train generator uses the to_filter argument, whereas to_keep=None
    For example:
    train_generator: to_filter=['chr10', 'chr17', 'chrUn', 'chrM', 'random']
    i.e. In this construction; chr10 and chr17 can be used for
    testing/validation.
    The val generator uses the to_keep argument, whereas to_filter=None.
    For example:
    val_generator: to_keep=['chr17']
    i.e. In this construction; chr17 data is used for validation.

    Parameters:
        genome_sizes_file: sizes
        peaks_file: multiGPS formatted *events* file
        genome_fasta_file: fasta file for the whole genome
        blacklist_file: BED format blacklist file
        to_keep: chromosomes to keep (or None), see above
        to_filter: chromosomes to filter out (or None), see above
        window_lenght (int): the length of windows used for training and
            testing. (The misspelled name is part of the existing
            interface and is kept for keyword callers.)
        batch_size (int): batch size used for training and validation
            batches
        acc_regions_file: forwarded unchanged to ConstructSets
        ratios: forwarded unchanged to ConstructSets

    Yields:
        (X, y): the first two values of ConstructSets.get_data(); the
        generator never terminates on its own.
    """
    # load the genome_sizes_file:
    genome_bed_val = utils.get_genome_sizes(genome_sizes_file,
                                            to_keep=to_keep,
                                            to_filter=to_filter)
    genome_bed_df = genome_bed_val.to_dataframe()

    # loading the chip-seq bed file
    chip_seq_coordinates = utils.load_chipseq_data(
        peaks_file, genome_sizes_file=genome_sizes_file,
        to_keep=to_keep, to_filter=to_filter)

    def make_flanks(lower_lim, upper_lim):
        """Return (left, right) flanking windows of the ChIP-seq peaks.

        Both windows are measured from each peak's 'start' coordinate and
        span the distances lower_lim..upper_lim on the respective side.
        """
        # (can be a separate fn in utils)
        flanks_left = chip_seq_coordinates.copy()
        flanks_right = chip_seq_coordinates.copy()
        flanks_left['start'] = chip_seq_coordinates['start'] - upper_lim
        flanks_left['end'] = chip_seq_coordinates['start'] - lower_lim
        flanks_right['start'] = chip_seq_coordinates['start'] + lower_lim
        flanks_right['end'] = chip_seq_coordinates['start'] + upper_lim
        return flanks_left, flanks_right

    left_1, right_1 = make_flanks(lower_lim=250, upper_lim=750)
    left_2, right_2 = make_flanks(lower_lim=200, upper_lim=700)
    left_3, right_3 = make_flanks(lower_lim=1500, upper_lim=2000)
    left_4, right_4 = make_flanks(lower_lim=1000, upper_lim=1500)
    # The row order below (note right_3 before left_3) is deliberately kept
    # as it always was, so the resulting frame is unchanged.
    flanks = pd.concat(
        [left_1, right_1, left_2, right_2,
         right_3, left_3, left_4, right_4])
    # The flanks are handed over as a DataFrame; per the original author's
    # note, the conversion to a BedTool is meant to happen inside the
    # generator to enable a py-bedtools cleanup.

    # loading the exclusion coords:
    # NOTE(review): construct_training_data in this file uses the return
    # value of utils.exclusion_regions directly, while two values are
    # unpacked here -- confirm which matches the helper's signature.
    _, exclusion_windows_bdt = utils.exclusion_regions(
        blacklist_file, chip_seq_coordinates)
    exclusion_windows_df = exclusion_windows_bdt.to_dataframe()

    # constructing the training set
    construct_sets = ConstructSets(genome_sizes_file=genome_sizes_file,
                                   genome_fasta_file=genome_fasta_file,
                                   blacklist_file=blacklist_file,
                                   chip_coords=chip_seq_coordinates,
                                   exclusion_df=exclusion_windows_df,
                                   window_length=window_lenght,
                                   curr_genome_bed=genome_bed_df,
                                   batch_size=batch_size,
                                   acc_regions_file=acc_regions_file,
                                   flanks=flanks,
                                   ratios=ratios)
    while True:
        # get_data() also returns the batch coordinates; they are not
        # part of what this generator yields.
        X, y, _ = construct_sets.get_data()
        yield X, y