def preprocess_chipseq(num_jobs, bin_size):
    """Build the missing ChIP-seq feature files using worker processes.

    For every (cell type, transcription factor) pair reported by
    ``DataGenerator`` whose fold-change signal file exists and whose output
    file does not exist yet, one ``Process`` running
    ``parralelChIPSeqSignalProcessor`` is queued.  The queued processes are
    then run in consecutive batches of ``num_jobs``: every process of a
    batch is started, and the whole batch is joined before the next batch
    begins.

    Args:
        num_jobs: Number of processes per batch.  It is used as the step of
            ``range()``, so ``0`` raises ``ValueError``.
        bin_size: Integer bin size; it is formatted into the output file
            name with ``%d`` and forwarded to the worker.

    Returns:
        None.
    """
    datagen = DataGenerator()
    processes = []

    celltypes = datagen.get_celltypes()
    transcription_factors = datagen.get_trans_fs()
    for part in ['train']:
        # The whole regions file is read once and handed to every worker.
        with open('../data/annotations/%s_regions.blacklistfiltered.merged.bed' % part) as fin:
            lines = fin.read()

        for celltype in celltypes:
            for transcription_factor in transcription_factors:
                signal_path = (
                    '../data/chipseq_fold_change_signal/ChIPseq.%s.%s.fc.signal.train.bw'
                    % (celltype, transcription_factor))
                if not os.path.exists(signal_path):
                    # No input signal for this pair: nothing to process.
                    continue

                fout_path = '../data/preprocess/CHIPSEQ_FEATURES/%s_%s_%d.gz' % (
                    celltype, transcription_factor, bin_size)
                if os.path.exists(fout_path):
                    # Output already present: do not queue it again.
                    continue

                processes.append(
                    Process(target=parralelChIPSeqSignalProcessor,
                            args=(lines, fout_path, celltype,
                                  transcription_factor, bin_size)))

    # Explicit loops instead of map(): on Python 3 map() is lazy, so using it
    # only for side effects would never start or join any process.
    for i in range(0, len(processes), num_jobs):
        batch = processes[i:i + num_jobs]
        for process in batch:
            process.start()
        for process in batch:
            process.join()
def preprocess_dnase(num_jobs, bin_size):
    """Build the missing DNase feature files using worker processes.

    For each part in ``train``, ``ladder`` and ``test`` the matching regions
    file is read, and for every cell type reported by ``DataGenerator``
    whose output file is not present yet, one ``Process`` running
    ``parralelDNAseSignalProcessor`` is queued.  The queued processes are
    then run in consecutive batches of ``num_jobs``: every process of a
    batch is started, and the whole batch is joined before the next batch
    begins.

    Args:
        num_jobs: Number of processes per batch.  It is used as the step of
            ``range()``, so ``0`` raises ``ValueError``.
        bin_size: Integer bin size; it is formatted into the output file
            name with ``%d`` and forwarded to the worker.

    Returns:
        None.
    """
    datagen = DataGenerator()
    processes = []

    celltypes = datagen.get_celltypes()
    for part in ['train', 'ladder', 'test']:
        # The whole regions file is read once and handed to every worker.
        with open('../data/annotations/%s_regions.blacklistfiltered.merged.bed' % part) as fin:
            lines = fin.read()

        for celltype in celltypes:
            fout_path = '../data/preprocess/DNASE_FEATURES/%s_%s_%d.gz' % (
                celltype, part, bin_size)
            if os.path.exists(fout_path):
                # The requested output already exists: do not queue it again.
                continue

            # NOTE(review): the original code only tested for this '.txt'
            # path, which is not the '.gz' path handed to the worker.  The
            # check is kept so anything skipped before is still skipped;
            # confirm whether it can be removed.
            txt_path = '../data/preprocess/DNASE_FEATURES/%s_%s_%d.txt' % (
                celltype, part, bin_size)
            if os.path.exists(txt_path):
                continue

            processes.append(
                Process(target=parralelDNAseSignalProcessor,
                        args=(lines, fout_path, celltype, bin_size)))

    # Explicit loops instead of map(): on Python 3 map() is lazy, so using it
    # only for side effects would never start or join any process.
    for i in range(0, len(processes), num_jobs):
        batch = processes[i:i + num_jobs]
        for process in batch:
            process.start()
        for process in batch:
            process.join()